Commit a8cdd97e authored by caoying03

Merge branch 'develop' into enhance_reshape

@@ -56,7 +56,7 @@ script:
   export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
   export DOCS_DIR=`pwd`
   cd ..
-  curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2
+  curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
 notifications:
   email:
     on_success: change
...
@@ -144,6 +144,8 @@ include(external/eigen)    # download eigen3
 include(external/pybind11) # download pybind11
 include(external/cares)
 include(external/grpc)
+include(external/snappy)      # download snappy
+include(external/snappystream)
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -138,13 +138,14 @@ def main():
     avg_cost = fluid.layers.mean(x=cost)
     # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size)
     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(batch_acc)
     # Optimization
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@@ -157,27 +158,30 @@ def main():
     # test
     def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
-                    feed={"pixel": img_data,
-                          "label": y_data})
+            outs = exe.run(inference_program,
+                           feed={"pixel": img_data,
+                                 "label": y_data},
+                           fetch_list=[batch_acc, batch_size])
+            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-        return accuracy.eval(exe)
+        return test_pass_acc.eval()

     def train_loop(exe, trainer_prog):
         iters = 0
         ts = time.time()
+        train_pass_acc = fluid.average.WeightedAverage()
         for pass_id in range(args.num_passes):
             # train
             start_time = time.time()
             num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
             with profiler.profiler("CPU", 'total') as prof:
                 for batch_id, data in enumerate(train_reader()):
                     ts = time.time()
@@ -187,13 +191,14 @@ def main():
                     y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                     y_data = y_data.reshape([-1, 1])
-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                         trainer_prog,
                         feed={"pixel": img_data,
                               "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                     iters += 1
                     num_samples += len(data)
+                    train_pass_acc.add(value=acc, weight=b_size)
                     print(
                         "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                         % (pass_id, iters, loss, acc,
@@ -201,7 +206,7 @@ def main():
             ) # The accuracy is the accumulation of batches, but not the current batch.
             pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
             print(
                 "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
...
@@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
     INSTALL_DIR      ${CBLAS_INSTALL_DIR}
     BUILD_IN_SOURCE  1
     BUILD_COMMAND    ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
     INSTALL_COMMAND  ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+                     && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
     UPDATE_COMMAND   ""
     CONFIGURE_COMMAND ""
 )
@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
         \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
     )"
 )
-INSTALL(CODE "execute_process(
-    COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
-            ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
-    )"
-)
 ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
"${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
DEPENDS snappy
)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
add_dependencies(snappystream extern_snappystream)
@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
       add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
     else()
       add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+      find_fluid_modules(${TARGET_NAME})
     endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
       if("${cc_library_DEPS};" MATCHES "warpctc;")
@@ -263,7 +265,8 @@ function(nv_library TARGET_NAME)
       if (nv_library_SHARED OR nv_library_shared) # build *.so
         cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
       else()
         cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+        find_fluid_modules(${TARGET_NAME})
       endif()
       if (nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
...
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# Find all fluid modules; used when assembling the paddle fluid static library.
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
 # make package for paddle fluid shared and static library
 function(copy TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DSTS DEPS)
   cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
   list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
   list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -42,13 +55,21 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
-IF(NOT PROTOBUF_FOUND)
+if(NOT PROTOBUF_FOUND)
   set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
   copy(protobuf_lib
-    SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+    SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
     DSTS ${dst_dir} ${dst_dir}/lib
   )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
+
+if(NOT CBLAS_FOUND)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+  copy(openblas_lib
+    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+    DSTS ${dst_dir} ${dst_dir}
+  )
+endif()

 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -66,8 +87,8 @@ copy(memory_lib
 )
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
@@ -83,6 +104,4 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
-add_custom_target(inference_lib_dist DEPENDS
-  inference_lib framework_lib memory_lib platform_lib string_lib
-  gflags_lib glog_lib protobuf_lib eigen3_lib)
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
@@ -20,9 +20,8 @@ class ReaderBase {
     PADDLE_ENFORCE(!shapes_.empty());
   }
   // Read the next batch of data. (A 'batch' can be only one instance)
+  // If the next batch doesn't exist, the '*out' will be an empty std::vector.
   virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
-  // Show whether the next batch exists.
-  virtual bool HasNext() const = 0;

   // Reinitialize the reader and read the file from the begin.
   virtual void ReInit() = 0;
...
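With `HasNext()` gone, end-of-data is now signaled through `ReadNext` itself: callers read until the output vector comes back empty. A minimal consumer loop under the new contract might look like the sketch below; the header path and the idea of "handing the batch onward" are assumptions for illustration, not part of this commit:

```cpp
#include <vector>
#include "paddle/fluid/framework/reader.h"  // assumed header location

using paddle::framework::LoDTensor;
using paddle::framework::ReaderBase;

// Drain a reader under the new contract: ReadNext() fills `batch` with
// the next batch, or leaves it empty once the data is exhausted.
void ConsumeAll(ReaderBase* reader) {
  std::vector<LoDTensor> batch;
  while (true) {
    reader->ReadNext(&batch);
    if (batch.empty()) break;  // no HasNext() any more; empty output means EOF
    // ... hand `batch` to the executor or to a decorated reader ...
  }
  reader->ReInit();  // rewind so the reader can be consumed again
}
```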
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {

### Converting paddle::framework::Tensor to EigenTensor

-As the previous section showed, in the actual computation we first need to convert the input and output Tensors into a format Eigen supports. We provide some global functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) that convert a paddle::framework::Tensor into an EigenTensor/EigenMatrix/EigenVector/EigenScalar.
+As the previous section showed, in the actual computation we first need to convert the input and output Tensors into a format Eigen supports. We provide some global functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) that convert a paddle::framework::Tensor into an EigenTensor/EigenMatrix/EigenVector/EigenScalar.

Take EigenTensor as an example:

@@ -125,7 +125,7 @@ From is an interface provided by the EigenTensor template that implements the conversion from paddle::framework

In Eigen, Tensors of different ranks are different types, and a Vector is a rank-1 Tensor. Note in particular that EigenVector<T>::From converts a one-dimensional Paddle Tensor into a one-dimensional Eigen Tensor, represented here by an EigenVector, while EigenVector<T>::Flatten reshapes a Paddle Tensor of any shape and flattens it into a one-dimensional Eigen Tensor; the type is still EigenVector.

-For more conversion methods, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in eigen_test.cc.
+For more conversion methods, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in eigen_test.cc.
...
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {

### Converting paddle::framework::Tensor to EigenTensor

-As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.

Using EigenTensor as an example:

@@ -125,7 +125,7 @@ EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);

In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor, while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.

-For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
...
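The conversions the two documents above describe can be exercised end to end in a few lines. Below is a minimal sketch, assuming the Fluid-era header paths and the `make_ddim`/`mutable_data` calls of that API; treat it as an illustration rather than code from this commit:

```cpp
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

namespace fw = paddle::framework;

int main() {
  // Allocate a 2x3 float tensor on the CPU.
  fw::Tensor t;
  float* data =
      t.mutable_data<float>(fw::make_ddim({2, 3}), paddle::platform::CPUPlace());
  for (int i = 0; i < 6; ++i) data[i] = static_cast<float>(i);

  // From: view the tensor as a rank-2 EigenTensor (no copy is made).
  auto et = fw::EigenTensor<float, 2>::From(t);

  // Flatten: view the same storage as a rank-1 EigenVector (still no copy).
  auto ev = fw::EigenVector<float>::Flatten(t);
  ev = ev * 2.0f;  // scales every element of `t` in place

  return et.dimension(0) == 2 ? 0 : 1;  // the rank-2 view keeps the 2x3 shape
}
```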
## How to use the timeline tool to profile
1. Add `with profiler.profiler(...)` to the main training loop. After the run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information; the profile record grows with the number of batches.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default it generates another file, `/tmp/timeline`. You can change the path with a command-line parameter; see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1. Open Chrome, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The resulting timeline should look like this:
![chrome timeline](./timeline.jpeg)
@@ -2,17 +2,17 @@
 Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
-Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
-Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
-Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
-Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform

 # Compile Time

-The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).

 ```python
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost)
 - Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
 - Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
   - Every Layer has one or more operators and variables/parameters
-  - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
-    - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
-    - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
-    - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+  - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files:
+    - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
+    - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
+    - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
 - Optimizer: `fluid.optimizer.SGD`. It does the following
   - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
   - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
@@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(),
         fetch_list=[avg_cost])
 ```
-- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
-  - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
-- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
+  - The device handles are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
   - Feeds the data: `feed=feeder.feed(data)`
   - Evaluates all the operators
   - Fetches the result: `fetch_list=[avg_cost]`
 - Other worth looking files:
-  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
-  - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
-  - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
+  - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
+  - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
+  - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
@@ -34,15 +34,15 @@ PaddlePaddle can be installed with common Python package-management tools
    :align: center

.. csv-table:: Latest whl package of each version
-   :header: "Version", "cp27-cp27mu", "cp27-cp27m", "C-API"
-   :widths: 1, 3, 3, 3
+   :header: "Version", "cp27-cp27mu", "cp27-cp27m"
+   :widths: 1, 3, 3

-   "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not available"
+   "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not available"
+   "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"

.. _pip_dependency:
...
@@ -37,15 +37,15 @@ If the links below show a login form, just click "Log in as guest" to start the download:
    :align: center

.. csv-table:: whl package of each version
-   :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
-   :widths: 1, 3, 3, 3
+   :header: "version", "cp27-cp27mu", "cp27-cp27m"
+   :widths: 1, 3, 3

-   "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+   "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+   "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"

.. _pip_dependency:
...
Getting Started
===============

+If you want to get up to speed with PaddlePaddle quickly, refer to the following guide:

.. toctree::
   :maxdepth: 1

   quickstart_cn.rst

+When building applications with PaddlePaddle, you need to understand a few basic concepts.
+Taking linear regression as an example, the following walks through the PaddlePaddle workflow in detail, including data formats, model configuration, and training:

+.. toctree::
+   :maxdepth: 1

   concepts/use_concepts_cn.rst
## Installing, compiling, and linking the C-API inference library

### Direct download and installation

Download the latest C-API development package from the CI system and install it; the table below lists the available versions:

<table>
<thead>
<tr>
<th>Version</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>Not available</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>

### Compiling from source

You can also compile the C-API library from the PaddlePaddle core code by setting the following options at compile time:

<table>
<thead>
<tr>
<th>Option</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF (recommended)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>

Following the recommended values avoids linking unnecessary libraries. Other optional compile options can be set as needed.

The following snippet pulls the latest code from github and configures the compile options (replace PADDLE_ROOT with the installation path of the PaddlePaddle inference library):
@@ -100,23 +158,19 @@ cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \

Three linking approaches are currently provided:

1. Link the dynamic library `libpaddle_capi_shared.so` (this is the simplest approach and the easiest to link; **it is recommended unless you have special requirements**). Note:
   1. If the CPU version was built with the `OpenBLAS` math library, inference programs built on the C-API only need to link this single library, `libpaddle_capi_shared.so`.
   1. If the CPU version was built with the `MKL` math library, since `MKL` ships its own dynamic libraries, inference programs need to link the MKL libraries themselves.
   1. If the GPU version was built, the CUDA libraries are loaded dynamically when the inference program runs, so the CUDA-related libraries must be added to the `LD_LIBRARY_PATH` environment variable.

2. Link the static library `libpaddle_capi_whole.a`. Note:
   1. The `-Wl,--whole-archive` link option must be specified.
   1. Third-party libraries such as `gflags`, `glog`, `libz`, and `protobuf` must be linked explicitly; they can be found under `PADDLE_ROOT/third_party`.
   1. If the C-API was built with the OpenBLAS math library, `libopenblas.a` must be linked explicitly.
   1. If the C-API was built with the MKL math library, the MKL dynamic libraries must be linked explicitly.

3. Link the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`. Note:
   1. This approach is mainly used for inference on mobile devices.
   1. `libpaddle_capi_whole.a` was split into these two static libraries to reduce the size of the produced library.
   1. Link with `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine`.
   1. Third-party dependencies must be linked explicitly, in the same way as in approach 2.
Advanced Usage
==============

+PaddlePaddle lets users flexibly set various command-line parameters to control model training and inference. For usage, see:

.. toctree::
   :maxdepth: 1

   cmd_parameter/index_cn.rst

+PaddlePaddle supports distributed training jobs on fabric clusters, MPI clusters, and kubernetes clusters. For environment configuration and usage instructions, see:

+.. toctree::
+   :maxdepth: 1

   cluster/index_cn.rst

+PaddlePaddle provides a C-API for inference. For how to use the C-API, we provide the following guide:

+.. toctree::
+   :maxdepth: 1

   capi/index_cn.rst

+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For configuration and usage, see:

+.. toctree::
+   :maxdepth: 1

   rnn/index_cn.rst

+For how to use the built-in timing tools, nvprof, or nvvp for performance analysis and tuning, see:

+.. toctree::
+   :maxdepth: 1

   optimization/gpu_profiling_cn.rst
@@ -22,7 +22,7 @@

pooling
========

-An example of using pooling follows; for details see the :ref:`api_v2.layer_pooling` configuration API.
+An example of using pooling follows.

.. code-block:: bash

@@ -47,7 +47,7 @@ An example of using pooling follows.

last_seq and first_seq
=====================

-An example of using last_seq follows (:ref:`api_v2.layer_first_seq` is similar); for details see the :ref:`api_v2.layer_last_seq` configuration API.
+An example of using last_seq follows (first_seq is similar).

.. code-block:: bash

@@ -68,7 +68,7 @@ An example of using last_seq follows (first_seq is similar).

expand
======

-An example of using expand follows; for details see the :ref:`api_v2.layer_expand` configuration API.
+An example of using expand follows.

.. code-block:: bash
...
@@ -4,7 +4,7 @@

Comparison of the single- and double-layer RNN APIs
#####################

-This article uses PaddlePaddle's double-layer RNN unit tests as examples: several pairs of models that produce exactly the same results, configured with a single-layer and a double-layer RNN respectively, illustrate how to use double-layer RNNs. All the examples here only introduce the double-layer RNN API; they do not apply double-layer RNNs to real problems. To see double-layer RNNs used on a concrete problem, refer to :ref:`algo_hrnn_demo`. The unit-test file used by the examples is `test_RecurrentGradientMachine.cpp <https://github.com/reyoung/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_.
+This article uses PaddlePaddle's double-layer RNN unit tests as examples: several pairs of models that produce exactly the same results, configured with a single-layer and a double-layer RNN respectively, illustrate how to use double-layer RNNs. All the examples here only introduce the double-layer RNN API; they do not apply double-layer RNNs to real problems. To see double-layer RNNs used on a concrete problem, refer to :ref:`algo_hrnn_demo`. The unit-test file used by the examples is `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_.

Example 1: double-layer RNN with no memory between subsequences
================================

@@ -166,11 +166,6 @@

In the code above, the use of single- and double-layer sequences is similar to Example 2; the difference is that two inputs are processed at the same time, and for the double-layer sequences the subsequence lengths of the two inputs differ. However, we used the :code:`targetInlink` parameter to set the output format of the outer :code:`recurrent_group`, so the shape of the outer output sequence matches the shape of :code:`emb2`.

-Example 4: generation with beam_search
-========================
-
-TBD

Glossary
======
...
RNN Models
===========

+Recurrent neural networks (RNNs) are an important tool for modeling sequence data. PaddlePaddle provides flexible interfaces for building complex recurrent neural networks.
+The following four parts describe in detail how to build recurrent neural networks with PaddlePaddle.
+The first part gives a progressive overview: it starts with a simple vanilla RNN to show how to encapsulate and configure RNN components,
+and then, through a sequence-to-sequence model, explains step by step how to build a complete and complex RNN model.

.. toctree::
   :maxdepth: 1

   rnn_config_cn.rst

+Recurrent Group is the key to implementing complex recurrent neural networks in PaddlePaddle. The second part explains the concepts and principles behind Recurrent Group and documents its interface in detail.
+It also introduces double-layer RNNs (whose inputs are double-layer sequences) and how Recurrent Group is used in them.

+.. toctree::
+   :maxdepth: 1

   recurrent_group_cn.md

+The third part explains double-layer sequences, listing the layers in PaddlePaddle that accept double-layer sequences as input and introducing each of them.

+.. toctree::
+   :maxdepth: 1

   hierarchical_layer_cn.rst

+The fourth part uses the network configurations from PaddlePaddle's double-layer RNN unit tests as examples, contrasted with equivalent single-layer RNN configurations, to explain how to use double-layer RNNs in a variety of situations.

+.. toctree::
+   :maxdepth: 1

   hrnn_rnn_api_compare_cn.rst
@@ -5,3 +5,4 @@ add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(inference)
 add_subdirectory(string)
+add_subdirectory(recordio)
@@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-if (WITH_GPU)
+if(WITH_GPU)
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
-endif ()
+endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
-if (WITH_GPU)
+if(WITH_GPU)
   nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
 else()
   cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
@@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
 nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry init math_function)
-cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
-cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+if(WITH_GPU)
+  nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+else()
+  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+  cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+endif()
 cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
 cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
...
@@ -28,24 +28,19 @@ class Channel {
   virtual bool Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
+  virtual void Lock() = 0;
+  virtual void Unlock() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
 };

 // Forward declaration of channel implementations.
-namespace details {
 template <typename T>
-class Buffered;
+class ChannelImpl;
-template <typename T>
-class UnBuffered;
-}  // namespace details

 template <typename T>
 Channel<T>* MakeChannel(size_t buffer_size) {
-  if (buffer_size > 0) {
-    return new details::Buffered<T>(buffer_size);
-  }
-  return new details::UnBuffered<T>();
+  return new ChannelImpl<T>(buffer_size);
 }

 template <typename T>
@@ -89,6 +84,19 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Close();
   }

+  size_t Cap() {
+    if (IsInitialized()) return holder_->Cap();
+    return -1;
+  }
+
+  void Lock() {
+    if (IsInitialized()) holder_->Lock();
+  }
+
+  void Unlock() {
+    if (IsInitialized()) holder_->Unlock();
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }

   inline const std::type_index Type() {
@@ -106,6 +114,9 @@ class ChannelHolder {
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
     virtual void Close() = 0;
+    virtual void Lock() = 0;
+    virtual void Unlock() = 0;
+    virtual size_t Cap() = 0;
   };

   template <typename T>
@@ -115,11 +126,28 @@
     }

     virtual const std::type_index Type() const { return type_; }
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }

     virtual void Close() {
       if (channel_) channel_->Close();
     }

+    virtual size_t Cap() {
+      if (channel_)
+        return channel_->Cap();
+      else
+        return -1;
+    }
+
+    virtual void Lock() {
+      if (channel_) channel_->Lock();
+    }
+
+    virtual void Unlock() {
+      if (channel_) channel_->Unlock();
+    }
+
     std::unique_ptr<Channel<T>> channel_;
     const std::type_index type_;
   };
@@ -131,5 +159,4 @@ class ChannelHolder {

 }  // namespace framework
 }  // namespace paddle

-#include "paddle/fluid/framework/details/buffered_channel.h"
-#include "paddle/fluid/framework/details/unbuffered_channel.h"
+#include "paddle/fluid/framework/channel_impl.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
#include <condition_variable>
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T> *);

 public:
  virtual bool Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
  virtual void Unlock();
  virtual void Close();

  ChannelImpl(size_t);
  virtual ~ChannelImpl();

 private:
  struct QueueMessage {
    T *data;
    std::condition_variable_any cond;
    bool chan_closed = false;
    bool completed = false;

    QueueMessage(T *item) : data(item) {}

    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
      cond.wait(lock, [this]() { return completed; });
    }

    void Notify() {
      completed = true;
      cond.notify_all();
    }
  };

  bool send_return(bool value) {
    send_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  bool recv_return(bool value) {
    recv_ctr--;
    destructor_cond_.notify_all();
    return value;
  }

  size_t cap_;
  std::recursive_mutex mu_;
  bool closed_;
  std::deque<T> buf_;
  std::deque<std::shared_ptr<QueueMessage>> recvq;
  std::deque<std::shared_ptr<QueueMessage>> sendq;
  std::atomic<unsigned> send_ctr{0};
  std::atomic<unsigned> recv_ctr{0};
  std::condition_variable_any destructor_cond_;
};

template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
  PADDLE_ENFORCE_GE(capacity, 0);
}
template <typename T>
bool ChannelImpl<T>::Send(T *item) {
send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, do nothing
if (closed_) {
lock.unlock();
// TODO(abhinavarora) Should panic on closed channel
return send_return(false);
}
// If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
// Do the data transfer
*(m->data) = std::move(*item);
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return send_return(true);
}
// Unbuffered channel will always bypass this
// If buffered channel has space in buffer,
// write the element to the buffer.
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
// Release lock and return true
lock.unlock();
return send_return(true);
}
// Block on channel, because some receiver will complete
// the operation for us
auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m);
m->Wait(lock);
// TODO(abhinavarora) Should panic on closed channel
return send_return(!m->chan_closed);
}
template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
recv_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If the channel is closed and the buffer is empty (which is
// always the case for an unbuffered channel), fail the receive
if (closed_ && buf_.empty()) {
lock.unlock();
return recv_return(false);
}
// If there is a sender, directly receive the value we want
// from the sender, bypassing the channel buffer if any
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
// Do the data transfer
*item = std::move(*(m->data));
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return recv_return(true);
}
// If this is a buffered channel and there are items in buffer
if (buf_.size() > 0) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
// Release lock and return true
lock.unlock();
return recv_return(true);
}
// No sender available, block on this channel
// Some sender will complete the operation for us
auto m = std::make_shared<QueueMessage>(item);
recvq.push_back(m);
m->Wait(lock);
return recv_return(!m->chan_closed);
}
template <typename T>
void ChannelImpl<T>::Lock() {
mu_.lock();
}
template <typename T>
void ChannelImpl<T>::Unlock() {
mu_.unlock();
}
template <typename T>
void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_};
if (closed_) {
// TODO(abhinavarora): closing an already closed channel should panic
lock.unlock();
return;
}
closed_ = true;
// Empty the readers
while (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
m->chan_closed = true;
m->Notify();
}
// Empty the senders
while (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
m->chan_closed = true;
m->Notify();
}
}
template <typename T>
ChannelImpl<T>::~ChannelImpl() {
Close();
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
std::unique_lock<std::recursive_mutex> lock{mu_};
destructor_cond_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
} // namespace framework
} // namespace paddle
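To make the semantics above concrete, here is a short, hedged sketch of the public Channel API (MakeChannel, Send/Receive, and CloseChannel, as used by the tests below); an illustration, not part of the change:

#include <thread>
#include "paddle/fluid/framework/channel.h"

// Sketch: one producer and one consumer over a buffered channel.
void ChannelUsageSketch() {
  auto *ch = paddle::framework::MakeChannel<int>(2);  // capacity 2
  std::thread producer([ch]() {
    for (int i = 0; i < 5; ++i) {
      int v = i;
      if (!ch->Send(&v)) break;  // Send returns false once the channel is closed
    }
  });
  int recv;
  for (int i = 0; i < 5; ++i) {
    if (!ch->Receive(&recv)) break;  // blocks until a sender arrives or Close()
  }
  paddle::framework::CloseChannel(ch);
  producer.join();
  delete ch;
}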
...@@ -23,8 +23,19 @@ using paddle::framework::Channel;
using paddle::framework::ChannelHolder;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
TEST(Channel, ChannelCapacityTest) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
CloseChannel(ch);
delete ch;
ch = MakeChannel<size_t>(0);
EXPECT_EQ(ch->Cap(), 0U);
CloseChannel(ch);
delete ch;
}
void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
unsigned sum_send = 0;
...@@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
}
});
for (int i = 0; i < 5; i++) {
int recv = 999;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
TEST(Channel, MakeAndClose) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
{
// MakeChannel should return a buffered channel if buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel if buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
...@@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
...@@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
EXPECT_EQ(sum, 45U);
CloseChannel(ch);
t.join();
delete ch;
...@@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
...@@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
...@@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
...@@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
...@@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
...@@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksSendersTest(ch, true);
delete ch;
}
...@@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
// unblocks any senders waiting for receivers
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelCloseUnblocksSendersTest(ch, false);
delete ch;
}
...@@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
...@@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
...@@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
...@@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
bool is_buffered_channel = false;
if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
...@@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
...@@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockSenders(ch, true);
}
// This tests that destroying an unbuffered channel also unblocks
...@@ -507,7 +488,20 @@ TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockSenders(ch, false);
}
TEST(ChannelHolder, ChannelHolderCapacityTest) {
const size_t buffer_size = 10;
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
delete ch;
ch = new ChannelHolder();
ch->Reset<int>(0);
EXPECT_EQ(ch->Cap(), 0U);
delete ch;
}
void ChannelHolderSendReceive(ChannelHolder *ch) {
...@@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
...@@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
// This should unblock all receivers
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
...@@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
...@@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
...@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
// This should unblock all senders
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
...@@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
...@@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
......
...@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
PassTensorData(&out, &in);
}
// do data type transform
if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
transformed = true;
......
...@@ -16,13 +16,16 @@ limitations under the License. */
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
inline proto::VarType::Type ToDataType(std::type_index type) {
using namespace paddle::framework::proto;
if (typeid(platform::float16).hash_code() == type.hash_code()) {
return proto::VarType::FP16;
} else if (typeid(float).hash_code() == type.hash_code()) {
return proto::VarType::FP32;
} else if (typeid(double).hash_code() == type.hash_code()) {
return proto::VarType::FP64;
...@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
inline std::type_index ToTypeIndex(proto::VarType::Type type) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
return typeid(platform::float16);
case proto::VarType::FP32:
return typeid(float);
case proto::VarType::FP64:
...@@ -59,6 +64,9 @@ template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
visitor.template operator()<platform::float16>();
break;
case proto::VarType::FP32:
visitor.template operator()<float>();
break;
......
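As a side note, the visitor dispatch above can be exercised as follows; this is a hedged sketch, and the PrintSize functor is hypothetical (only VisitDataType and the proto types come from the header above):

#include <iostream>
#include "paddle/fluid/framework/data_type.h"

// Hypothetical visitor: prints the byte width of the visited element type.
struct PrintSize {
  template <typename T>
  void operator()() const {
    std::cout << sizeof(T) << " bytes\n";
  }
};

// paddle::framework::VisitDataType(paddle::framework::proto::VarType::FP16, PrintSize());  // float16 -> 2 bytes
// paddle::framework::VisitDataType(paddle::framework::proto::VarType::FP32, PrintSize());  // float   -> 4 bytes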
...@@ -47,9 +47,16 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
PADDLE_THROW("Unsupported place!");
}
}
};
...@@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
......
data_type_transform.cc
\ No newline at end of file
...@@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) {
auto place = CPUPlace();
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor out;
float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i / 3;
}
TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
}
}
// data type transform from/to float16
{
Tensor in;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i;
}
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "gtest/gtest.h"
TEST(DataTypeTransform, GPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto cpu_place = CPUPlace();
auto gpu_place = CUDAPlace(0);
CUDADeviceContext context(gpu_place);
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(in_ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
}
TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
}
}
// data type transform from/to float16
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
float16 arr[6] = {float16(0), float16(1), float16(2),
float16(3), float16(4), float16(5)};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double =
in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 =
in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
// Four properties of a Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private:
size_t cap_;
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::condition_variable destructor_cond_var_;
std::deque<T> channel_;
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
template <typename T>
bool Buffered<T>::Send(T* item) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
send_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
bool Buffered<T>::Receive(T* item) {
bool ret = false;
// Once the channel has been closed and all data has been consumed,
// just return false. Don't even try acquiring the mutex.
if (closed_ && channel_.empty()) {
return false;
}
recv_ctr++;
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
if (!channel_.empty()) {
*item = std::move(channel_.front());
channel_.pop_front();
full_cond_var_.notify_one();
ret = true;
}
recv_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
void Buffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllParticipants(&lock);
}
template <typename T>
Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllParticipants(&lock);
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
lock.lock();
destructor_cond_var_.wait(
lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}
template <typename T>
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
empty_cond_var_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
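To make the closed-channel properties listed above concrete, here is a hedged sketch using the public Channel API (MakeChannel and CloseChannel from channel.h, independent of which implementation backs them); illustrative only:

#include "paddle/fluid/framework/channel.h"

// Sketch: sends and receives on a closed channel fail immediately.
void ClosedChannelSketch() {
  auto *ch = paddle::framework::MakeChannel<int>(1);
  paddle::framework::CloseChannel(ch);
  int v = 7;
  bool sent = ch->Send(&v);         // false: send to a closed channel fails
  bool received = ch->Receive(&v);  // false: buffer is empty and channel is closed
  (void)sent;
  (void)received;
  delete ch;
}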
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {
// Four properties of an UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
std::mutex mu_ch_;
// Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
T* item{nullptr};
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
send_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
bool ret = false;
// If channel is closed, we don't even want any reader to enter.
// Unlike a buffered channel, an unbuffered channel does not allow
// readers to read after closing because there is no buffer to be consumed.
if (closed_) return ret;
recv_ctr++;
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
recv_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T>
UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
lock.lock();
cv_destructor_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
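The rendezvous behaviour described above can be sketched as follows, again via the public API (a sketch under the same assumptions, not part of the change):

#include <thread>
#include "paddle/fluid/framework/channel.h"

// Sketch: on an unbuffered channel, Send blocks until a Receive pairs with it.
void RendezvousSketch() {
  auto *ch = paddle::framework::MakeChannel<int>(0);  // capacity 0 => unbuffered
  std::thread receiver([ch]() {
    int v = 0;
    ch->Receive(&v);  // pairs with the Send below
  });
  int x = 42;
  ch->Send(&x);  // blocks until the receiver arrives
  receiver.join();
  paddle::framework::CloseChannel(ch);
  delete ch;
}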
...@@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
...@@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
return shapes_[idx];
}
void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimized.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// If buffer_ is empty, 'out' is returned as an empty vector.
}
void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// If buffer_ is empty, 'out' is returned as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shape and check data type
std::type_index batch_type = buffer_[0][j].type();
DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
LoD batch_lod;
for (size_t i = 0; i < buffer_.size(); ++i) {
DDim ins_shape = buffer_[i][j].dims();
LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
} // namespace framework
} // namespace paddle
...@@ -26,7 +26,6 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty());
}
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual bool HasNext() const = 0;
virtual void ReInit() = 0;
...@@ -52,91 +51,14 @@ class DecoratedReader : public ReaderBase {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
bool HasNext() const override { return reader_->HasNext(); }
void ReInit() override { reader_->ReInit(); }
protected:
ReaderBase* reader_;
};
// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
template <typename T>
class RandomDataGenerator : public FileReader {
public:
RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
bool HasNext() const override { return true; }
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
// decorated readers
class ShuffleReader : public DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<LoDTensor>> buffer_;
size_t iteration_pos_;
};
class BatchReader : public DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<LoDTensor>> buffer_;
};
// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
class ReaderHolder {
public:
void Reset(ReaderBase* reader) { reader_.reset(reader); }
...@@ -144,7 +66,6 @@ class ReaderHolder {
ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
bool HasNext() const { return reader_->HasNext(); }
void ReInit() { reader_->ReInit(); }
DDim shape(size_t idx) const { return reader_->shape(idx); }
......
...@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
......
...@@ -235,27 +235,53 @@ TEST(TensorToVector, Tensor) {
TEST(TensorContainsNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor src;
float* buf = src.mutable_data<float>({3}, CPUPlace());
buf[0] = 0.0;
buf[1] = NAN;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsNAN(src));
buf[1] = 0.0;
ASSERT_FALSE(TensorContainsNAN(src));
}
{
Tensor src;
float16* buf = src.mutable_data<float16>({3}, CPUPlace());
buf[0] = 0.0;
buf[1].x = 0x7fff;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsNAN(src));
buf[1] = 0.0;
ASSERT_FALSE(TensorContainsNAN(src));
}
}
TEST(TensorContainsInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor src;
double* buf = src.mutable_data<double>({3}, CPUPlace());
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsInf(src));
buf[1] = 1.0;
ASSERT_FALSE(TensorContainsInf(src));
}
{
Tensor src;
float16* buf = src.mutable_data<float16>({3}, CPUPlace());
buf[0] = 1.0;
buf[1].x = 0x7c00;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsInf(src));
buf[1] = 1.0;
ASSERT_FALSE(TensorContainsInf(src));
}
}
TEST(Tensor, FromAndToStream) {
......
...@@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) {
buf[1] = 0.1;
buf[2] = NAN;
}
static __global__ void FillInf(float* buf) {
buf[0] = 0.0;
buf[1] = INFINITY;
buf[2] = 0.5;
}
static __global__ void FillNAN(platform::float16* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2].x = 0x7fff;
}
static __global__ void FillInf(platform::float16* buf) {
buf[0] = 0.0;
buf[1].x = 0x7c00;
buf[2] = 0.5;
}
TEST(TensorContainsNAN, GPU) {
using namespace paddle::platform;
CUDAPlace gpu(0);
auto& pool = DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
} }
TEST(TensorContainsInf, GPU) { TEST(TensorContainsInf, GPU) {
Tensor tensor; using namespace paddle::platform;
platform::CUDAPlace gpu(0); CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance(); auto& pool = DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu); auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu); {
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); Tensor tensor;
cuda_ctx->Wait(); float* buf = tensor.mutable_data<float>({3}, gpu);
ASSERT_TRUE(TensorContainsInf(tensor)); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
} }
} // namespace framework } // namespace framework
......
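The GPU variants rely on cuda_ctx->Wait() because work queued on a CUDA stream runs asynchronously; without the wait, TensorContainsNAN could inspect the buffer before FillNAN has executed. A minimal host-side sketch of the same pattern (plain CUDA runtime API, not Paddle code):

#include <cuda_runtime.h>

void FillThenCheck(cudaStream_t stream, const float* device_buf,
                   float* host_buf, size_t n) {
  // ... a fill kernel would be launched on `stream` here (asynchronous) ...
  cudaMemcpyAsync(host_buf, device_buf, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);  // also asynchronous
  cudaStreamSynchronize(stream);  // corresponds to cuda_ctx->Wait() above
  // Only now is host_buf safe to read and assert on.
}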
@@ -67,10 +67,10 @@ class ThreadPool {
         } catch (platform::EnforceNotMet ex) {
           return std::unique_ptr<platform::EnforceNotMet>(
               new platform::EnforceNotMet(ex));
-        } catch (...) {
-          LOG(FATAL)
-              << "Unexpected exception caught in thread pool. All "
-                 "throwable exceptions in Fluid should be EnforceNotMet.";
+        } catch (const std::exception& e) {
+          LOG(FATAL) << "Unexpected exception caught in thread pool. All "
+                        "throwable exceptions in Fluid should be EnforceNotMet."
+                     << e.what();
         }
         return nullptr;
       });
......
@@ -5,7 +5,8 @@ cc_library(paddle_fluid_api
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 # Create static library
-cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+cc_library(paddle_fluid DEPS ${fluid_modules})
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
......
@@ -22,14 +22,14 @@ namespace paddle {
 namespace inference {
 
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
-  VLOG(3) << "loading model from " << filename;
-  std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
-  inputfs.seekg(0, std::ios::end);
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
   contents.clear();
-  contents.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  inputfs.read(&contents[0], contents.size());
-  inputfs.close();
+  contents.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&contents[0], contents.size());
+  fin.close();
 }
 
 bool IsPersistable(const framework::VarDesc* var) {
@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
+  VLOG(3) << "loading model from " << model_filename;
   ReadBinaryFile(model_filename, program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
......
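ReadBinaryFile uses the usual seek-to-end/tellg idiom to size the destination buffer before a single read. A self-contained sketch of the same pattern, with the failure check raised as an exception instead of PADDLE_ENFORCE:

#include <fstream>
#include <stdexcept>
#include <string>

std::string ReadWholeFile(const std::string& filename) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  if (!fin) throw std::runtime_error("Cannot open file " + filename);
  fin.seekg(0, std::ios::end);                        // learn the file size
  std::string contents(static_cast<size_t>(fin.tellg()), '\0');
  fin.seekg(0, std::ios::beg);                        // rewind and read it all
  fin.read(&contents[0], contents.size());
  return contents;
}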
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Run the inference program repeat times");
 
 TEST(inference, image_classification) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normalized image pixels as input data,
   // which should be in the range [0.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 3, 32, 32},
+                     {FLAGS_batch_size, 3, 32, 32},
                      static_cast<float>(0),
                      static_cast<float>(1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
   cpu_fetchs1.push_back(&output1);
 
   // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << "--- CPU Runs: ---";
+  TestInference<paddle::platform::CPUPlace>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
   cpu_fetchs2.push_back(&output2);
 
   // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << "--- GPU Runs: ---";
+  TestInference<paddle::platform::CUDAPlace>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
......
@@ -17,10 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_int32(batch_size, 1, "Batch size of input data");
+DEFINE_int32(repeat, 1, "Run the inference program repeat times");
 
 TEST(inference, recognize_digits) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
+                  "--batch_size=1 --repeat=1";
   }
 
   LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
@@ -29,77 +32,39 @@ TEST(inference, recognize_digits) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  int64_t batch_size = 1;
-
   paddle::framework::LoDTensor input;
   // Use normalized image pixels as input data,
   // which should be in the range [-1.0, 1.0].
   SetupTensor<float>(input,
-                     {batch_size, 1, 28, 28},
+                     {FLAGS_batch_size, 1, 28, 28},
                      static_cast<float>(-1),
                      static_cast<float>(1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
+  for (auto is_combined : {false, true}) {
+    paddle::framework::LoDTensor output1;
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    cpu_fetchs1.push_back(&output1);
 
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CPUPlace>(
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+    LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
+    paddle::framework::LoDTensor output2;
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+    cpu_fetchs2.push_back(&output2);
 
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+    // Run inference on CUDA GPU
+    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
+    TestInference<paddle::platform::CUDAPlace>(
+        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+    LOG(INFO) << output2.dims();
 
-  CheckError<float>(output1, output2);
+    CheckError<float>(output1, output2);
 #endif
+  }
 }
-
-TEST(inference, recognize_digits_combine) {
-  if (FLAGS_dirname.empty()) {
-    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
-  }
-
-  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
-  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
-
-  paddle::framework::LoDTensor input;
-  // Use normalized image pixels as input data,
-  // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(
-      input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
-
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
-
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
-
-#ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
-
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace, true>(
-      dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
-
-  CheckError<float>(output1, output2);
-#endif
-}
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
 
 template <typename T>
 void SetupTensor(paddle::framework::LoDTensor& input,
@@ -87,31 +88,60 @@ void CheckError(paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
-template <typename Place, bool IsCombined = false>
+template <typename Place>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const int repeat = 1,
+                   const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
   auto* scope = new paddle::framework::Scope();
 
+  // Profile the performance
+  paddle::platform::ProfilerState state;
+  if (paddle::platform::is_cpu_place(place)) {
+    state = paddle::platform::ProfilerState::kCPU;
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    state = paddle::platform::ProfilerState::kCUDA;
+    // The default device_id of paddle::platform::CUDAPlace is 0.
+    // Users can get the device_id using:
+    //   int device_id = place.GetDeviceId();
+    paddle::platform::SetDeviceId(0);
+#else
+    PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+  }
+
+  // Enable the profiler
+  paddle::platform::EnableProfiler(state);
+
   // 2. Initialize the inference_program and load parameters
   std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
-  if (IsCombined) {
-    // All parameters are saved in a single file.
-    // Hard-coding the file names of program and parameters in unittest.
-    // The file names should be consistent with that used in Python API
-    // `fluid.io.save_inference_model`.
-    std::string prog_filename = "__model_combined__";
-    std::string param_filename = "__params_combined__";
-    inference_program = paddle::inference::Load(executor,
-                                                *scope,
-                                                dirname + "/" + prog_filename,
-                                                dirname + "/" + param_filename);
-  } else {
-    // Parameters are saved in separate files located in the specified `dirname`.
-    inference_program = paddle::inference::Load(executor, *scope, dirname);
+  {
+    paddle::platform::RecordEvent record_event(
+        "init_program",
+        paddle::platform::DeviceContextPool::Instance().Get(place));
+
+    if (is_combined) {
+      // All parameters are saved in a single file.
+      // Hard-coding the file names of program and parameters in unittest.
+      // The file names should be consistent with that used in Python API
+      // `fluid.io.save_inference_model`.
+      std::string prog_filename = "__model_combined__";
+      std::string param_filename = "__params_combined__";
+      inference_program =
+          paddle::inference::Load(executor,
                                  *scope,
                                  dirname + "/" + prog_filename,
                                  dirname + "/" + param_filename);
+    } else {
+      // Parameters are saved in separate files located in the specified
+      // `dirname`.
+      inference_program = paddle::inference::Load(executor, *scope, dirname);
+    }
   }
 
   // 3. Get the feed_target_names and fetch_target_names
@@ -134,7 +164,21 @@ void TestInference(const std::string& dirname,
   }
 
   // 6. Run the inference program
-  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+  {
+    // Run repeat times to profile the performance
+    for (int i = 0; i < repeat; ++i) {
+      paddle::platform::RecordEvent record_event(
+          "run_inference",
+          paddle::platform::DeviceContextPool::Instance().Get(place));
+
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    }
+  }
+
+  // Disable the profiler and print the timing information
+  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
                                    "profiler.txt");
+  paddle::platform::ResetProfiler();
 
   delete scope;
 }
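RecordEvent is used as an RAII guard above: the event spans from the object's construction to the end of its enclosing scope, which is why both the Load call and the executor.Run loop are wrapped in extra braces. A minimal illustration of that scoping idea (an illustrative timer, not Paddle's profiler API):

#include <chrono>
#include <iostream>
#include <string>
#include <utility>

class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << ": " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void RunInferenceNTimes(int repeat) {
  for (int i = 0; i < repeat; ++i) {
    ScopedTimer timer("run_inference");  // plays the role of RecordEvent
    // executor.Run(...) would go here; timing stops when `timer` dies.
  }
}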
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
 file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
@@ -13,6 +15,8 @@ function(op_library TARGET)
   set(cu_cc_srcs)
   set(cudnn_cu_cc_srcs)
   set(CUDNN_FILE)
+  set(mkldnn_cc_srcs)
+  set(MKLDNN_FILE)
   set(op_common_deps operator op_registry math_function)
   set(options "")
   set(oneValueArgs "")
@@ -36,12 +40,20 @@ function(op_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
       list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
     endif()
+    if(WITH_MKLDNN)
+      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
+        list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
+      endif()
+    endif()
   else()
     foreach(src ${op_library_SRCS})
       if (${src} MATCHES ".*\\.cu$")
        list(APPEND cu_srcs ${src})
      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
        list(APPEND cudnn_cu_cc_srcs ${src})
+      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
+        list(APPEND mkldnn_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
@@ -62,15 +74,15 @@ function(op_library TARGET)
     set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
   endif()
   if (WITH_GPU)
-    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   else()
-    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   endif()
 
   # Define operators that don't need pybind here.
-  foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
+  foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
     if ("${TARGET}" STREQUAL "${manual_pybind_op}")
       set(pybind_flag 1)
     endif()
@@ -101,7 +113,8 @@ function(op_library TARGET)
   # pybind USE_CPU_ONLY_OP
   list(LENGTH cu_srcs cu_srcs_len)
   list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-  if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+  list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
     file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
     set(pybind_flag 1)
   endif()
@@ -112,6 +125,11 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
   endif()
 
+  # pybind USE_OP_DEVICE_KERNEL for MKLDNN
+  if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+  endif()
+
   # pybind USE_OP
   if (${pybind_flag} EQUAL 0)
     file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
@@ -128,8 +146,8 @@ else()
   set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
 
-add_subdirectory(detail)
 if(WITH_DISTRIBUTE)
+  add_subdirectory(detail)
   set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   op_library(send_op DEPS ${DISTRIBUTE_DEPS})
@@ -170,26 +188,31 @@ op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
-op_library(create_reader_op DEPS reader)
 if (WITH_GPU)
-  op_library(conv_op DEPS vol2col depthwise_conv)
+  op_library(conv_op DEPS vol2col depthwise_conv im2col)
 else()
-  op_library(conv_op DEPS vol2col)
+  op_library(conv_op DEPS vol2col im2col)
 endif()
-op_library(conv_transpose_op DEPS vol2col)
+op_library(conv_transpose_op DEPS vol2col im2col)
 
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(concat_op DEPS concat)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
   op_library(${src})
 endforeach()
 
-file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+
+add_subdirectory(reader)
+foreach(src ${READER_LIBRARY})
+  set(OP_LIBRARY ${src} ${OP_LIBRARY})
+endforeach()
 
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
@@ -63,13 +63,27 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
   }
 };
 
+class CastOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // CastOp kernel's device type is decided by the input tensor's place.
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
-REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
-                        ops::CastOpProtoMaker);
+REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker,
+                  ops::CastOpInferShape, ops::CastOpProtoMaker);
 REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, double>,
                        ops::CastOpKernel<CPU, int>,
......
@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
                ops::ConcatOpGrad, false)
-REGISTER_OP_CPU_KERNEL(concat,
-                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
-REGISTER_OP_CPU_KERNEL(concat_grad,
-                       ops::ConcatGradKernel<paddle::platform::CPUPlace, float>)
+REGISTER_OP_CPU_KERNEL(
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
+REGISTER_OP_CPU_KERNEL(
+    concat_grad,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
 namespace paddle {
@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
+    framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
     auto place = ctx.GetPlace();
     out->mutable_data<T>(place);
 
-    auto out_stride = framework::stride_numel(out->dims());
-
-    size_t output_offset = 0;
-
-    // If axis >= 1, copying straight to out would need many cuda memcpy
-    // calls. Copy the inputs to cpu, do the strided copy there,
-    // then copy the result to the gpu output.
-    if (platform::is_gpu_place(place) && axis >= 1) {
-      platform::CPUPlace copy_place;
-      auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
-      framework::Tensor cpu_out;
-      cpu_out.Resize(out->dims());
-      cpu_out.mutable_data<T>(copy_place);
-      auto& dev_ctx = ctx.device_context();
-      std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
-      for (auto* in : ins) {
-        std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
-        framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
-        cpu_ins.emplace_back(std::move(cpu_in));
-      }
-      // TODO(dzhwinter): overlap copy and compute stream
-      // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
-      dev_ctx.Wait();
-
-      for (auto& in : cpu_ins) {
-        auto& cpu_in = *in.get();
-        auto in_stride = framework::stride_numel(cpu_in.dims());
-
-        StridedNumelCopyWithAxis<T>(
-            cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
-            cpu_in.data<T>(), in_stride, in_stride[axis]);
-        output_offset += in_stride[axis];
-      }
-      framework::TensorCopy(cpu_out, place, dev_ctx, out);
-    } else {
+    // Sometimes direct copies will be faster; this may need deeper analysis.
+    if (axis == 0 && ins.size() < 10) {
+      size_t output_offset = 0;
       for (auto* in : ins) {
         auto in_stride = framework::stride_numel(in->dims());
+        auto out_stride = framework::stride_numel(out->dims());
         StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
                                     out->data<T>() + output_offset, out_stride,
                                     in->data<T>(), in_stride, in_stride[axis]);
         output_offset += in_stride[axis];
       }
+    } else {
+      std::vector<framework::Tensor> inputs(ins.size());
+      for (size_t j = 0; j < ins.size(); ++j) {
+        inputs[j] = *ins[j];
+      }
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
+      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
     }
   }
 };
@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
-    size_t input_offset = 0;
-    auto in_stride = framework::stride_numel(in->dims());
 
-    for (auto& out : outs) {
-      out->mutable_data<T>(ctx.GetPlace());
-      auto out_stride = framework::stride_numel(out->dims());
-      StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                  out_stride, in->data<T>() + input_offset,
-                                  in_stride, out_stride[axis]);
-      input_offset += out_stride[axis];
+    // Sometimes direct copies will be faster; this may need deeper analysis.
+    if (axis == 0 && outs.size() < 10) {
+      size_t input_offset = 0;
+      auto in_stride = framework::stride_numel(in->dims());
+
+      for (auto& out : outs) {
+        out->mutable_data<T>(ctx.GetPlace());
+        auto out_stride = framework::stride_numel(out->dims());
+        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
                                    out_stride, in->data<T>() + input_offset,
                                    in_stride, out_stride[axis]);
+        input_offset += out_stride[axis];
+      }
+    } else {
+      std::vector<framework::Tensor> outputs(outs.size());
+      for (size_t j = 0; j < outs.size(); ++j) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs[j] = *outs[j];
+      }
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
+          concat_grad_functor;
+      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
     }
   }
 };
......
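The axis == 0 fast path works because, in row-major layout, concatenation along the leading axis just appends each input's contiguous buffer after the previous one; no per-row strided copies are needed. A standalone sketch of that special case:

#include <cstring>
#include <vector>

// Concatenate row-major float buffers along axis 0 into `out`
// (out must have room for the sum of all input sizes).
void ConcatAxis0(const std::vector<std::vector<float>>& ins, float* out) {
  size_t offset = 0;
  for (const auto& in : ins) {
    std::memcpy(out + offset, in.data(), in.size() * sizeof(float));
    offset += in.size();  // analogous to output_offset += in_stride[axis]
  }
}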
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
    // Get a unique name from the "argument" name of the "Output" variable.
    // This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine);
// save conv_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_conv_pd, conv_pd);
// create convolution op primitive
auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
weights_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
};
template <typename T>
class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
    // Get a unique name from the "argument" name of the "Output" variable.
    // This name will be used as the key when saving info into the device context.
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
// create memory
auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
(void*)output_grad_data);
// Retrieve conv_pd from device context
auto conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
dev_ctx.GetBlob(key_conv_pd));
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
strides, paddings, *conv_pd,
mkldnn_engine);
// create memory
auto diff_weights_memory = mkldnn::memory(
{diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
conv_bwd_weights_pd, src_memory, diff_dst_memory,
diff_weights_memory);
// push primitive and execute it
std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
strides, paddings, *conv_pd, mkldnn_engine);
// create memory
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
private:
mkldnn::convolution_backward_weights::primitive_desc
ConvBwdWeightsPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_weights::primitive_desc(
conv_bwd_weights_desc, engine, conv_pd);
}
mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
engine, conv_pd);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvMKLDNNGradOpKernel<float>);
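The forward and backward MKLDNN kernels above communicate through the device context: the forward pass stores its convolution primitive descriptor under the key "<output name>@conv_pd" via SetBlob, and the backward pass fetches it with GetBlob. A minimal sketch of such a type-erased blob cache (illustrative only; the real store lives in MKLDNNDeviceContext):

#include <memory>
#include <string>
#include <unordered_map>

class BlobCache {
 public:
  void SetBlob(const std::string& key, std::shared_ptr<void> blob) {
    blobs_[key] = std::move(blob);  // overwrite any previous entry
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs_.find(key);
    return it == blobs_.end() ? nullptr : it->second;
  }

 private:
  std::unordered_map<std::string, std::shared_ptr<void>> blobs_;
};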
@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_op.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+  framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  if (platform::CanCUDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kCUDNN;
   }
 #endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
   }
+#endif
 
   std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
-  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+  framework::LibraryType library_{framework::LibraryType::kPlain};
 #ifdef PADDLE_WITH_CUDA
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  if (platform::CanCUDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kCUDNN;
   }
 #endif
-
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
   }
+#endif
 
   std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
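GetExpectedKernelType now encodes a clear priority: cuDNN if it can be used, otherwise MKLDNN, otherwise the plain kernel. Sketched as a standalone function (the two booleans stand in for the CanCUDNNBeUsed/CanMKLDNNBeUsed checks, which also consult the use_cudnn/use_mkldnn attributes):

enum class LibraryType { kPlain, kCUDNN, kMKLDNN };

LibraryType ChooseLibrary(bool can_use_cudnn, bool can_use_mkldnn) {
  LibraryType library = LibraryType::kPlain;
  if (can_use_cudnn) {
    library = LibraryType::kCUDNN;  // highest priority when available
  }
  if (library == LibraryType::kPlain && can_use_mkldnn) {
    library = LibraryType::kMKLDNN;  // CPU-side acceleration fallback
  }
  return library;
}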
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {
static std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
std::vector<framework::DDim> res;
int offset = 0;
for (int len : ranks) {
auto start_it = shape_concat.begin() + offset;
auto end_it = start_it + len;
res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
offset += len;
}
return res;
}
// general infershape for file readers
class CreateFileReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat =
ctx->Attrs().Get<std::vector<int>>("shape_concat");
const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
if (ctx->IsRuntime()) {
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(
lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
}
};
// general infershape for decorated readers
class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
if (ctx->IsRuntime()) {
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
}
};
// general var type inference for file readers
class CreateFileReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string reader_name = op_desc.Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
reader->SetType(framework::proto::VarType::READER);
}
};
// general var type inference for decorated readers
class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
std::string out_reader_name = op_desc.Output("Out")[0];
framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
out_reader->SetType(framework::proto::VarType::READER);
out_reader->SetDataTypes(in_reader->GetDataTypes());
}
};
template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
Attr<float>("max")));
}
};
class CreateRandomDataGeneratorOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddOutput("Out", "(ReaderHolder) The created random reader.");
AddAttr<std::vector<int>>("shape_concat",
"The concat of all data's shapes.");
AddAttr<std::vector<int>>(
"ranks",
"The ranks of each data."
"e.g."
"shape_concat = [2,3,4,5,6]"
"ranks = [3,2]"
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively.");
AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
AddComment(R"DOC(
CreateRandomDataGenerator Operator
This Op creates a random reader.
    The reader generates random data instead of actually reading from files.
    The generated data follow a uniform distribution between 'min' and 'max'.
)DOC");
}
};
class CreateShuffleReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
Attr<int>("buffer_size")));
}
};
class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a shuffle reader.");
AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
AddComment(R"DOC(
CreateShuffleReader Operator
A shuffle reader takes another reader as its 'underlying reader'
and yields the underlying reader's outputs in a shuffled order.
)DOC");
}
};
class CreateBatchReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::BatchReader(underlying_reader.Get(),
Attr<int>("batch_size")));
}
};
class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a batch reader.");
AddOutput("Out", "(ReaderHolder) The created batch reader.");
AddAttr<int>("batch_size",
"How many instances the batch reader yields each time.")
.GreaterThan(0);
AddComment(R"DOC(
CreateBatchReader Operator
A batch reader takes another reader as its 'underlying reader',
gathers the underlying reader's outputs and then yields them in batches.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(create_random_data_generator,
ops::CreateRandomDataGeneratorOp<float>,
ops::CreateFileReaderInferShape,
ops::CreateRandomDataGeneratorOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateFileReaderInferVarType);
REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateShuffleReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateDecoratedReaderInferVarType);
REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateBatchReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateDecoratedReaderInferVarType);
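The shuffle and batch readers are decorators: each wraps an "underlying reader" and re-exposes the same reading interface with transformed output. A minimal sketch of that structure (illustrative; not Paddle's ReaderBase API):

#include <memory>
#include <utility>
#include <vector>

struct Reader {
  virtual ~Reader() = default;
  // Returns false when the underlying source is exhausted.
  virtual bool ReadNext(std::vector<float>* out) = 0;
};

class BatchingReader : public Reader {
 public:
  BatchingReader(std::unique_ptr<Reader> underlying, int batch_size)
      : underlying_(std::move(underlying)), batch_size_(batch_size) {}

  bool ReadNext(std::vector<float>* out) override {
    out->clear();
    std::vector<float> item;
    // Gather up to batch_size_ instances from the wrapped reader.
    for (int i = 0; i < batch_size_ && underlying_->ReadNext(&item); ++i) {
      out->insert(out->end(), item.begin(), item.end());
    }
    return !out->empty();
  }

 private:
  std::unique_ptr<Reader> underlying_;
  int batch_size_;
};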
-grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+if(WITH_DISTRIBUTE)
+  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
+endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by TensorFlow
// (https://github.com/tensorflow/tensorflow/). We borrowed it and
// made some modifications so that gRPC requests can be sent
// without copying the tensor data too many times.
#include "bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
GrpcByteBufferSource::GrpcByteBufferSource() {}
bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
  cur_ = -1;  // size_t wrap-around: the first Next() advances cur_ to 0.
left_ = 0;
ptr_ = nullptr;
byte_count_ = 0;
bool ok = src.Dump(&slices_).ok();
if (!ok) {
slices_.clear();
}
return ok;
}
bool GrpcByteBufferSource::Next(const void** data, int* size) {
  // Use a loop instead of an if in case the buffer contains empty slices.
while (left_ == 0) {
// Advance to next slice.
cur_++;
if (cur_ >= slices_.size()) {
return false;
}
const ::grpc::Slice& s = slices_[cur_];
left_ = s.size();
ptr_ = reinterpret_cast<const char*>(s.begin());
}
*data = ptr_;
*size = left_;
byte_count_ += left_;
ptr_ += left_;
left_ = 0;
return true;
}
void GrpcByteBufferSource::BackUp(int count) {
ptr_ -= count;
left_ += count;
byte_count_ -= count;
}
bool GrpcByteBufferSource::Skip(int count) {
const void* data;
int size;
while (Next(&data, &size)) {
if (size >= count) {
BackUp(size - count);
return true;
}
    // size < count: consume this slice and keep skipping.
    count -= size;
  }
  // Either an error occurred, or count exceeded the remaining bytes.
  return false;
}
google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_;
}
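// Example of the Next()/BackUp() contract above: with a single 10-byte
// slice, Next() yields all 10 bytes at once; a caller that consumed only
// 4 of them calls BackUp(6), after which ByteCount() is 4 and the next
// Next() re-yields the remaining 6 bytes.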
} // namespace detail
} // namespace operators
} // namespace paddle
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by TensorFlow
// (https://github.com/tensorflow/tensorflow/). We borrowed it and
// made some modifications so that gRPC requests can be sent
// without copying the tensor data too many times.
#pragma once
#include <grpc++/grpc++.h>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
namespace paddle {
namespace operators {
namespace detail {
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource
: public ::google::protobuf::io::ZeroCopyInputStream {
public:
GrpcByteBufferSource();
bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times.
bool Next(const void** data, int* size) override;
void BackUp(int count) override;
bool Skip(int count) override;
::google::protobuf::int64 ByteCount() const override;
private:
std::vector<::grpc::Slice> slices_;
size_t cur_; // Current slice index.
int left_; // Number of bytes in slices_[cur_] left to yield.
const char* ptr_; // Address of next byte in slices_[cur_] to yield.
::google::protobuf::int64 byte_count_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by TensorFlow
// (https://github.com/tensorflow/tensorflow/). We borrowed it and
// made some modifications so that gRPC requests can be sent
// without copying the tensor data too many times.
#pragma once
#include <grpc++/grpc++.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace detail {
char* EncodeVarint32(char* dst, uint32_t v) {
  // Operate on characters as unsigned bytes.
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B - 1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
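// Worked example of the base-128 varint layout above (assumes <cassert>
// is available; not part of the original header, shown only to
// illustrate the format):
// 300 = 0b1'0010'1100 -> low 7 bits 0101100 emitted first with the
// continuation bit set (0xAC), then the remaining bits 0000010 (0x02).
inline void VarintEncodingExample() {
  char buf[5];
  char* end = EncodeVarint32(buf, 300);
  assert(end - buf == 2);
  assert(static_cast<unsigned char>(buf[0]) == 0xAC);
  assert(static_cast<unsigned char>(buf[1]) == 0x02);
  assert(VarintLength(300) == 2);
}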
class ProtoEncodeHelper {
public:
ProtoEncodeHelper(char* buf, int max_size)
: base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() {
    // Make sure callers did not write past the promised max_size.
PADDLE_ENFORCE_LE(p_, limit_);
}
const char* data() const { return base_; }
size_t size() const { return p_ - base_; }
void WriteUint64(int tag, uint64_t v) {
Encode32(combine(tag, WIRETYPE_VARINT));
Encode64(v);
}
void WriteBool(int tag, bool v) {
Encode32(combine(tag, WIRETYPE_VARINT));
EncodeBool(v);
}
void WriteString(int tag, const std::string& v) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(v.size());
EncodeBytes(v.data(), v.size());
}
void WriteVarlengthBeginning(int tag, uint32_t len) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(len);
}
void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
private:
// Note: this module's behavior must match the protocol buffer wire encoding
// format.
enum {
WIRETYPE_VARINT = 0,
WIRETYPE_LENGTH_DELIMITED = 2,
};
static uint32_t combine(uint32_t tag, uint32_t type) {
return ((tag << 3) | type);
}
inline void Encode32(uint32_t v) {
if (v < 128) {
// Fast path for single-byte values. Many of the calls will use a
// constant value for v, so the comparison will get optimized away
// when Encode32 is inlined into the caller.
*p_ = v;
p_++;
} else {
p_ = EncodeVarint32(p_, v);
}
}
void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
void EncodeBool(bool v) {
*p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1
p_++;
}
void EncodeBytes(const char* bytes, int N) {
memcpy(p_, bytes, N);
p_ += N;
}
char* base_;
char* p_;
char* limit_; // Just for CHECKs
};
} // namespace detail
} // namespace operators
} // namespace paddle
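Because ProtoEncodeHelper writes raw wire-format bytes, its output can be cross-checked against the generated protobuf class. A hedged sketch, assuming the generated send_recv.pb.h header and its kVarnameFieldNumber constant are in scope:

inline bool ProtoEncodeHelperRoundTrip() {
  char buf[128];
  paddle::operators::detail::ProtoEncodeHelper e(
      buf, static_cast<int>(sizeof(buf)));
  e.WriteString(sendrecv::VariableMessage::kVarnameFieldNumber, "myvar");
  sendrecv::VariableMessage msg;
  // The hand-encoded bytes form a valid serialized message iff the
  // helper matched the wire format.
  return msg.ParseFromArray(e.data(), static_cast<int>(e.size())) &&
         msg.varname() == "myvar";
}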
...@@ -33,10 +33,34 @@ enum VarType { ...@@ -33,10 +33,34 @@ enum VarType {
} }
message VariableMessage { message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
string varname = 1; string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2; VarType type = 2;
bytes serialized = 3; // bool persistable is not needed for sending.
// tensor info:
Type data_type = 3;
repeated int64 dims = 4;
// lod details:
int64 lod_level = 5;
repeated LodData lod = 6;
// tensor data
bytes serialized = 7;
// selected_rows data
bytes rows = 8;
} }
message VoidMessage {} message VoidMessage {}
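For a concrete picture of the new layout, the float LoDTensor serialized in the serde test below (shape [4, 8, 4, 2], LoD {1, 3, 8}) maps to roughly this text-format message (illustrative values):

// varname: "myvar"
// type: LOD_TENSOR
// data_type: FP32
// dims: 4
// dims: 8
// dims: 4
// dims: 2
// lod_level: 1
// lod { lod_data: 1 lod_data: 3 lod_data: 8 }
// serialized: "<raw little-endian float bytes>"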
...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg, ...@@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
} }
} }
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) {
using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
  // When using GPU, the copied CPU buffer needs to be freed
  // when the ByteBuffer is destroyed.
  // TODO(typhoonzero): add unref here; with dependent parallel
  // execution we need to know when to free the tensor.
DestroyCallback destroy_callback = [](void* backing) {};
void* buf = malloc(1024);
void* payload;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
} else if (var->IsType<framework::SelectedRows>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
}
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(tensor.type()));
for (auto& dim : framework::vectorize(tensor.dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
auto lod = tensor.lod(); // std::vector<Vector<size_t>>
if (lod.size() > 0) {
e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
for (auto& each : lod) {
e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
2 + // tag + varintlength of submessage
1 + // kLodDataFieldNumber
each.size());
        // the LoD data is automatically copied back from GPU if needed
for (auto& d : each) {
e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
}
}
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
      // TODO(typhoonzero): the SelectedRows implementation should not use unique_ptr
auto* slr = var->GetMutable<framework::SelectedRows>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(slr->value().type()));
for (auto& dim : framework::vectorize(slr->value().dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
// steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer
slices[0] = ::grpc::Slice(e.size());
memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
slices[1] = ::grpc::Slice(
grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
static_cast<char*>(payload)),
::grpc::Slice::STEAL_REF);
if (framework::ToVarType(var->Type()) ==
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
slices[3] = ::grpc::Slice(
grpc_slice_new_with_user_data(
const_cast<void*>(
reinterpret_cast<const void*>(slr->rows().data())),
rows_memory_size,
[](void* backing) {
// TODO(typhoonzero): add unref here, same as above.
},
const_cast<char*>(
reinterpret_cast<const char*>(slr->rows().data()))),
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
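// The ByteBuffer assembled above is a concatenation of at most four
// slices:
//   slices[0]: hand-encoded metadata (varname, type, data_type, dims,
//              lod) plus the length-delimited header of field 7
//   slices[1]: the raw tensor payload, attached with STEAL_REF so gRPC
//              sends it without another copy
//   slices[2]: (SelectedRows only) the length-delimited header of
//              field 8 (rows)
//   slices[3]: (SelectedRows only) the raw rows buffer, also zero-copy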
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
sendrecv::VariableMessage meta;
GrpcByteBufferSource source;
source.Init(msg);
::google::protobuf::io::CodedInputStream input(&source);
// do zerocopy parsing
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
PADDLE_ENFORCE(input.ConsumedEntireMessage());
// dims is needed by both tensor and selectedrows
std::vector<int> vecdims;
for (auto& d : meta.dims()) {
vecdims.push_back(d);
}
framework::DDim dims = framework::make_ddim(vecdims);
if (meta.type() == sendrecv::LOD_TENSOR) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
framework::LoD lod;
for (int i = 0; i < meta.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
v.push_back(meta.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
} else if (meta.type() == sendrecv::SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
int64_t* rows_data = slr->mutable_rows()->data();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
    // copy rows CPU data; GPU data will be copied lazily
memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
meta.rows().size());
}
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -33,6 +33,8 @@ namespace detail { ...@@ -33,6 +33,8 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
typedef void (*DestroyCallback)(void*);
void SerializeToMessage(const std::string& name, const framework::Variable* var, void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg); sendrecv::VariableMessage* msg);
...@@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var, ...@@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var,
void DeserializeFromMessage(const sendrecv::VariableMessage& msg, void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
framework::Variable* var); framework::Variable* var);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
case sendrecv::VariableMessage::FP32:
return typeid(float); // NOLINT
case sendrecv::VariableMessage::FP64:
return typeid(double); // NOLINT
case sendrecv::VariableMessage::INT32:
return typeid(int); // NOLINT
case sendrecv::VariableMessage::INT64:
return typeid(int64_t); // NOLINT
case sendrecv::VariableMessage::BOOL:
return typeid(bool); // NOLINT
default:
PADDLE_THROW("Not support type %d", type);
}
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
void RunSerdeTestTensor(platform::Place place) {
// serialize var to ByteBuffer
framework::Variable var;
auto* tensor = var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
framework::LoD lod;
lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 0);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
}
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto tensor2 = var2.Get<framework::LoDTensor>();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2.data<float>());
}
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
}
void RunSerdeTestSelectedRows(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer
framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10}));
tensor->mutable_data<float>(place);
int tensor_numel = 2 * 10;
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
}
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10);
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto* slr2 = var2.GetMutable<framework::SelectedRows>();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2->data<float>());
}
const int64_t* rows_data2 = rows2->data();
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
}
EXPECT_EQ(rows_data2[0], 3);
EXPECT_EQ(rows_data2[1], 10);
}
TEST(SelectedRows, CPU) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) {
platform::CUDAPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(Tensor, CPU) {
platform::CPUPlace place;
RunSerdeTestTensor(place);
}
TEST(Tensor, GPU) {
platform::CUDAPlace place;
RunSerdeTestTensor(place);
}
\ No newline at end of file
...@@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { ...@@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
ctx.Input<framework::Tensor>("DetectRes")->type()), ctx.Input<framework::Tensor>("DetectRes")->type()),
ctx.device_context()); platform::CPUPlace());
} }
}; };
...@@ -142,7 +142,15 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -142,7 +142,15 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("MAP", AddOutput("MAP",
"(Tensor) A tensor with shape [1], store the mAP evaluate " "(Tensor) A tensor with shape [1], store the mAP evaluate "
"result of the detection."); "result of the detection.");
AddAttr<int>("class_num",
"(int) "
"The class number.");
AddAttr<int>(
"background_label",
"(int, defalut: 0) "
"The index of background label, the background label will be ignored. "
"If set to -1, then all categories will be considered.")
.SetDefault(0);
AddAttr<float>( AddAttr<float>(
"overlap_threshold", "overlap_threshold",
"(float) " "(float) "
......
...@@ -69,6 +69,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -69,6 +69,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
float overlap_threshold = ctx.Attr<float>("overlap_threshold"); float overlap_threshold = ctx.Attr<float>("overlap_threshold");
float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult"); float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod(); auto label_lod = in_label->lod();
auto detect_lod = in_detect->lod(); auto detect_lod = in_detect->lod();
...@@ -95,17 +96,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -95,17 +96,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
if (in_pos_count != nullptr && state) { if (in_pos_count != nullptr && state) {
GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count, GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
true_pos, false_pos); true_pos, false_pos, class_num);
} }
CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult, CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
overlap_threshold, label_pos_count, true_pos, overlap_threshold, label_pos_count, true_pos,
false_pos); false_pos);
T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos); int background_label = ctx.Attr<int>("background_label");
T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
background_label);
GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count, GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
*out_true_pos, *out_false_pos); *out_true_pos, *out_false_pos, class_num);
T* map_data = out_map->mutable_data<T>(ctx.GetPlace()); T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
map_data[0] = map; map_data[0] = map;
...@@ -141,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -141,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
} }
inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const {
T one = static_cast<T>(1.0);
T zero = static_cast<T>(0.0);
clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero);
clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero);
clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero);
clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero);
}
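  // Example: a predicted box (-0.1, 0.2, 1.3, 0.9) in normalized
  // coordinates is clipped to (0.0, 0.2, 1.0, 0.9) before the Jaccard
  // overlap is computed, so area outside the image never contributes.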
void GetBoxes(const framework::LoDTensor& input_label, void GetBoxes(const framework::LoDTensor& input_label,
const framework::LoDTensor& input_detect, const framework::LoDTensor& input_detect,
std::vector<std::map<int, std::vector<Box>>>& gt_boxes, std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
...@@ -190,24 +202,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -190,24 +202,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
const std::map<int, std::vector<std::pair<T, int>>>& false_pos, const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
framework::Tensor& output_pos_count, framework::Tensor& output_pos_count,
framework::LoDTensor& output_true_pos, framework::LoDTensor& output_true_pos,
framework::LoDTensor& output_false_pos) const { framework::LoDTensor& output_false_pos, const int class_num) const {
int max_class_id = 0;
int true_pos_count = 0; int true_pos_count = 0;
int false_pos_count = 0; int false_pos_count = 0;
for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) { for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
int label = it->first; auto tp = it->second;
if (label > max_class_id) max_class_id = label; true_pos_count += tp.size();
int label_num_pos = it->second; }
if (label_num_pos == 0 || true_pos.find(label) == true_pos.end()) for (auto it = false_pos.begin(); it != false_pos.end(); ++it) {
continue; auto fp = it->second;
auto label_true_pos = true_pos.find(label)->second; false_pos_count += fp.size();
auto label_false_pos = false_pos.find(label)->second;
true_pos_count += label_true_pos.size();
false_pos_count += label_false_pos.size();
} }
int* pos_count_data = output_pos_count.mutable_data<int>( int* pos_count_data = output_pos_count.mutable_data<int>(
framework::make_ddim({max_class_id + 1, 1}), ctx.GetPlace()); framework::make_ddim({class_num, 1}), ctx.GetPlace());
T* true_pos_data = output_true_pos.mutable_data<T>( T* true_pos_data = output_true_pos.mutable_data<T>(
framework::make_ddim({true_pos_count, 2}), ctx.GetPlace()); framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
...@@ -217,7 +225,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -217,7 +225,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
false_pos_count = 0; false_pos_count = 0;
std::vector<size_t> true_pos_starts = {0}; std::vector<size_t> true_pos_starts = {0};
std::vector<size_t> false_pos_starts = {0}; std::vector<size_t> false_pos_starts = {0};
for (int i = 0; i <= max_class_id; ++i) { for (int i = 0; i < class_num; ++i) {
auto it_count = label_pos_count.find(i); auto it_count = label_pos_count.find(i);
pos_count_data[i] = 0; pos_count_data[i] = 0;
if (it_count != label_pos_count.end()) { if (it_count != label_pos_count.end()) {
...@@ -258,17 +266,16 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -258,17 +266,16 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
return; return;
} }
void GetInputPos( void GetInputPos(const framework::Tensor& input_pos_count,
const framework::Tensor& input_pos_count, const framework::LoDTensor& input_true_pos,
const framework::LoDTensor& input_true_pos, const framework::LoDTensor& input_false_pos,
const framework::LoDTensor& input_false_pos, std::map<int, int>& label_pos_count,
std::map<int, int>& label_pos_count, std::map<int, std::vector<std::pair<T, int>>>& true_pos,
std::map<int, std::vector<std::pair<T, int>>>& true_pos, std::map<int, std::vector<std::pair<T, int>>>& false_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos) const { const int class_num) const {
constexpr T kEPS = static_cast<T>(1e-6); constexpr T kEPS = static_cast<T>(1e-6);
int class_number = input_pos_count.dims()[0];
const int* pos_count_data = input_pos_count.data<int>(); const int* pos_count_data = input_pos_count.data<int>();
for (int i = 0; i < class_number; ++i) { for (int i = 0; i < class_num; ++i) {
label_pos_count[i] = pos_count_data[i]; label_pos_count[i] = pos_count_data[i];
} }
...@@ -362,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -362,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
size_t max_idx = 0; size_t max_idx = 0;
auto score = pred_boxes[i].first; auto score = pred_boxes[i].first;
for (size_t j = 0; j < matched_bboxes.size(); ++j) { for (size_t j = 0; j < matched_bboxes.size(); ++j) {
T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]); Box& pred_box = pred_boxes[i].second;
ClipBBox(pred_box, &pred_box);
T overlap = JaccardOverlap(pred_box, matched_bboxes[j]);
if (overlap > max_overlap) { if (overlap > max_overlap) {
max_overlap = overlap; max_overlap = overlap;
max_idx = j; max_idx = j;
...@@ -391,17 +400,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -391,17 +400,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
} }
T CalcMAP( T CalcMAP(APType ap_type, const std::map<int, int>& label_pos_count,
APType ap_type, const std::map<int, int>& label_pos_count, const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
const std::map<int, std::vector<std::pair<T, int>>>& true_pos, const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
const std::map<int, std::vector<std::pair<T, int>>>& false_pos) const { const int background_label) const {
T mAP = 0.0; T mAP = 0.0;
int count = 0; int count = 0;
for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) { for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
int label = it->first; int label = it->first;
int label_num_pos = it->second; int label_num_pos = it->second;
if (label_num_pos == 0 || true_pos.find(label) == true_pos.end()) if (label_num_pos == background_label ||
true_pos.find(label) == true_pos.end()) {
continue; continue;
}
auto label_true_pos = true_pos.find(label)->second; auto label_true_pos = true_pos.find(label)->second;
auto label_false_pos = false_pos.find(label)->second; auto label_false_pos = false_pos.find(label)->second;
// Compute average precision. // Compute average precision.
...@@ -450,7 +461,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -450,7 +461,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
} }
if (count != 0) mAP /= count; if (count != 0) mAP /= count;
return mAP * 100; return mAP;
} }
}; // namespace operators }; // namespace operators
......
...@@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> { ...@@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseDivGradFunctor { struct DivGradDX {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto z_e = framework::EigenVector<T>::Flatten(*z);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
}
}
};
template <typename T>
struct ElementwiseDivBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
}; };
template <typename T> template <typename T>
struct ElementwiseDivBroadCast2GradFunctor { struct DivGradDY {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N, typename Post> return -dout * x / (y * y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> { ...@@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ElementwiseDivBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> { ...@@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseMaxGradFunctor { struct MaxGradDx {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ> return dout * (x > y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e <= y_e).template cast<T>() * dz_e;
}
} }
}; };
template <typename T> template <typename T>
struct ElementwiseMaxBroadCastGradFunctor { struct MaxGradDy {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N> return dout * (x <= y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMaxBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel<T> { ...@@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>(
ElementwiseMaxBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>());
ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel<T> { ...@@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseMinGradFunctor { struct MinGradDx {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ> return dout * (x < y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e >= y_e).template cast<T>() * dz_e;
}
} }
}; };
template <typename T> template <typename T>
struct ElementwiseMinBroadCastGradFunctor { struct MinGradDy {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N> return dout * (x >= y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMinBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel<T> { ...@@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>(
ElementwiseMinBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>());
ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -40,78 +40,13 @@ class ElementwiseMulKernel : public framework::OpKernel<T> { ...@@ -40,78 +40,13 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseMulGradFunctor { struct MulGradDX {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = x_e * dz_e;
}
}
};
template <typename T>
struct ElementwiseMulBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
}; };
template <typename T> template <typename T>
struct ElementwiseMulBroadCast2GradFunctor { struct MulGradDY {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -127,12 +62,9 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> { ...@@ -127,12 +62,9 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
ElementwiseMulBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
ElementwiseMulBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast { ...@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast {
dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
} }
if (dy_ != nullptr) { if (dy_ != nullptr) {
dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
} }
} }
......
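The one-character fix above is easy to verify numerically with the elementwise_mul functors defined earlier:

// x = 2, y = 3, dout = 5:
//   dx = MulGradDX<T>()(x, y, out, dout) = dout * y = 15
//   dy = MulGradDY<T>()(x, y, out, dout) = dout * x = 10
// Before the fix, dy_ was filled with dx_op_'s result (15) instead.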
...@@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> { ...@@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseSubGradFunctor { struct SubGradDX {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) * dz_e;
}
}
}; };
template <typename T> template <typename T>
struct ElementwiseSubBroadCastGradFunctor { struct SubGradDY {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseSubBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> { ...@@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ElementwiseSubBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
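After this refactor, adding a backward pass for a new elementwise op only requires the two scalar derivatives; ElemwiseGradCompute handles broadcasting and reduction once, centrally. A hypothetical example in the same style, for a squared-difference op out = (x - y)^2 (illustrative, not existing Paddle code):

template <typename T>
struct SquaredDiffGradDX {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return dout * static_cast<T>(2) * (x - y);  // d(out)/dx = 2(x - y)
  }
};

template <typename T>
struct SquaredDiffGradDY {
  HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
    return -dout * static_cast<T>(2) * (x - y);  // d(out)/dy = -2(x - y)
  }
};

// The backward kernel body then collapses to a single call:
// ElemwiseGradCompute<DeviceContext, T, SquaredDiffGradDX<T>,
//                     SquaredDiffGradDY<T>>(
//     ctx, *x, *y, *out, *dout, axis, dx, dy,
//     SquaredDiffGradDX<T>(), SquaredDiffGradDY<T>());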
add_subdirectory(detail) add_subdirectory(detail)
if(WITH_GPU) function(math_library TARGET)
    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) # math_library is a function to create a math library.
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) # The interface is the same as cc_library.
    nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) # But it handles splitting GPU/CPU code and links some common libraries.
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) set(cc_srcs)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) set(cu_srcs)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) set(math_common_deps device_context framework_proto)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) set(multiValueArgs DEPS)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) "${multiValueArgs}" ${ARGN})
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function) list(APPEND cc_srcs ${TARGET}.cc)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) endif()
nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) list(APPEND cu_srcs ${TARGET}.cu)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) endif()
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) list(LENGTH cc_srcs cc_srcs_len)
nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) if (WITH_GPU)
else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) elseif(${cc_srcs_len} GREATER 0)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
cc_library(softmax SRCS softmax.cc DEPS device_context) endif()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) endfunction()
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) # please add new math_library in alphabetical order
math_library(concat)
math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col)
math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS cblas)
math_library(maxouting)
math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows)
math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax)
math_library(unpooling)
math_library(vol2col)
cc_test(math_function_test SRCS math_function_test.cc)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
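# For illustration: assuming WITH_GPU=ON and that both concat.cc and concat.cu
# exist, math_library(concat) above expands to roughly
#     nv_library(concat SRCS concat.cc concat.cu DEPS device_context framework_proto)
# while a CPU-only build falls back to
#     cc_library(concat SRCS concat.cc DEPS device_context framework_proto)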
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
namespace paddle {
namespace operators {
namespace math {
/*
 * All input tensors must have the same rank, and their dimensions must
 * match except along the concatenation axis.
 */
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < out_rows; ++k) {
T* dst_ptr = output->data<T>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
        const T* src_ptr = input[j].data<T>() + k * col_len;
        memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_ptr,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
/*
 * All output tensors must have the same rank, and their dimensions must
 * match except along the concatenation axis.
 */
template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_rows = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = outputs[i].numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = output_cols[j];
T* dst_ptr = outputs[j].data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
template class ConcatFunctor<platform::CPUDeviceContext, int>;
template class ConcatFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatFunctor<platform::CPUDeviceContext, float>;
template class ConcatFunctor<platform::CPUDeviceContext, double>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CPUDeviceContext, float>;
template class ConcatGradFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
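The CPU functor above flattens every input into `rows` row blocks (the product of the dimensions before `axis`) and issues one memcpy per input per output row. A minimal standalone sketch of the same indexing in plain C++ (the `concat_axis1` helper is illustrative, not a Paddle API):

#include <cstring>
#include <iostream>
#include <vector>

// Concatenate row-major (rows x cols[j]) matrices along axis 1, mirroring
// the row-slice copy loop of ConcatFunctor<CPUDeviceContext, T> above.
std::vector<int> concat_axis1(const std::vector<std::vector<int>>& inputs,
                              const std::vector<int>& cols, int rows) {
  int out_cols = 0;
  for (int c : cols) out_cols += c;
  std::vector<int> out(rows * out_cols);
  for (int k = 0; k < rows; ++k) {  // one pass per output row
    int col_idx = 0;
    for (size_t j = 0; j < inputs.size(); ++j) {
      std::memcpy(out.data() + k * out_cols + col_idx,
                  inputs[j].data() + k * cols[j], sizeof(int) * cols[j]);
      col_idx += cols[j];
    }
  }
  return out;
}

int main() {
  // A 2x2 input and a 2x1 input concatenated along axis 1 give a 2x3 output.
  std::vector<std::vector<int>> inputs = {{1, 2, 3, 4}, {5, 6}};
  std::vector<int> out = concat_axis1(inputs, {2, 1}, 2);
  for (int v : out) std::cout << v << ' ';  // prints: 1 2 5 3 4 6
  std::cout << '\n';
  return 0;
}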
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
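// Device-side binary search with std::upper_bound semantics: returns the
// offset of the first element in [first, first + count) that is greater than
// val. The kernels below use it to map a global output column back to the
// source tensor that owns it.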
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * segment_width + local_col];
}
}
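// Special case for equally sized inputs: when every input has the same column
// width, the owning tensor is found with one multiply by the precomputed
// reciprocal of input_col instead of a binary search over input_cols.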
template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset];
}
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int* output_cols,
int col_size, T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
int curr_offset = output_cols[segment];
int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x];
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int output_cols,
T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] =
input[tid_y * input_col + tid_x];
}
}
/*
 * All input tensors must have the same rank, and their dimensions must
 * match except along the concatenation axis.
 */
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int cols = input[0].numel() / rows;
int out_rows = rows, out_cols = 0;
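    // The input pointers are staged in a framework::Vector so that one copy
    // moves them all to the device; the buffer is sized in int16_t elements
    // (2 bytes each), so num * sizeof(T*) / 2 of them hold exactly num T*.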
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
bool sameShape = true;
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
if (sameShape) {
if (t_cols != cols) sameShape = false;
}
out_cols += t_cols;
inputs_cols[i + 1] = out_cols;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
}
T** ins_gpu =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation
// set the thread block and grid according to CurrentDeviceId
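    // Each block holds kThreadsPerBlock threads laid out as block_cols x
    // block_rows; block_cols is rounded up to a multiple of 32 (a warp) for
    // narrow outputs, and the grid is capped by the number of threads the
    // device can keep resident.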
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>());
} else {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
out_cols, output->data<T>());
}
}
};
/*
 * All output tensors must have the same rank, and their dimensions must
 * match except along the concatenation axis.
 */
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_row = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i];
}
int output_col_0 = outputs[0].numel() / input_row;
int input_col = 0;
bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) {
int t_col = outputs[i].numel() / input_row;
if (sameShape) {
if (t_col != output_col_0) sameShape = false;
}
input_col += t_col;
outputs_cols[i + 1] = input_col;
outputs_ptr[i] = outputs[i].data<T>();
}
T** outs_gpu =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
} else {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu,
static_cast<int>(outputs_cols.size()), outs_gpu);
}
}
};
template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatFunctor {
public:
void operator()(const DeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>& outputs);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
using namespace paddle::framework;
using namespace paddle::platform;
template <typename DeviceContext, typename Place>
void testConcat() {
Tensor input_a_cpu;
Tensor input_b_cpu;
Tensor out_cpu;
Tensor input_a;
Tensor input_b;
Tensor out;
DeviceContext* context = new DeviceContext(Place());
/**
 * case 1:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [3, 3, 4]
* output:
* out.shape: [5, 3, 4]
*/
auto dim_a = make_ddim({2, 3, 4});
auto dim_b = make_ddim({3, 3, 4});
auto dim_out = make_ddim({5, 3, 4});
input_a.mutable_data<int>(dim_a, Place());
input_b.mutable_data<int>(dim_b, Place());
out.mutable_data<int>(dim_out, Place());
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
out_cpu.mutable_data<int>(dim_out, CPUPlace());
}
int* a_ptr;
int* b_ptr;
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 3 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
std::vector<Tensor> input;
input.push_back(input_a);
input.push_back(input_b);
paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
concat_functor(*context, input, 0, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
int cols = 2 * 3 * 4;
int idx_a = 0, idx_b = 0;
for (int j = 0; j < 5 * 3 * 4; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
++idx_a;
}
}
/**
 * case 2:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 4, 4]
* output:
* out.shape: [2, 7, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 4, 4});
dim_out = make_ddim({2, 7, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 4 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
cols = 3 * 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 28; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
 * case 3:
 * inputs:
 * t_a.shape: [2, 3, 4]
 * t_b.shape: [2, 3, 5]
* output:
* out.shape: [2, 3, 9]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 5});
dim_out = make_ddim({2, 3, 9});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 5; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 2, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 9; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
 * case 4:
* inputs:
* axis = 1
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 3, 4]
* output:
* out.shape: [2, 6, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 4});
dim_out = make_ddim({2, 6, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 12;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 24; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
  delete context;
}
TEST(math, concat) {
testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
testConcat<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif
}
@@ -15,11 +15,23 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace math {

using float16 = paddle::platform::float16;

template <>
void gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C) {
  PADDLE_THROW("float16 GEMM not supported on CPU");
}

template <>
void gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
@@ -46,6 +58,15 @@ void gemm<platform::CPUDeviceContext, double>(
      beta, C, ldc);
}

template <>
void gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const bool transA,
    const bool transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const int lda, const float16* B,
    const int ldb, const float16 beta, float16* C, const int ldc) {
  PADDLE_THROW("float16 GEMM not supported on CPU");
}

template <>
void gemm<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context, const bool transA,
@@ -68,6 +89,15 @@ void gemm<platform::CPUDeviceContext, double>(
      lda, B, ldb, beta, C, ldc);
}

template <>
void matmul<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
    framework::Tensor* matrix_out, float16 beta) {
  PADDLE_THROW("float16 matmul not supported on CPU");
}

template <>
void matmul<platform::CPUDeviceContext, float>(
    const platform::CPUDeviceContext& context,
@@ -126,6 +156,15 @@ void matmul<platform::CPUDeviceContext, double>(
      matrix_b.data<double>(), beta, matrix_out->data<double>());
}

template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C, const int batchCount, const int strideA, const int strideB) {
  PADDLE_THROW("float16 batched_gemm not supported on CPU");
}

#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <>
@@ -245,11 +284,13 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;

#define DEFINE_CPU_TRANS(RANK)                                             \
  template struct Transpose<platform::CPUDeviceContext, platform::float16, \
                            RANK>;                                         \
  template struct Transpose<platform::CPUDeviceContext, float, RANK>;      \
  template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
  template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;

DEFINE_CPU_TRANS(1);
...
@@ -16,11 +16,40 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace math {

using float16 = paddle::platform::float16;

template <>
void gemm<platform::CUDADeviceContext, float16>(
    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

  const half h_alpha = static_cast<const half>(alpha);
  const half h_beta = static_cast<const half>(beta);
  const half* h_A = reinterpret_cast<const half*>(A);
  const half* h_B = reinterpret_cast<const half*>(B);
  half* h_C = reinterpret_cast<half*>(C);

  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
      h_A, lda, &h_beta, h_C, N));
}

template <>
void gemm<platform::CUDADeviceContext, float>(
    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
@@ -60,6 +89,28 @@ void gemm<platform::CUDADeviceContext, double>(
      lda, &beta, C, N));
}

template <>
void gemm<platform::CUDADeviceContext, float16>(
    const platform::CUDADeviceContext& context, const bool transA,
    const bool transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const int lda, const float16* B,
    const int ldb, const float16 beta, float16* C, const int ldc) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;

  const half h_alpha = static_cast<const half>(alpha);
  const half h_beta = static_cast<const half>(beta);
  const half* h_A = reinterpret_cast<const half*>(A);
  const half* h_B = reinterpret_cast<const half*>(B);
  half* h_C = reinterpret_cast<half*>(C);

  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
      h_A, lda, &h_beta, h_C, ldc));
}

template <>
void gemm<platform::CUDADeviceContext, float>(
    const platform::CUDADeviceContext& context, const bool transA,
@@ -90,6 +141,35 @@ void gemm<platform::CUDADeviceContext, double>(
      lda, &beta, C, ldc));
}

template <>
void matmul<platform::CUDADeviceContext, float16>(
    const platform::CUDADeviceContext& context,
    const framework::Tensor& matrix_a, bool trans_a,
    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
    framework::Tensor* matrix_out, float16 beta) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
                 "The input and output of matmul must be matrices");
  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
                     platform::is_gpu_place(matrix_b.place()) &&
                     platform::is_gpu_place(matrix_out->place()),
                 "Matrices must all be in CUDAPlace");

  int M = dim_out[0];
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];

  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

  gemm<platform::CUDADeviceContext, float16>(
      context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
      matrix_b.data<float16>(), beta, matrix_out->data<float16>());
}

template <>
void matmul<platform::CUDADeviceContext, float>(
    const platform::CUDADeviceContext& context,
@@ -148,6 +228,34 @@ void matmul<platform::CUDADeviceContext, double>(
      matrix_b.data<double>(), beta, matrix_out->data<double>());
}

template <>
void batched_gemm<platform::CUDADeviceContext, float16>(
    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
    const float16 alpha, const float16* A, const float16* B, const float16 beta,
    float16* C, const int batchCount, const int strideA, const int strideB) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const int strideC = M * N;

  const half h_alpha = static_cast<const half>(alpha);
  const half h_beta = static_cast<const half>(beta);
  const half* h_A = reinterpret_cast<const half*>(A);
  const half* h_B = reinterpret_cast<const half*>(B);
  half* h_C = reinterpret_cast<half*>(C);

  PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
      strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
}

template <>
void batched_gemm<platform::CUDADeviceContext, float>(
    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
...
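The "fortran order" comments above all lean on one identity: a row-major M x N buffer, read as column-major, is the N x M transpose. To produce row-major C = op(A) * op(B), the code therefore asks cuBLAS for C^T = op(B)^T * op(A)^T, passing B before A and swapping M and N, and the result buffer comes back row-major for free. A minimal CPU sketch of this trick (plain C++, no cuBLAS; gemm_col_major is an illustrative reference, not a Paddle API):

#include <cassert>

// Column-major reference GEMM: C(m, n) = sum_k A(m, k) * B(k, n), where a
// rows x cols matrix stores element (r, c) at index c * rows + r.
void gemm_col_major(int M, int N, int K, const float* A, const float* B,
                    float* C) {
  for (int n = 0; n < N; ++n)
    for (int m = 0; m < M; ++m) {
      float acc = 0;
      for (int k = 0; k < K; ++k) acc += A[k * M + m] * B[n * K + k];
      C[n * M + m] = acc;
    }
}

int main() {
  // Row-major A (2x2) and B (2x3).
  const float A[4] = {1, 2, 3, 4};
  const float B[6] = {1, 0, 2, 0, 1, 3};
  float C[6];  // will hold row-major A * B (2x3)
  // A row-major buffer read as column-major is the transpose, so computing
  // C^T = B^T * A^T with swapped operands and swapped M/N (exactly what the
  // cublas calls above do) leaves C row-major.
  gemm_col_major(/*M=*/3, /*N=*/2, /*K=*/2, B, A, C);
  const float expected[6] = {1, 2, 8, 3, 4, 18};  // A * B, row-major
  for (int i = 0; i < 6; ++i) assert(C[i] == expected[i]);
  return 0;
}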
@@ -14,30 +14,41 @@
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h"

void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
                    const std::vector<float>& data) {
  PADDLE_ENFORCE_EQ(size, data.size());
  for (size_t i = 0; i < data.size(); ++i) {
    in_ptr[i] = paddle::platform::float16(data[i]);
  }
}

TEST(math_function, notrans_mul_trans_fp32) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr, 6 * sizeof(float));

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input1, gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float>({2, 2}, gpu_place);

  paddle::operators::math::matmul<CUDADeviceContext, float>(
      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);

  TensorCopy(out_gpu, cpu_place, context, &out);

  float* out_ptr = out.data<float>();
  context.Wait();
@@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) {
  EXPECT_EQ(out_ptr[1], 14);
  EXPECT_EQ(out_ptr[2], 14);
  EXPECT_EQ(out_ptr[3], 50);
}

TEST(math_function, notrans_mul_trans_fp16) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input1, gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float16>({2, 2}, gpu_place);

  paddle::operators::math::matmul<CUDADeviceContext, float16>(
      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
      float16(0));

  TensorCopy(out_gpu, cpu_place, context, &out);

  float16* out_ptr = out.data<float16>();
  context.Wait();
  EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
  EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
  EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
  EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
}

TEST(math_function, trans_mul_notrans_fp32) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr, 6 * sizeof(float));

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input1, gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float>({3, 3}, gpu_place);

  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);

  TensorCopy(out_gpu, cpu_place, context, &out);

  float* out_ptr = out.data<float>();
  context.Wait();
@@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) {
  EXPECT_EQ(out_ptr[6], 15);
  EXPECT_EQ(out_ptr[7], 22);
  EXPECT_EQ(out_ptr[8], 29);
}

TEST(math_function, trans_mul_notrans_fp16) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor out_gpu;
  Tensor out;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input1, gpu_place, context, &input2_gpu);

  out_gpu.mutable_data<float16>({3, 3}, gpu_place);

  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
      float16(0));

  TensorCopy(out_gpu, cpu_place, context, &out);

  float16* out_ptr = out.data<float16>();
  context.Wait();
  EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
  EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
  EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
  EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
  EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
  EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
  EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
  EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
  EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
}

TEST(math_function, gemm_notrans_cublas_fp32) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr1[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr1, 6 * sizeof(float));
  float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  memcpy(input2_ptr, arr2, 12 * sizeof(float));
  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, arr3, 8 * sizeof(float));

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input2, gpu_place, context, &input2_gpu);
  TensorCopy(input3, gpu_place, context, &input3_gpu);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(gpu_place);

  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);

  TensorCopy(input3_gpu, cpu_place, context, &input3);

  // numpy code:
  // a = np.arange(6).reshape(2, 3)
@@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) {
  EXPECT_EQ(input3_ptr[5], 73);
  EXPECT_EQ(input3_ptr[6], 86);
  EXPECT_EQ(input3_ptr[7], 99);
}

TEST(math_function, gemm_notrans_cublas_fp16) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
  fill_fp16_data(input2_ptr, input2.numel(),
                 {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input2, gpu_place, context, &input2_gpu);
  TensorCopy(input3, gpu_place, context, &input3_gpu);
  float16* a = input1_gpu.data<float16>();
  float16* b = input2_gpu.data<float16>();
  float16* c = input3_gpu.mutable_data<float16>(gpu_place);

  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
      context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
      c + 1, 4);

  TensorCopy(input3_gpu, cpu_place, context, &input3);

  // numpy code:
  // a = np.arange(6).reshape(2, 3)
  // b = np.arange(12).reshape(3, 4)[:, 1:]
  // c = np.arange(8).reshape(2, 4)[:, 1:]
  // out = np.arange(8).reshape(2, 4)
  // out[:, 1:] = np.dot(a, b) + c
  context.Wait();
  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}

TEST(math_function, gemm_trans_cublas_fp32) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;
  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
  float arr1[6] = {0, 1, 2, 3, 4, 5};
  memcpy(input1_ptr, arr1, 6 * sizeof(float));
  float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
  memcpy(input2_ptr, arr2, 12 * sizeof(float));
  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  memcpy(input3_ptr, arr3, 8 * sizeof(float));

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input2, gpu_place, context, &input2_gpu);
  TensorCopy(input3, gpu_place, context, &input3_gpu);
  float* a = input1_gpu.data<float>();
  float* b = input2_gpu.data<float>();
  float* c = input3_gpu.mutable_data<float>(gpu_place);

  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);

  TensorCopy(input3_gpu, cpu_place, context, &input3);
  context.Wait();

  EXPECT_EQ(input3_ptr[0], 0);
  EXPECT_EQ(input3_ptr[1], 24);
  EXPECT_EQ(input3_ptr[2], 28);
@@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) {
  EXPECT_EQ(input3_ptr[5], 73);
  EXPECT_EQ(input3_ptr[6], 86);
  EXPECT_EQ(input3_ptr[7], 99);
}

TEST(math_function, gemm_trans_cublas_fp16) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor input1;
  Tensor input2;
  Tensor input3;
  Tensor input1_gpu;
  Tensor input2_gpu;
  Tensor input3_gpu;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  int m = 2;
  int n = 3;
  int k = 3;
  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
  fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
  fill_fp16_data(input2_ptr, input2.numel(),
                 {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
  fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});

  TensorCopy(input1, gpu_place, context, &input1_gpu);
  TensorCopy(input2, gpu_place, context, &input2_gpu);
  TensorCopy(input3, gpu_place, context, &input3_gpu);
  float16* a = input1_gpu.data<float16>();
  float16* b = input2_gpu.data<float16>();
  float16* c = input3_gpu.mutable_data<float16>(gpu_place);

  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
      c + 1, 4);

  TensorCopy(input3_gpu, cpu_place, context, &input3);
  context.Wait();

  EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
  EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
  EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
  EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
  EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
  EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
  EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
  EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}

template <typename T>
void GemvTest(int m, int n, bool trans) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  Tensor mat_a;
  Tensor vec_b;
  Tensor vec_c;

  CPUPlace cpu_place;
  CUDAPlace gpu_place(0);
  CUDADeviceContext context(gpu_place);

  T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);

  Tensor g_mat_a;
  Tensor g_vec_b;
  Tensor g_vec_c;
  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);

  for (int i = 0; i < mat_a.numel(); ++i) {
    data_a[i] = static_cast<T>(i);
@@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) {
    data_b[i] = static_cast<T>(i);
  }

  TensorCopy(mat_a, gpu_place, context, &g_mat_a);
  TensorCopy(vec_b, gpu_place, context, &g_vec_b);

  paddle::operators::math::gemv<CUDADeviceContext, T>(
      context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
      g_data_b, 0., g_data_c);

  TensorCopy(g_vec_c, cpu_place, context, &vec_c);

  if (!trans) {
    for (int i = 0; i < m; ++i) {
...
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/sequence2batch.h"

namespace paddle {
namespace operators {
...
@@ -324,7 +324,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
                   " Please note, M is equal to the 1st dimension of BBoxes. ");
  AddAttr<int>(
      "background_label",
      "(int, default: 0) "
      "The index of the background label; the background label will be ignored. "
      "If set to -1, then all categories will be considered.")
      .SetDefault(0);
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
// Get a unique name from the "argument" name of the "Out" variable
// This name will be used as the key when saving info into device context
const std::string key = ctx.op().Output("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
// Only 2D pooling is supported now
PADDLE_ENFORCE(ksize.size() == 2, "ksize must have two elements, i.e. 2D pooling");
PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
"pooling_type must be 'max' or 'avg'");
PADDLE_ENFORCE(input->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine);
// save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd);
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{pool_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding,
const std::vector<int>& kernel, const std::string& pooling_type,
const mkldnn::engine& engine) const {
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero);
auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
return std::unique_ptr<mkldnn::pooling_forward::primitive_desc>(p_pool_pd);
}
std::unique_ptr<mkldnn::memory> CreateWorkspaceMemory(
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd,
const std::string& pooling_type, const mkldnn::engine& engine) const {
mkldnn::memory::primitive_desc workspace_md =
pooling_type == "max"
? pool_pd->workspace_primitive_desc()
: mkldnn::memory::primitive_desc(
{{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
engine);
auto p_workspace_memory = new mkldnn::memory(workspace_md);
return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
}
};
template <typename T>
class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const Tensor* in_x = ctx.Input<Tensor>("X");
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
// Get a unique name from the "argument" name of the "Out" variable
// This name will be used as the key when retrieving info from the device context
const std::string key = ctx.op().Input("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine();
const T* out_grad_data = out_grad->data<T>();
T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> diff_src_tz =
paddle::framework::vectorize2int(in_x_grad->dims());
std::vector<int> diff_dst_tz =
paddle::framework::vectorize2int(out_grad->dims());
auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr,
"Failed to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr,
"Failed to find workspace_memory in device context");
auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
mkldnn::padding_kind::zero);
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{bwd_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
} // Compute()
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNGradOpKernel<float>);
...@@ -13,12 +13,25 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/operators/pool_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding, int stride) { int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
int output_size = (input_size - filter_size + 2 * padding) / stride + 1; bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
PADDLE_ENFORCE(output_size > 0, PADDLE_ENFORCE(output_size > 0,
"Due to the settings of padding(%d), filter_size(%d) and " "Due to the settings of padding(%d), filter_size(%d) and "
"stride(%d), the output size is less than 0, please check " "stride(%d), the output size is less than 0, please check "
...@@ -38,6 +51,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -38,6 +51,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize"); std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor."); "Pooling intput should be 4-D or 5-D tensor.");
...@@ -59,8 +73,8 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -59,8 +73,8 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
output_shape.push_back( output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
PoolOutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); paddings[i], strides[i], ceil_mode));
} }
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
...@@ -68,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -68,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType( framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const { const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); library_ = framework::LibraryType::kCUDNN;
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
} }
#endif #endif
framework::LibraryType library_; #ifdef PADDLE_WITH_MKLDNN
if (use_cudnn) { if (library_ == framework::LibraryType::kPlain &&
library_ = framework::LibraryType::kCUDNN; platform::CanMKLDNNBeUsed(ctx)) {
} else { library_ = framework::LibraryType::kMKLDNN;
library_ = framework::LibraryType::kPlain;
} }
#endif
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
...@@ -99,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { ...@@ -99,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const { const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); library_ = framework::LibraryType::kCUDNN;
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
} }
#endif #endif
framework::LibraryType library_; #ifdef PADDLE_WITH_MKLDNN
if (use_cudnn) { if (library_ == framework::LibraryType::kPlain &&
library_ = framework::LibraryType::kCUDNN; platform::CanMKLDNNBeUsed(ctx)) {
} else { library_ = framework::LibraryType::kMKLDNN;
library_ = framework::LibraryType::kPlain;
} }
#endif
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
...@@ -167,6 +177,15 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) ...@@ -167,6 +177,15 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -187,16 +206,21 @@ Parameters(ksize, strides, paddings) are two elements. ...@@ -187,16 +206,21 @@ Parameters(ksize, strides, paddings) are two elements.
These two elements represent height and width, respectively. These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
Example: Example:
Input: Input:
X shape: $(N, C, H_{in}, W_{in})$ X shape: $(N, C, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, H_{out}, W_{out})$ Out shape: $(N, C, H_{out}, W_{out})$
Where For ceil_mode = false:
$$ $$
H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$ $$
For ceil_mode = true:
$$
H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
)DOC"); )DOC");
} }
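A quick numeric check of the two rules above (editor's sketch, not part of the patch): with H_in = 6, ksize = 3, padding = 0 and stride = 2, floor mode gives (6 - 3)/2 + 1 = 2 while ceil mode gives (6 - 3 + 1)/2 + 1 = 3.

// Editor's sketch: standalone check of the floor/ceil output-size rules
// implemented by PoolOutputSize above.
#include <cassert>

static int PoolOut(int in, int k, int pad, int stride, bool ceil_mode) {
  return ceil_mode ? (in - k + 2 * pad + stride - 1) / stride + 1
                   : (in - k + 2 * pad) / stride + 1;
}

int main() {
  assert(PoolOut(6, 3, 0, 2, /*ceil_mode=*/false) == 2);
  assert(PoolOut(6, 3, 0, 2, /*ceil_mode=*/true) == 3);
  return 0;
}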
...@@ -251,6 +275,15 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) ...@@ -251,6 +275,15 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -267,8 +300,8 @@ The pooling3d operation calculates the output based on ...@@ -267,8 +300,8 @@ The pooling3d operation calculates the output based on
the input, pooling_type, ksize, strides, and paddings parameters. the input, pooling_type, ksize, strides, and paddings parameters.
Input(X) and output(Out) are in NCDHW format, where N is batch Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, and D, H and W are the depth, height and size, C is the number of channels, and D, H and W are the depth, height and
width of the feature, respectively. Parameters(ksize, strides, paddings) width of the feature, respectively. Parameters(ksize, strides, paddings)
are three elements. These three elements represent depth, height and are three elements. These three elements represent depth, height and
width, respectively. The input(X) size and output(Out) size may be different. width, respectively. The input(X) size and output(Out) size may be different.
Example: Example:
...@@ -276,12 +309,18 @@ Example: ...@@ -276,12 +309,18 @@ Example:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$ X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Output: Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
Where For ceil_mode = false:
$$ $$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$ $$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
)DOC"); )DOC");
} }
......
...@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase { ...@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase {
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
framework::ReaderHolder* reader = framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>(); scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
if (!reader->HasNext()) { std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->ReInit(); reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE( PADDLE_ENFORCE(
reader->HasNext(), !ins.empty(),
"Reader can not read the next data even it has been re-initialized."); "Reader can not read the next data even it has been re-initialized.");
} }
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
auto* out = auto* out =
......
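The reordered logic above reads first and re-initializes only when the reader is exhausted. A minimal standalone sketch of that retry pattern (editor's illustration with a hypothetical Reader interface; an empty result signals EOF):

// Editor's sketch of the read-then-reinit retry pattern in ReadOp above,
// using a hypothetical minimal Reader interface.
#include <stdexcept>
#include <vector>

struct Reader {
  virtual void ReadNext(std::vector<int>* out) = 0;  // empty vector == EOF
  virtual void ReInit() = 0;
  virtual ~Reader() = default;
};

std::vector<int> ReadOrRestart(Reader* reader) {
  std::vector<int> ins;
  reader->ReadNext(&ins);
  if (ins.empty()) {  // end of one pass: rewind and try exactly once more
    reader->ReInit();
    reader->ReadNext(&ins);
    if (ins.empty()) throw std::runtime_error("reader empty after re-init");
  }
  return ins;
}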
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class BatchReader : public framework::DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<framework::LoDTensor>> buffer_;
};
class CreateBatchReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
}
};
class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<int>("batch_size",
"How many instances the batch reader yields each time.")
.GreaterThan(0);
AddComment(R"DOC(
CreateBatchReader Operator
A batch reader takes another reader as its 'underlying reader',
gathers the underlying reader's outputs and then yields them in batches.
)DOC");
}
};
void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// if buffer_ is empty, 'out' will be returned as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shapes and check data type
std::type_index batch_type = buffer_[0][j].type();
framework::DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
framework::DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
framework::LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
framework::LoD batch_lod;
for (size_t i = 0; i < buffer_.size(); ++i) {
framework::DDim ins_shape = buffer_[i][j].dims();
framework::LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
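In the LoD merge, each appended instance's offsets are shifted by the current end of the batch, so concatenating level-0 LoDs [0, 2, 5] and [0, 3] should yield [0, 2, 5, 8]. A standalone sketch of that arithmetic (editor's illustration; note the shift base is captured once before appending):

// Editor's sketch: concatenating one LoD level, shifting incoming offsets
// by the batch's current end offset.
#include <cassert>
#include <vector>

void AppendLoDLevel(std::vector<size_t>* batch,
                    const std::vector<size_t>& ins) {
  if (batch->empty()) {
    *batch = ins;
    return;
  }
  const size_t base = batch->back();
  for (size_t k = 1; k < ins.size(); ++k) {  // skip the leading 0
    batch->push_back(ins[k] + base);
  }
}

int main() {
  std::vector<size_t> lod;
  AppendLoDLevel(&lod, {0, 2, 5});
  AppendLoDLevel(&lod, {0, 3});
  assert((lod == std::vector<size_t>{0, 2, 5, 8}));
  return 0;
}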
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
ops::CreateBatchReaderOp,
ops::CreateBatchReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
static constexpr size_t kDoubleBufferSize = 2;
class DoubleBufferReader : public framework::DecoratedReader {
public:
explicit DoubleBufferReader(ReaderBase* reader)
: DecoratedReader(reader),
buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize)) {
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override;
~DoubleBufferReader() { buffer_->Close(); }
private:
void PrefetchThreadFunc();
framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
};
class CreateDoubleBufferReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new DoubleBufferReader(underlying_reader.Get()));
}
};
class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddComment(R"DOC(
CreateDoubleBufferReader Operator
A double buffer reader takes another reader as its 'underlying reader'.
It launches another thread to execute the 'underlying reader' asynchronously,
which prevents the reading process from blocking subsequent training.
)DOC");
}
};
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
buffer_->Receive(out);
}
void DoubleBufferReader::ReInit() {
reader_->ReInit();
buffer_->Close();
// The existing prefetch thread will terminate because buffer_ is closed.
buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize);
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts.";
while (true) {
std::vector<framework::LoDTensor> batch;
reader_->ReadNext(&batch);
if (batch.empty()) {
// EOF
buffer_->Close();
VLOG(5) << "Reached the end of the file. The prefetch thread terminates.";
break;
}
if (!buffer_->Send(&batch)) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread terminates.";
break;
}
}
}
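DoubleBufferReader is a classic bounded producer-consumer: the prefetch thread fills a size-2 channel, ReadNext blocks on Receive, and Close() unblocks both sides. A minimal standalone sketch of such a channel with standard library types (editor's illustration; Paddle's framework::Channel is replaced by a small mutex/condition-variable queue):

// Editor's sketch: a bounded channel, the pattern behind DoubleBufferReader.
// Close() wakes both sides; Receive() drains remaining items after close.
#include <condition_variable>
#include <mutex>
#include <queue>

template <typename T>
class BoundedChannel {
 public:
  explicit BoundedChannel(size_t cap) : cap_(cap) {}

  bool Send(T v) {  // blocks while full; returns false once closed
    std::unique_lock<std::mutex> lk(mu_);
    not_full_.wait(lk, [&] { return closed_ || q_.size() < cap_; });
    if (closed_) return false;
    q_.push(std::move(v));
    not_empty_.notify_one();
    return true;
  }

  bool Receive(T* v) {  // blocks while empty; false when closed and drained
    std::unique_lock<std::mutex> lk(mu_);
    not_empty_.wait(lk, [&] { return closed_ || !q_.empty(); });
    if (q_.empty()) return false;
    *v = std::move(q_.front());
    q_.pop();
    not_full_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lk(mu_);
    closed_ = true;
    not_full_.notify_all();
    not_empty_.notify_all();
  }

 private:
  const size_t cap_;
  bool closed_ = false;
  std::queue<T> q_;
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
};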
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
ops::CreateDoubleBufferReaderOp,
ops::CreateDoubleBufferReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
template <typename T>
class RandomDataGenerator : public framework::FileReader {
public:
RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const framework::DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
framework::LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = framework::product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
Attr<float>("max")));
}
};
class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
public:
CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: FileReaderMakerBase(op_proto, op_checker) {
AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
AddComment(R"DOC(
CreateRandomDataGenerator Operator
This Op creates a random reader.
The reader generates random data instead of actually reading from files.
The generated data follow a uniform distribution between 'min' and 'max'.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
ops::CreateRandomDataGeneratorOp<float>,
ops::CreateRandomDataGeneratorOpMaker);
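Assuming the types above, the generator can also be driven directly; e.g. two outputs of shape [2, 3] and [4, 5] (editor's sketch; error handling omitted):

// Editor's sketch: drive RandomDataGenerator<float> without the operator.
std::vector<paddle::framework::DDim> shapes = {
    paddle::framework::make_ddim(std::vector<int>{2, 3}),
    paddle::framework::make_ddim(std::vector<int>{4, 5})};
paddle::operators::reader::RandomDataGenerator<float> gen(shapes, 0.f, 1.f);

std::vector<paddle::framework::LoDTensor> batch;
gen.ReadNext(&batch);  // batch[0] is 2x3, batch[1] is 4x5, values in [0, 1)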
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class ShuffleReader : public framework::DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<framework::LoDTensor>> buffer_;
size_t iteration_pos_;
};
void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimized.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// if buffer_ is empty, 'out' will be returned as an empty vector.
}
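On the TODO above: std::random_shuffle is deprecated since C++14 (and removed in C++17); the usual replacement is std::shuffle with an explicitly seeded engine (editor's sketch):

// Editor's sketch: the std::shuffle replacement the TODO hints at.
#include <algorithm>
#include <random>
#include <vector>

void ShuffleBuffer(std::vector<std::vector<int>>* buffer) {
  static std::mt19937 engine(std::random_device{}());  // seeded once
  std::shuffle(buffer->begin(), buffer->end(), engine);
}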
class CreateShuffleReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
}
};
class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
AddComment(R"DOC(
CreateShuffleReader Operator
A shuffle reader takes another reader as its 'underlying reader'
and yields the underlying reader's outputs in a shuffled order.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
ops::CreateShuffleReaderOp,
ops::CreateShuffleReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
const std::vector<int>& ranks) {
std::vector<framework::DDim> res;
int offset = 0;
for (int len : ranks) {
auto start_it = shape_concat.begin() + offset;
auto end_it = start_it + len;
res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
offset += len;
}
return res;
}
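For example, shape_concat = [2,3,4,5,6] with ranks = [3,2] is split into the shapes [2,3,4] and [5,6], matching the 'ranks' attribute documentation below (editor's sketch):

// Editor's sketch: exercising RestoreShapes.
std::vector<int> shape_concat = {2, 3, 4, 5, 6};
std::vector<int> ranks = {3, 2};
auto shapes = paddle::operators::reader::RestoreShapes(shape_concat, ranks);
// shapes[0] == make_ddim({2, 3, 4}), shapes[1] == make_ddim({5, 6})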
FileReaderMakerBase::FileReaderMakerBase(
framework::OpProtoAndCheckerMaker::OpProto* op_proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddOutput("Out", "(ReaderHolder) The created random reader.");
AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
AddAttr<std::vector<int>>(
"ranks",
"The ranks of each data."
"e.g."
"shape_concat = [2,3,4,5,6]"
"ranks = [3,2]"
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively.");
AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
}
void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
if (ctx->IsRuntime()) {
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
}
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const {
std::string reader_name = op_desc.Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
reader->SetType(framework::proto::VarType::READER);
}
void DecoratedReaderInferShape::operator()(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
if (ctx->IsRuntime()) {
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
std::string out_reader_name = op_desc.Output("Out")[0];
framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
out_reader->SetType(framework::proto::VarType::READER);
out_reader->SetDataTypes(in_reader->GetDataTypes());
}
DecoratedReaderMakerBase::DecoratedReaderMakerBase(
framework::OpProtoAndCheckerMaker::OpProto* op_proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput("UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a batch reader.");
AddOutput("Out", "(ReaderHolder) The created batch reader.");
}
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {
namespace reader {
extern std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks);
class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
public:
FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};
class FileReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override;
};
class FileReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override;
};
// general infershape for decorated reader
class DecoratedReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override;
};
// general var type inference for decorated reader
class DecoratedReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override;
};
class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
public:
DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};
} // namespace reader
} // namespace operators
} // namespace paddle
#define REGISTER_FILE_READER_OPERATOR(op_name, ...) \
REGISTER_OPERATOR(op_name, __VA_ARGS__, \
paddle::operators::reader::FileReaderInferShape, \
paddle::framework::EmptyGradOpMaker, \
paddle::operators::reader::FileReaderInferVarType)
#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...) \
REGISTER_OPERATOR(op_name, __VA_ARGS__, \
paddle::operators::reader::DecoratedReaderInferShape, \
paddle::framework::EmptyGradOpMaker, \
paddle::operators::reader::DecoratedReaderInferVarType)
...@@ -118,6 +118,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -118,6 +118,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"shape", "(std::vector<int>) Target shape of reshape operator.") "shape", "(std::vector<int>) Target shape of reshape operator.")
.SetDefault(std::vector<int>()); .SetDefault(std::vector<int>());
AddAttr<bool>("inplace",
"Change the source tensor's shape without copy memory.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Reshape Operator. Reshape Operator.
......
...@@ -38,9 +38,15 @@ class ReshapeKernel : public framework::OpKernel<T> { ...@@ -38,9 +38,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
out_dims = out->dims(); out_dims = out->dims();
} }
out->mutable_data<T>(ctx.GetPlace()); bool inplace = ctx.Attr<bool>("inplace");
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); if (!inplace) {
out->Resize(out_dims); out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
} }
private: private:
......
...@@ -24,15 +24,15 @@ limitations under the License. */ ...@@ -24,15 +24,15 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static bool IsVariableInitialized(const framework::Scope& scope, static bool NeedSend(const framework::Scope& scope,
const std::string& varname) { const std::string& varname) {
auto* var = scope.FindVar(varname); auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' on the send side.", PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' on the send side.",
varname); varname);
if (var->IsType<framework::LoDTensor>()) { if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized(); return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) { } else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().value().IsInitialized(); return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Variable type in send side should be in " "Variable type in send side should be in "
...@@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase { ...@@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase {
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>(); detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (IsVariableInitialized(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else { } else {
......
...@@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel {
// and run time. // and run time.
ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
ctx.GetPlace());
}
}; };
class SGDOpMaker : public framework::OpProtoAndCheckerMaker { class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
......
...@@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(param, param_out); PADDLE_ENFORCE_EQ(param, param_out);
auto* grad = ctx.Input<framework::SelectedRows>("Grad"); auto* grad = ctx.Input<framework::SelectedRows>("Grad");
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad->rows().size() == 0) {
return;
}
auto in_height = grad->height(); auto in_height = grad->height();
auto out_dims = param_out->dims(); auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
...@@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>(); auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>(); auto* out_data = param_out->data<T>();
auto* lr = learning_rate->data<T>(); auto* lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) { for (size_t i = 0; i < in_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height,
"Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) { for (int64_t j = 0; j < in_row_numel; j++) {
out_data[in_rows[i] * in_row_numel + j] -= out_data[in_rows[i] * in_row_numel + j] -=
lr[0] * in_data[i * in_row_numel + j]; lr[0] * in_data[i * in_row_numel + j];
} }
} }
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
} }
......
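The sparse branch above updates only the parameter rows named in grad->rows(). A compact standalone model of that update (editor's sketch):

// Editor's sketch: dense-param / sparse-grad SGD step, as in the loop above.
#include <vector>

void SparseSgd(std::vector<float>* param, int row_numel,
               const std::vector<int>& rows, const std::vector<float>& grad,
               float lr) {
  for (size_t i = 0; i < rows.size(); ++i) {  // i-th compressed grad row
    for (int j = 0; j < row_numel; ++j) {
      (*param)[rows[i] * row_numel + j] -= lr * grad[i * row_numel + j];
    }
  }
}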
...@@ -21,15 +21,24 @@ limitations under the License. */ ...@@ -21,15 +21,24 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static int FindOutIdx(int row, const std::vector<int>& height_sections) { static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
int offset = 0; for (size_t i = 1; i < abs_sections.size(); ++i) {
for (size_t i = 0; i < height_sections.size(); ++i) { if (row < abs_sections[i]) {
if (row >= offset && row < (offset + height_sections[i])) { return i - 1;
return i;
} }
offset += height_sections[i];
} }
return -1; return abs_sections.size() - 1;
}
static std::vector<int> ToAbsoluteSection(
const std::vector<int>& height_sections) {
std::vector<int> abs_sections;
abs_sections.resize(height_sections.size());
abs_sections[0] = 0;
for (size_t i = 1; i < height_sections.size(); ++i) {
abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
}
return abs_sections;
} }
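With height_sections = [3, 5], ToAbsoluteSection yields the absolute offsets [0, 3]; FindOutIdx then maps rows 0-2 to output 0 and rows 3-7 to output 1, and the kernel below rebases each row by subtracting abs_sections[i]. A standalone check (editor's sketch):

// Editor's sketch: the section lookup used by SplitSelectedRowsOpKernel.
#include <cassert>
#include <vector>

static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (row < abs_sections[i]) return i - 1;
  }
  return abs_sections.size() - 1;
}

int main() {
  std::vector<int> abs_sections = {0, 3};  // from height_sections {3, 5}
  assert(FindOutIdx(2, abs_sections) == 0);
  assert(FindOutIdx(4, abs_sections) == 1);  // local row: 4 - 3 == 1
  return 0;
}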
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> { ...@@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out"); auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
auto height_sections = ctx.Attr<std::vector<int>>("height_sections"); auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
auto abs_sections = ToAbsoluteSection(height_sections);
auto x_rows = x->rows(); auto x_rows = x->rows();
std::vector<std::vector<int>> outs_rows_idx; std::vector<std::vector<int>> outs_rows_idx;
std::vector<std::vector<int>> outs_dense_idx;
outs_rows_idx.resize(outs.size()); outs_rows_idx.resize(outs.size());
outs_dense_idx.resize(outs.size());
auto row_numel = x->value().numel() / x->value().dims()[0]; auto row_numel = x->value().numel() / x->value().dims()[0];
auto src = x->value().data<T>(); auto src = x->value().data<T>();
// split row indices into output sparse vars
for (size_t i = 0; i < x_rows.size(); ++i) { for (size_t i = 0; i < x_rows.size(); ++i) {
int out_idx = FindOutIdx(x_rows[i], height_sections); int out_idx = FindOutIdx(x_rows[i], abs_sections);
outs_rows_idx[out_idx].push_back(i); outs_rows_idx[out_idx].push_back(x_rows[i]);
outs_dense_idx[out_idx].push_back(i);
} }
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
...@@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> { ...@@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
dims[0] = rows_idx.size(); dims[0] = rows_idx.size();
outs[i]->mutable_value()->mutable_data<T>(dims, x->place()); outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
for (auto idx : rows_idx) { for (auto idx : rows_idx) {
outs[i]->mutable_rows()->push_back(x_rows[idx]); outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
} }
auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace()); auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
for (size_t j = 0; j < rows_idx.size(); j++) { for (size_t j = 0; j < rows_idx.size(); j++) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
memory::Copy(platform::CPUPlace(), dst + j * row_numel, memory::Copy(
platform::CPUPlace(), src + rows_idx[j] * row_numel, platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
sizeof(T) * row_numel); src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
} else { } else {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CUDAPlace(), dst + j * row_numel, memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
platform::CUDAPlace(), src + rows_idx[j] * row_numel, platform::CUDAPlace(),
src + outs_dense_idx[i][j] * row_numel,
sizeof(T) * row_numel, stream); sizeof(T) * row_numel, stream);
#else #else
PADDLE_THROW("Paddle is not compiled with GPU"); PADDLE_THROW("Paddle is not compiled with GPU");
......
...@@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel {
static_cast<framework::proto::VarType::Type>(dtype), static_cast<framework::proto::VarType::Type>(dtype),
ctx.device_context()); ctx.device_context());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) { } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::OpKernelType( for (auto& var : x_vars) {
framework::ToDataType( auto& value = var->Get<framework::SelectedRows>().value();
x_vars[0]->Get<framework::SelectedRows>().value().type()), if (value.IsInitialized()) {
ctx.device_context()); return framework::OpKernelType(framework::ToDataType(value.type()),
ctx.device_context());
}
}
// if input sparse vars are not initialized, use a default kernel type.
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) { } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
for (auto& x_var : x_vars) { for (auto& x_var : x_vars) {
auto& array = x_var->Get<framework::LoDTensorArray>(); auto& array = x_var->Get<framework::LoDTensorArray>();
......
...@@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> {
in_dim[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
...@@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i); auto &sel_row = get_selected_row(i);
if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) { if (sel_row.rows().size() == 0) {
continue; continue;
} }
PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor { ...@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
}; };
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool( ...@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) { for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) { if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i], device_contexts_.emplace(places[i],
new platform::CPUDeviceContext( new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i]))); boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) { } else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i], device_contexts_.emplace(places[i],
...@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
...@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const { ...@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaGetLastError()); PADDLE_ENFORCE(cudaGetLastError());
} }
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
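GetMaxPhysicalThreadCount() is simply the product of the device's SM count and the per-SM resident-thread limit queried at construction. A quick back-of-the-envelope check in Python, using hypothetical device numbers (the real values come from cudaDeviceGetAttribute):

multi_process = 56          # hypothetical SM count for a device
max_threads_per_mp = 2048   # hypothetical max resident threads per SM

# what GetMaxPhysicalThreadCount() would return for such a device
max_physical_threads = multi_process * max_threads_per_mp
print(max_physical_threads)  # 114688 for these example numbers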
...@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; } ...@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), ready_(false) { : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
} }
template <typename T> void MKLDNNDeviceContext::SetBlob(const std::string& name,
void MKLDNNDeviceContext::AddElement(const std::string& op_key, std::shared_ptr<void> data) const {
const T& value) { std::unordered_map<std::string, std::shared_ptr<void>>* p;
if (GetElement<T>(op_key)) { p = p_blobs_.get();
return;
}
GetElementPool<T>().emplace(op_key, std::move(value));
}
template <typename T> auto it = p->find(name);
const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
auto it = GetElementPool<T>().find(op_key);
return it == GetElementPool<T>().end() ? nullptr : it->second;
}
template <> if (it == p->end()) {
const std::unordered_map<const std::string, const MKLDNNMemoryPtr, (*p)[name] = data; // create new blob
std::hash<std::string>>& } else {
MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const { it->second = data; // set data to existing blob
return memory_pool_; }
}
template <> return;
const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
return primitive_pool_;
} }
template <> std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr, const std::string& name) const {
std::hash<std::string>>& std::unordered_map<std::string, std::shared_ptr<void>>* p;
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const { p = p_blobs_.get();
return primitive_desc_pool_;
}
void MKLDNNDeviceContext::Execute(bool block) { auto it = p->find(name);
if (pipeline_.empty()) {
return;
}
ResetStream();
stream_->submit(pipeline_).wait(block);
ready_ = false;
pipeline_.clear();
}
void MKLDNNDeviceContext::ResetStream() { if (it != p->end()) {
if (ready_) { return it->second;
return;
} }
// TODO(TJ): change me when mkldnn have specific method to reset this state
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); return nullptr;
ready_ = true;
} }
#endif #endif
......
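The old templated element pools are replaced by a single name-to-object map with insert-or-overwrite and get-or-null semantics. A minimal Python analogue of the SetBlob/GetBlob contract (a sketch of the caching pattern, not the actual C++ API):

class BlobCache(object):
    def __init__(self):
        self._blobs = {}

    def set_blob(self, name, data):
        # create a new blob or overwrite the existing one
        self._blobs[name] = data

    def get_blob(self, name):
        # return the saved blob, or None if not found
        return self._blobs.get(name)

cache = BlobCache()
cache.set_blob("conv1_weights_md", {"format": "nchw"})
assert cache.get_blob("conv1_weights_md") is not None
assert cache.get_blob("unknown") is None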
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include <mkldnn.hpp>
#endif #endif
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */ /*! \brief Return place in the device context. */
Place GetPlace() const override; Place GetPlace() const override;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
...@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int multi_process;
int max_threads_per_mp;
}; };
template <> template <>
...@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public: public:
explicit MKLDNNDeviceContext(CPUPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Add new element: memory, primitive or primitive desc */
template <typename T>
void AddElement(const std::string& op_key, const T& value);
/* \brief Get existed element: memory, primitive or primitive desc */
template <typename T>
const T& GetElement(const std::string& op_key) const;
/* \brief Get element pool: memory, primitive or primitive desc pool */
template <typename T>
const std::unordered_map<const std::string, const T, std::hash<std::string>>&
GetElementPool() const;
/* \brief Get the active engine */ /* \brief Get the active engine */
const MKLDNNEngine& engine() const { return *engine_; } const mkldnn::engine& GetEngine() const { return engine_; }
/* \brief Submit primitive to pipeline */
void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
/*! \brief Execute all submitted primitives in pipeline */ // Set data to blob (i.e. name/data pair). Create blob if not existing
void Execute(bool block = true); void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
protected: // Find a saved blob. Return nullptr if not found
/*! \brief Reset the stream to prepare next exectue */ std::shared_ptr<void> GetBlob(const std::string& name) const;
void ResetStream();
private: private:
std::unordered_map<const std::string, const MKLDNNMemoryPtr, mkldnn::engine engine_;
std::hash<std::string>> std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
memory_pool_; p_blobs_;
std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>
primitive_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>
primitive_desc_pool_;
std::vector<MKLDNNPrimitive> pipeline_;
MKLDNNStreamPtr stream_;
MKLDNNEnginePtr engine_;
bool ready_;
}; };
#endif #endif
......
...@@ -192,21 +192,36 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -192,21 +192,36 @@ class DeviceTracerImpl : public DeviceTracer {
} }
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) { void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
if (!anno) {
// TODO(panyx0718): Nested annotations are not supported yet.
// An upper-level annotation can be cleared by a lower-level one,
// which is why a nullptr can show up here.
return;
}
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back( cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
} }
void AddMemRecords(const std::string &name, uint64_t start_ns, void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id, uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) { uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for this record.
if (start_ns == 0 || end_ns == 0) {
VLOG(3) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) { uint32_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back( kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id}); KernelRecord{start, end, device_id, stream_id, correlation_id});
...@@ -279,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -279,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc); profile_f.open(profile_path, std::ios::out | std::ios::trunc);
std::string profile_str;
profile_pb.SerializeToString(&profile_str);
profile_f << profile_str; profile_f << profile_str;
profile_f.close(); profile_f.close();
return profile_pb; return profile_pb;
......
...@@ -68,6 +68,8 @@ extern void *cublas_dso_handle; ...@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
__macro(cublasDgemv_v2); \ __macro(cublasDgemv_v2); \
__macro(cublasSgemm_v2); \ __macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -83,6 +85,7 @@ extern void *cublas_dso_handle; ...@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmStridedBatched); \ __macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \ __macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
......
...@@ -20,10 +20,6 @@ limitations under the License. */ ...@@ -20,10 +20,6 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/fluid/platform/hostdevice.h"
#ifdef __GNUC__ #ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
#else #else
...@@ -64,6 +60,18 @@ limitations under the License. */ ...@@ -64,6 +60,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// Forward declare float16 for eigen.h
struct float16;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace platform {
// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
// and aligned at least on a 2-byte boundary, which leads to efficient // and aligned at least on a 2-byte boundary, which leads to efficient
// memory access of float16 struct and also makes float16 compatible // memory access of float16 struct and also makes float16 compatible
...@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { ...@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
} }
#endif #endif
HOSTDEVICE inline bool(isnan)(const float16& a) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hisnan(half(a));
#else
return (a.x & 0x7fff) > 0x7c00;
#endif
}
HOSTDEVICE inline bool(isinf)(const float16& a) {
return (a.x & 0x7fff) == 0x7c00;
}
HOSTDEVICE inline bool(isfinite)(const float16& a) {
return !((isnan)(a)) && !((isinf)(a));
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
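The CPU fallback for these checks relies on the IEEE half-precision bit layout: masking off the sign bit with 0x7fff leaves the exponent and mantissa, a masked value strictly greater than 0x7c00 (all-ones exponent, nonzero mantissa) is NaN, and exactly 0x7c00 is infinity. The same predicates can be sanity-checked against numpy's float16 (an illustrative check, not part of the patch):

import numpy as np

def bits(v):
    # reinterpret a float16 scalar as its raw 16-bit pattern
    return int(np.float16(v).view(np.uint16))

for v in (np.float16(1.0), np.float16('inf'), np.float16('-inf'),
          np.float16('nan')):
    masked = bits(v) & 0x7FFF        # drop the sign bit
    assert (masked > 0x7C00) == bool(np.isnan(v))
    assert (masked == 0x7C00) == bool(np.isinf(v))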
...@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> { ...@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> {
}; };
} // namespace std } // namespace std
namespace Eigen {
namespace numext {
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
const paddle::platform::float16& a) {
return (paddle::platform::isnan)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
const paddle::platform::float16& a) {
return (paddle::platform::isinf)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
const paddle::platform::float16& a) {
return (paddle::platform::isfinite)(a);
}
} // namespace numext
} // namespace Eigen
...@@ -33,6 +33,26 @@ int GetCUDADeviceCount() { ...@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
return count; return count;
} }
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
int count;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcessors");
return count;
}
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
int count;
PADDLE_ENFORCE(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
return count;
}
int GetCurrentDeviceId() { int GetCurrentDeviceId() {
int device_id; int device_id;
PADDLE_ENFORCE( PADDLE_ENFORCE(
......
...@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse = ...@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system. //! Get the total number of GPU devices in system.
int GetCUDADeviceCount(); int GetCUDADeviceCount();
//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetCUDAMaxThreadsPerMultiProcessor(int i);
//! Get the current GPU device id in system. //! Get the current GPU device id in system.
int GetCurrentDeviceId(); int GetCurrentDeviceId();
......
...@@ -16,12 +16,15 @@ limitations under the License. */ ...@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp> #include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
using MKLDNNStream = mkldnn::stream; using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine; using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory; using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive; using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>; using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
...@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr; ...@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr; typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr; typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
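CanMKLDNNBeUsed mirrors the CanCUDNNBeUsed helper added to cudnn_helper.h: a kernel is eligible for a backend only when the op's boolean attribute requests it and the current place supports it. A sketch of that gating logic in Python (hypothetical names, not a real fluid API):

def can_use_backend(attrs, place, flag_name, required_place):
    # the backend is used only if the op asks for it AND the place matches
    return bool(attrs.get(flag_name, False)) and place == required_place

attrs = {'use_mkldnn': True, 'use_cudnn': False}
assert can_use_backend(attrs, 'cpu', 'use_mkldnn', 'cpu')
assert not can_use_backend(attrs, 'gpu', 'use_mkldnn', 'cpu')
assert not can_use_backend(attrs, 'gpu', 'use_cudnn', 'gpu')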
...@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) { ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
syntax = "proto2"; syntax = "proto2";
package paddle.platform.proto; package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; } message MemCopy { optional uint64 bytes = 1; }
message Event { message Event {
optional string name = 1; optional string name = 1;
......
...@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) {
* ... * ...
* PopEvent(evt_name, dev_ctx); * PopEvent(evt_name, dev_ctx);
*/ */
LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) { for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "op_" + std::to_string(i);
...@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) {
* ... * ...
* } * }
*/ */
LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name, dev_ctx);
...@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) { ...@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) {
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
/* Usage 3
* {
* RecordEvent record_event(name1, dev_ctx);
* ...
* code to be analyzed
* ...
* {
* RecordEvent nested_record_event(name2, dev_ctx);
* ...
* code to be analyzed
* ...
* }
* }
*/
LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 100) counter++;
{
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx);
int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++;
}
}
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push", dev_ctx);
......
# internal library.
cc_library(header SRCS header.cc)
cc_test(header_test SRCS header_test.cc DEPS header)
cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
cc_library(recordio DEPS chunk header)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/chunk.h"
#include <memory>
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
#include "snappystream.hpp"
#include "zlib.h"
namespace paddle {
namespace recordio {
constexpr size_t kMaxBufSize = 1024;
template <typename Callback>
static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) {
char buf[kMaxBufSize];
std::streamsize actual_size;
size_t counter = 0;
do {
auto actual_max =
limit > 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize;
actual_size = in.readsome(buf, actual_max);
if (actual_size == 0) {
break;
}
callback(buf, actual_size);
if (limit > 0) {
counter += actual_size;
}
} while (actual_size == kMaxBufSize);
}
static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf(
in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); });
}
static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
auto crc = crc32(0, nullptr, 0);
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = crc32(crc, reinterpret_cast<const Bytef*>(buf), len);
});
return crc;
}
bool Chunk::Write(std::ostream& os, Compressor ct) const {
// NOTE(dzhwinter): do not check num_bytes_ here, because
// empty (zero-length) records are allowed.
if (records_.empty()) {
return false;
}
std::stringstream sout;
std::unique_ptr<std::ostream> compressed_stream;
switch (ct) {
case Compressor::kNoCompress:
break;
case Compressor::kSnappy:
compressed_stream.reset(new snappy::oSnappyStream(sout));
break;
default:
PADDLE_THROW("Not implemented");
}
std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
for (auto& record : records_) {
size_t sz = record.size();
buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(uint32_t))
.write(record.data(), record.size());
}
if (compressed_stream) {
compressed_stream.reset();
}
auto end_pos = sout.tellg();
sout.seekg(0, std::ios::beg);
uint32_t len = static_cast<uint32_t>(end_pos - sout.tellg());
uint32_t crc = Crc32Stream(sout);
sout.seekg(0, std::ios::beg);
Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
hdr.Write(os);
PipeStream(sout, os);
return true;
}
void Chunk::Parse(std::istream& sin) {
Header hdr;
hdr.Parse(sin);
auto beg_pos = sin.tellg();
auto crc = Crc32Stream(sin, hdr.CompressSize());
PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
Clear();
sin.seekg(beg_pos, std::ios::beg);
std::unique_ptr<std::istream> compressed_stream;
switch (hdr.CompressType()) {
case Compressor::kNoCompress:
break;
case Compressor::kSnappy:
compressed_stream.reset(new snappy::iSnappyStream(sin));
break;
default:
PADDLE_THROW("Not implemented");
}
std::istream& stream = compressed_stream ? *compressed_stream : sin;
for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
uint32_t rec_len;
stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
std::string buf;
buf.resize(rec_len);
stream.read(&buf[0], rec_len);
Add(buf);
}
}
} // namespace recordio
} // namespace paddle
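Inside a chunk each record is framed as a 4-byte length followed by the raw payload, and the checksum is a zlib CRC-32 computed incrementally over the (possibly compressed) body. A Python sketch of the same framing and running checksum, assuming a little-endian host since the C++ code writes the length bytes verbatim:

import struct
import zlib

records = [b"12345\x00", b"123\x00"]   # same payloads as chunk_test.cc

# frame records the way Chunk::Write does: uint32 length + payload
body = b"".join(struct.pack("<I", len(r)) + r for r in records)

# Crc32Stream feeds fixed-size buffers into a running CRC; zlib.crc32
# supports the identical incremental pattern.
crc = 0
for i in range(0, len(body), 1024):    # kMaxBufSize-style chunking
    crc = zlib.crc32(body[i:i + 1024], crc) & 0xFFFFFFFF

# Chunk::Parse reverses the framing
offset, parsed = 0, []
while offset < len(body):
    (rec_len,) = struct.unpack_from("<I", body, offset)
    offset += 4
    parsed.append(body[offset:offset + rec_len])
    offset += rec_len
assert parsed == records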
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/recordio/header.h"
namespace paddle {
namespace recordio {
// A Chunk contains the Header and optionally compressed records.
class Chunk {
public:
Chunk() : num_bytes_(0) {}
void Add(std::string buf) {
records_.push_back(buf);
num_bytes_ += buf.size();
}
// Dump the chunk into fo. Note that Write() itself does not clear the
// chunk; call Clear() to make it ready for the next Add invocation.
bool Write(std::ostream& fo, Compressor ct) const;
void Clear() {
records_.clear();
num_bytes_ = 0;
}
void Parse(std::istream& sin);
size_t NumBytes() { return num_bytes_; }
const std::string& Record(int i) const { return records_[i]; }
private:
std::vector<std::string> records_;
// sum of record lengths in bytes.
size_t num_bytes_;
DISABLE_COPY_AND_ASSIGN(Chunk);
};
size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/chunk.h"
#include <sstream>
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Chunk, SaveLoad) {
Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress);
ch.Clear();
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U);
}
TEST(Chunk, Compressor) {
Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kSnappy);
std::stringstream ss2;
ch.Write(ss2, Compressor::kNoCompress);
ASSERT_LE(ss.tellp(), ss2.tellp());  // Compressed output should be no larger.
ch.Clear();
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 18);
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/header.h"
namespace paddle {
namespace recordio {
Header::Header()
: num_records_(0),
checksum_(0),
compressor_(Compressor::kNoCompress),
compress_size_(0) {}
Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
: num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {}
void Header::Parse(std::istream& is) {
is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t));
}
void Header::Write(std::ostream& os) const {
os.write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t));
}
std::ostream& operator<<(std::ostream& os, Header h) {
os << h.NumRecords() << h.Checksum()
<< static_cast<uint32_t>(h.CompressType()) << h.CompressSize();
return os;
}
bool operator==(Header l, Header r) {
return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() &&
l.CompressType() == r.CompressType() &&
l.CompressSize() == r.CompressSize();
}
} // namespace recordio
} // namespace paddle
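Write() and Parse() serialize the four uint32 fields back to back, so every chunk header occupies exactly 16 bytes on disk. A Python struct round-trip of that layout (assuming the little-endian byte order that the raw memory writes produce on common hardware):

import struct

# (num_records, checksum, compressor, compress_size) as 4 x uint32
fields = (2, 0xDEADBEEF, 1, 42)        # compressor 1 == kSnappy
raw = struct.pack("<4I", *fields)
assert len(raw) == 16                  # fixed-size 16-byte header
assert struct.unpack("<4I", raw) == fields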
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sstream>
namespace paddle {
namespace recordio {
// Default ChunkSize
constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024;
// MagicNumber for memory checking
constexpr uint32_t kMagicNumber = 0x01020304;
enum class Compressor : uint32_t {
// NoCompression means writing raw chunk data into files.
// With other choices, chunks are compressed before being written.
kNoCompress = 0,
// Snappy has been the default compression algorithm widely
// used at Google. It strikes a compromise between speed and
// compression ratio.
kSnappy = 1,
// Gzip is a well-known compression algorithm. It is
// recommended only if you are looking for a higher compression ratio.
kGzip = 2,
};
// Header is the metadata of a Chunk.
class Header {
public:
Header();
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
void Write(std::ostream& os) const;
void Parse(std::istream& is);
uint32_t NumRecords() const { return num_records_; }
uint32_t Checksum() const { return checksum_; }
Compressor CompressType() const { return compressor_; }
uint32_t CompressSize() const { return compress_size_; }
private:
uint32_t num_records_;
uint32_t checksum_;
Compressor compressor_;
uint32_t compress_size_;
};
// Allow Header to be logged via operator<<.
std::ostream& operator<<(std::ostream& os, Header h);
bool operator==(Header l, Header r);
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/header.h"
#include <sstream>
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3);
std::stringstream ss;
hdr.Write(ss);
ss.seekg(0, std::ios::beg);
Header hdr2;
hdr2.Parse(ss);
EXPECT_TRUE(hdr == hdr2);
}
...@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() { ...@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Building fluid inference library ... Deploying fluid inference library ...
======================================== ========================================
EOF EOF
make inference_lib_dist make inference_lib_dist
......
...@@ -28,6 +28,7 @@ import nets ...@@ -28,6 +28,7 @@ import nets
import optimizer import optimizer
import backward import backward
import regularizer import regularizer
import average
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace from core import LoDTensor, CPUPlace, CUDAPlace
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
"""
Classes for all kinds of averages.
All averages are computed entirely in Python. They do not change
Paddle's Program, nor do anything to modify the NN model's
configuration; they are simply wrappers around Python functions.
"""
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
var, np.ndarray) and var.shape == (1, ))
def _is_number_or_matrix_(var):
return _is_number_(var) or isinstance(var, np.ndarray)
class WeightedAverage(object):
def __init__(self):
self.reset()
def reset(self):
self.numerator = None
self.denominator = None
def add(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
if self.numerator is None or self.denominator is None:
self.numerator = value * weight
self.denominator = weight
else:
self.numerator += value * weight
self.denominator += weight
def eval(self):
if self.numerator is None or self.denominator is None:
raise ValueError(
"There is no data to be averaged in WeightedAverage.")
return self.numerator / self.denominator
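Typical usage accumulates one (value, weight) pair per mini-batch and reads the running weighted mean with eval(). A short sketch, assuming a paddle.fluid installation:

import paddle.fluid as fluid

avg = fluid.average.WeightedAverage()
avg.add(value=0.50, weight=32)   # e.g. accuracy 0.50 over 32 samples
avg.add(value=0.75, weight=16)   # accuracy 0.75 over 16 samples

# (0.50 * 32 + 0.75 * 16) / (32 + 16) = 0.5833...
print(avg.eval())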
...@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
params_and_grads = [] params_and_grads = []
for param in parameters: for param in parameters:
if param not in grad_info_map: if param not in grad_info_map:
raise ValueError("param %s is not in map" % param) continue
grad_info = grad_info_map[param] grad_info = grad_info_map[param]
grad_block = grad_info[1] grad_block = grad_info[1]
if not grad_block.has_var(grad_info[0]): if not grad_block.has_var(grad_info[0]):
......
...@@ -108,44 +108,6 @@ class Evaluator(object): ...@@ -108,44 +108,6 @@ class Evaluator(object):
return state return state
class Accuracy(Evaluator):
"""
Average Accuracy for multiple mini-batches.
"""
def __init__(self, input, label, k=1, **kwargs):
super(Accuracy, self).__init__("accuracy", **kwargs)
main_program = self.helper.main_program
if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block")
self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
self.correct = self.create_state(
dtype='int64', shape=[1], suffix='correct')
total = self.helper.create_tmp_variable(dtype='int')
correct = self.helper.create_tmp_variable(dtype='int')
acc = layers.accuracy(
input=input, label=label, k=k, total=total, correct=correct)
total = layers.cast(x=total, dtype='int64')
correct = layers.cast(x=correct, dtype='int64')
layers.sums(input=[self.total, total], out=self.total)
layers.sums(input=[self.correct, correct], out=self.correct)
self.metrics.append(acc)
def eval(self, executor, eval_program=None):
if eval_program is None:
eval_program = Program()
block = eval_program.current_block()
with program_guard(main_program=eval_program):
total = _clone_var_(block, self.total)
correct = _clone_var_(block, self.correct)
total = layers.cast(total, dtype='float32')
correct = layers.cast(correct, dtype='float32')
out = layers.elementwise_div(x=correct, y=total)
return np.array(executor.run(eval_program, fetch_list=[out])[0])
class ChunkEvaluator(Evaluator): class ChunkEvaluator(Evaluator):
""" """
Accumulate counter numbers output by chunk_eval from mini-batches and Accumulate counter numbers output by chunk_eval from mini-batches and
...@@ -312,6 +274,10 @@ class DetectionMAP(Evaluator): ...@@ -312,6 +274,10 @@ class DetectionMAP(Evaluator):
bounding box (bbox), which is a LoDTensor [N, 1]. bounding box (bbox), which is a LoDTensor [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax]. LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
class_num (int): The number of classes.
background_label (int): The index of the background label; boxes with
this label will be ignored. If set to -1, all categories will be
considered. 0 by default.
overlap_threshold (float): The threshold for deciding true/false overlap_threshold (float): The threshold for deciding true/false
positive, 0.5 by defalut. positive, 0.5 by defalut.
evaluate_difficult (bool): Whether to consider difficult ground truth evaluate_difficult (bool): Whether to consider difficult ground truth
...@@ -345,6 +311,8 @@ class DetectionMAP(Evaluator): ...@@ -345,6 +311,8 @@ class DetectionMAP(Evaluator):
gt_label, gt_label,
gt_box, gt_box,
gt_difficult, gt_difficult,
class_num,
background_label=0,
overlap_threshold=0.5, overlap_threshold=0.5,
evaluate_difficult=True, evaluate_difficult=True,
ap_version='integral'): ap_version='integral'):
...@@ -358,6 +326,8 @@ class DetectionMAP(Evaluator): ...@@ -358,6 +326,8 @@ class DetectionMAP(Evaluator):
map = layers.detection_map( map = layers.detection_map(
input, input,
label, label,
class_num,
background_label,
overlap_threshold=overlap_threshold, overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult, evaluate_difficult=evaluate_difficult,
ap_version=ap_version) ap_version=ap_version)
...@@ -377,6 +347,8 @@ class DetectionMAP(Evaluator): ...@@ -377,6 +347,8 @@ class DetectionMAP(Evaluator):
accum_map = layers.detection_map( accum_map = layers.detection_map(
input, input,
label, label,
class_num,
background_label,
overlap_threshold=overlap_threshold, overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult, evaluate_difficult=evaluate_difficult,
has_state=self.has_state, has_state=self.has_state,
......
...@@ -102,6 +102,9 @@ def save_vars(executor, ...@@ -102,6 +102,9 @@ def save_vars(executor,
save_var_map = {} save_var_map = {}
for each_var in vars: for each_var in vars:
# NOTE: don't save the variable which type is RAW
if each_var.type == core.VarDesc.VarType.RAW:
continue
new_var = _clone_var_in_block_(save_block, each_var) new_var = _clone_var_in_block_(save_block, each_var)
if filename is None: if filename is None:
save_block.append_op( save_block.append_op(
......
...@@ -28,6 +28,8 @@ import math_op_patch ...@@ -28,6 +28,8 @@ import math_op_patch
from math_op_patch import * from math_op_patch import *
import detection import detection
from detection import * from detection import *
import metric
from metric import *
from learning_rate_scheduler import * from learning_rate_scheduler import *
__all__ = [] __all__ = []
...@@ -39,4 +41,5 @@ __all__ += control_flow.__all__ ...@@ -39,4 +41,5 @@ __all__ += control_flow.__all__
__all__ += ops.__all__ __all__ += ops.__all__
__all__ += device.__all__ __all__ += device.__all__
__all__ += detection.__all__ __all__ += detection.__all__
__all__ += metric.__all__
__all__ += learning_rate_scheduler.__all__ __all__ += learning_rate_scheduler.__all__
...@@ -151,6 +151,8 @@ def detection_output(loc, ...@@ -151,6 +151,8 @@ def detection_output(loc,
@autodoc() @autodoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
class_num,
background_label=0,
overlap_threshold=0.3, overlap_threshold=0.3,
evaluate_difficult=True, evaluate_difficult=True,
has_state=None, has_state=None,
...@@ -192,7 +194,8 @@ def detection_map(detect_res, ...@@ -192,7 +194,8 @@ def detection_map(detect_res,
attrs={ attrs={
'overlap_threshold': overlap_threshold, 'overlap_threshold': overlap_threshold,
'evaluate_difficult': evaluate_difficult, 'evaluate_difficult': evaluate_difficult,
'ap_type': ap_version 'ap_type': ap_version,
'class_num': class_num,
}) })
return map_out return map_out
......
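With the new signature, class_num must be supplied by the caller instead of being inferred from the data. A usage sketch mirroring the updated unit test (assumes a paddle.fluid installation):

import paddle.fluid as fluid
import paddle.fluid.layers as layers

detect_res = layers.data(
    name='detect_res', shape=[10, 6], append_batch_size=False,
    dtype='float32')
label = layers.data(
    name='label', shape=[10, 6], append_batch_size=False, dtype='float32')

# class_num is now required; background_label defaults to 0
map_out = layers.detection_map(detect_res, label, class_num=21)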
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
All layers just related to metric.
"""
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
__all__ = ['accuracy']
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the top-k accuracy of the input with respect to
the label. It performs an internal top_k operation and returns the
resulting accuracy as a float32 tensor.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
return acc_out
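With accuracy moved into layers.metric, callers compute per-batch accuracy directly and do their own accumulation (for example with the WeightedAverage class added in this change). A minimal sketch, assuming a paddle.fluid installation:

import paddle.fluid as fluid

images = fluid.layers.data(name='pixel', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=images, size=10, act='softmax')

# `total` receives the batch size; the layer returns top-1 accuracy
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
    input=predict, label=label, total=batch_size)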
...@@ -35,7 +35,6 @@ __all__ = [ ...@@ -35,7 +35,6 @@ __all__ = [
'cos_sim', 'cos_sim',
'cross_entropy', 'cross_entropy',
'square_error_cost', 'square_error_cost',
'accuracy',
'chunk_eval', 'chunk_eval',
'sequence_conv', 'sequence_conv',
'conv2d', 'conv2d',
...@@ -1022,40 +1021,6 @@ def square_error_cost(input, label): ...@@ -1022,40 +1021,6 @@ def square_error_cost(input, label):
return square_out return square_out
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
return acc_out
def chunk_eval(input, def chunk_eval(input,
label, label,
chunk_scheme, chunk_scheme,
...@@ -1146,6 +1111,7 @@ def conv2d(input, ...@@ -1146,6 +1111,7 @@ def conv2d(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False,
act=None): act=None):
""" """
**Convlution2D Layer** **Convlution2D Layer**
...@@ -1287,7 +1253,8 @@ def conv2d(input, ...@@ -1287,7 +1253,8 @@ def conv2d(input,
'strides': stride, 'strides': stride,
'paddings': padding, 'paddings': padding,
'groups': groups, 'groups': groups,
'use_cudnn': use_cudnn 'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
}) })
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
...@@ -1438,6 +1405,8 @@ def pool2d(input, ...@@ -1438,6 +1405,8 @@ def pool2d(input,
pool_padding=0, pool_padding=0,
global_pooling=False, global_pooling=False,
use_cudnn=True, use_cudnn=True,
ceil_mode=False,
use_mkldnn=False,
name=None): name=None):
""" """
This function adds the operator for pooling in 2 dimensions, using the This function adds the operator for pooling in 2 dimensions, using the
...@@ -1474,7 +1443,9 @@ def pool2d(input, ...@@ -1474,7 +1443,9 @@ def pool2d(input,
"global_pooling": global_pooling, "global_pooling": global_pooling,
"strides": pool_stride, "strides": pool_stride,
"paddings": pool_padding, "paddings": pool_padding,
"use_cudnn": use_cudnn "use_cudnn": use_cudnn,
"ceil_mode": ceil_mode,
"use_mkldnn": use_mkldnn
}) })
return pool_out return pool_out
...@@ -3180,7 +3151,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): ...@@ -3180,7 +3151,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
data = fluid.layers.data(name='data', shape=[128], dtype='float32') data = fluid.layers.data(name='data', shape=[128], dtype='float32')
label = fluid.layers.data(name='label', shape=[100], dtype='int64') label = fluid.layers.data(name='label', shape=[100], dtype='int64')
fc = fluid.layers.fc(input=data, size=100) fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.smooth_l1(logits=fc, label=label) out = fluid.layers.smooth_l1(x=fc, y=label)
""" """
helper = LayerHelper('smooth_l1_loss', **locals()) helper = LayerHelper('smooth_l1_loss', **locals())
diff = helper.create_tmp_variable(dtype=x.dtype) diff = helper.create_tmp_variable(dtype=x.dtype)
......
...@@ -31,6 +31,8 @@ dtype_to_size = { ...@@ -31,6 +31,8 @@ dtype_to_size = {
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"] sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
PRINT_LOG = False
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt): def __init__(self, Program, ops, forward_num, skip_opt):
...@@ -171,12 +173,14 @@ class ControlFlowGraph(object): ...@@ -171,12 +173,14 @@ class ControlFlowGraph(object):
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype] # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
# and dtype_to_size[cache_dtype] # and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype: if x_dtype == cache_dtype:
print(("Hit Cache !!!! cache pool index " if PRINT_LOG:
"is %d, var name is %s, " print(
"cached var name is %s, " ("Hit Cache !!!! cache pool index "
"var shape is %s ") % "is %d, var name is %s, "
(index, x, cache_var, "cached var name is %s, "
str(cache_shape))) "var shape is %s ") %
(index, x, cache_var,
str(cache_shape)))
self.pool.pop(index) self.pool.pop(index)
if x == cache_var: if x == cache_var:
break break
...@@ -277,7 +281,9 @@ def _get_cfgs(input_program): ...@@ -277,7 +281,9 @@ def _get_cfgs(input_program):
return cfgs return cfgs
def memory_optimize(input_program): def memory_optimize(input_program, print_log=False):
global PRINT_LOG
PRINT_LOG = print_log
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize() cfg.memory_optimize()
...@@ -29,21 +29,24 @@ def simple_img_conv_pool(input, ...@@ -29,21 +29,24 @@ def simple_img_conv_pool(input,
act, act,
param_attr=None, param_attr=None,
pool_type='max', pool_type='max',
use_cudnn=True): use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d( conv_out = layers.conv2d(
input=input, input=input,
num_filters=num_filters, num_filters=num_filters,
filter_size=filter_size, filter_size=filter_size,
param_attr=param_attr, param_attr=param_attr,
act=act, act=act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d( pool_out = layers.pool2d(
input=conv_out, input=conv_out,
pool_size=pool_size, pool_size=pool_size,
pool_type=pool_type, pool_type=pool_type,
pool_stride=pool_stride, pool_stride=pool_stride,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out return pool_out
...@@ -58,7 +61,8 @@ def img_conv_group(input, ...@@ -58,7 +61,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0, conv_batchnorm_drop_rate=0.0,
pool_stride=1, pool_stride=1,
pool_type=None, pool_type=None,
use_cudnn=True): use_cudnn=True,
use_mkldnn=False):
""" """
Image Convolution Group, Used for vgg net. Image Convolution Group, Used for vgg net.
""" """
...@@ -90,7 +94,8 @@ def img_conv_group(input, ...@@ -90,7 +94,8 @@ def img_conv_group(input,
padding=conv_padding[i], padding=conv_padding[i],
param_attr=param_attr[i], param_attr=param_attr[i],
act=local_conv_act, act=local_conv_act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]: if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act) tmp = layers.batch_norm(input=tmp, act=conv_act)
...@@ -103,7 +108,8 @@ def img_conv_group(input, ...@@ -103,7 +108,8 @@ def img_conv_group(input,
pool_size=pool_size, pool_size=pool_size,
pool_type=pool_type, pool_type=pool_type,
pool_stride=pool_stride, pool_stride=pool_stride,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out return pool_out
......
...@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost) ...@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program(), print_log=True)
BATCH_SIZE = 200 BATCH_SIZE = 200
......
...@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost) ...@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
opts = optimizer.minimize(avg_cost) opts = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label) batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
...@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) ...@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
i = 0 i = 0
accuracy = fluid.average.WeightedAverage()
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset()
for data in train_reader(): for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(), loss, acc, weight = exe.run(
feed=feeder.feed(data), fluid.default_main_program(),
fetch_list=[avg_cost] + accuracy.metrics) feed=feeder.feed(data),
pass_acc = accuracy.eval(exe) fetch_list=[avg_cost, batch_acc, batch_size])
accuracy.add(value=acc, weight=weight)
pass_acc = accuracy.eval()
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc)) pass_acc))
# this model is slow, so if we can train two mini batch, we think it works properly. # this model is slow, so if we can train two mini batch, we think it works properly.
......
...@@ -15,16 +15,30 @@ ...@@ -15,16 +15,30 @@
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import sys
prog = fluid.framework.Program() startup_prog = fluid.framework.Program()
block = prog.current_block() startup_block = startup_prog.current_block()
random_reader = block.create_var( random_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator") type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
random_reader.desc.set_dtypes( random_reader.desc.set_dtypes(
[fluid.core.VarDesc.VarType.FP32, fluid.core.VarDesc.VarType.FP32]) [fluid.core.VarDesc.VarType.FP32, fluid.core.VarDesc.VarType.FP32])
random_reader.persistable = True
shuffle_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
shuffle_reader.persistable = True
batch_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
batch_reader.persistable = True
double_buffer = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="DoubleBuffer")
double_buffer.persistable = True
main_prog = startup_prog.clone()
main_block = main_prog.current_block()
create_random_data_generator_op = block.append_op( create_random_data_generator_op = startup_block.append_op(
type="create_random_data_generator", type="create_random_data_generator",
outputs={"Out": random_reader}, outputs={"Out": random_reader},
attrs={ attrs={
...@@ -34,37 +48,45 @@ create_random_data_generator_op = block.append_op( ...@@ -34,37 +48,45 @@ create_random_data_generator_op = block.append_op(
"max": 1.0, "max": 1.0,
'lod_levels': [0, 0] 'lod_levels': [0, 0]
}) })
shuffle_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
create_shuffle_reader_op = block.append_op( create_shuffle_reader_op = startup_block.append_op(
type="create_shuffle_reader", type="create_shuffle_reader",
inputs={"UnderlyingReader": random_reader}, inputs={"UnderlyingReader": random_reader},
outputs={"Out": shuffle_reader}, outputs={"Out": shuffle_reader},
attrs={"buffer_size": 7}) attrs={"buffer_size": 7})
batch_reader = block.create_var( create_batch_reader_op = startup_block.append_op(
type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
create_batch_reader_op = block.append_op(
type="create_batch_reader", type="create_batch_reader",
inputs={"UnderlyingReader": shuffle_reader}, inputs={"UnderlyingReader": shuffle_reader},
outputs={"Out": batch_reader}, outputs={"Out": batch_reader},
attrs={"batch_size": 10}) attrs={"batch_size": 10})
out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1") create_double_buffer_reader_op = startup_block.append_op(
out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2") type="create_double_buffer_reader",
inputs={"UnderlyingReader": batch_reader},
outputs={"Out": double_buffer})
out1 = main_block.create_var(
type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
out2 = main_block.create_var(
type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
read_op = block.append_op( main_block.var("DoubleBuffer").desc.set_shapes(double_buffer.desc.shapes())
type="read", inputs={"Reader": batch_reader}, main_block.var("DoubleBuffer").desc.set_dtypes(double_buffer.desc.dtypes())
main_block.var("DoubleBuffer").desc.set_lod_levels(
double_buffer.desc.lod_levels())
read_op = main_block.append_op(
type="read",
inputs={"Reader": double_buffer},
outputs={"Out": [out1, out2]}) outputs={"Out": [out1, out2]})
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
[res1, res2] = exe.run(prog, fetch_list=[out1, out2]) exe.run(startup_prog)
if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
exit(1)
exit(0) for i in range(1, 100):
[res1, res2] = exe.run(main_prog, fetch_list=[out1, out2])
if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
exit(1)
...@@ -158,7 +158,7 @@ class TestDetectionMAP(unittest.TestCase): ...@@ -158,7 +158,7 @@ class TestDetectionMAP(unittest.TestCase):
append_batch_size=False, append_batch_size=False,
dtype='float32') dtype='float32')
map_out = layers.detection_map(detect_res=detect_res, label=label) map_out = layers.detection_map(detect_res, label, 21)
self.assertIsNotNone(map_out) self.assertIsNotNone(map_out)
self.assertEqual(map_out.shape, (1, )) self.assertEqual(map_out.shape, (1, ))
print(str(program)) print(str(program))
......
@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
 class TestConv2dOp(OpTest):
     def setUp(self):
         self.use_cudnn = False
+        self.use_mkldnn = False
         self.init_op_type()
         self.init_group()
         self.init_dilation()
@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
             'paddings': self.pad,
             'groups': self.groups,
             'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
         }
         self.outputs = {'Output': output}
@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
 #    def init_op_type(self):
 #        self.op_type = "conv_cudnn"

+#----------------Conv2dMKLDNN----------------
+class TestMKLDNN(TestConv2dOp):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "conv2d"
+
+
 if __name__ == '__main__':
     unittest.main()
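The MKLDNN test classes only flip the use_mkldnn attribute; the expected output still comes from conv2d_forward_naive. For orientation, the textbook output-size rule such a naive dilated-convolution checker applies (my summary, not code quoted from this file) is:

def conv_out_size(in_size, ksize, stride, pad, dilation=1):
    # The effective kernel extent grows with dilation.
    k_eff = dilation * (ksize - 1) + 1
    return (in_size + 2 * pad - k_eff) // stride + 1

# e.g. a 5x5 input, 3x3 kernel, stride 1, no padding -> 3x3 output
assert conv_out_size(5, 3, 1, 0) == 3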
@@ -22,8 +22,8 @@ from op_test import OpTest
 class TestDetectionMAPOp(OpTest):
     def set_data(self):
+        self.class_num = 4
         self.init_test_case()
         self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)]
         self.label = np.array(self.label).astype('float32')
         self.detect = np.array(self.detect).astype('float32')
@@ -53,7 +53,8 @@ class TestDetectionMAPOp(OpTest):
         self.attrs = {
             'overlap_threshold': self.overlap_threshold,
             'evaluate_difficult': self.evaluate_difficult,
-            'ap_type': self.ap_type
+            'ap_type': self.ap_type,
+            'class_num': self.class_num
         }

         self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
@@ -126,12 +127,7 @@ class TestDetectionMAPOp(OpTest):
             return class_pos_count_dict, true_pos_dict, false_pos_dict

         def get_output_pos(label_count, true_pos, false_pos):
-            max_label = 0
-            for (label, label_pos_num) in label_count.items():
-                if max_label < label:
-                    max_label = label
-            label_number = max_label + 1
+            label_number = self.class_num

             out_class_pos_count = []
             out_true_pos_lod = [0]
@@ -220,11 +216,16 @@ class TestDetectionMAPOp(OpTest):
             mAP += average_precisions
             count += 1

-        self.out_class_pos_count, self.out_true_pos, self.out_true_pos_lod, self.out_false_pos, self.out_false_pos_lod = get_output_pos(
-            label_count, true_pos, false_pos)
+        pcnt, tp, tp_lod, fp, fp_lod = get_output_pos(label_count, true_pos,
+                                                      false_pos)
+        self.out_class_pos_count = pcnt
+        self.out_true_pos = tp
+        self.out_true_pos_lod = tp_lod
+        self.out_false_pos = fp
+        self.out_false_pos_lod = fp_lod
         if count != 0:
             mAP /= count
-        return mAP * 100.0
+        return mAP

     def setUp(self):
         self.op_type = "detection_map"
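Two things change in the reference implementation: class_num becomes an explicit attribute instead of being inferred from the largest label seen, and calc_map now returns the raw mean in [0, 1] rather than a percentage. Schematically, mAP is just the mean of per-class average precisions over classes that have ground truth, mirroring the count != 0 guard above. A minimal sketch (hypothetical helper, not the test's code):

def mean_average_precision(per_class_ap):
    # per_class_ap: {class_id: average_precision}; classes without
    # ground truth are simply absent from the dict.
    if not per_class_ap:
        return 0.0
    return sum(per_class_ap.values()) / len(per_class_ap)

# e.g. AP of 0.8 and 0.6 over two evaluated classes -> mAP 0.7 (not 70.0)
assert abs(mean_average_precision({0: 0.8, 1: 0.6}) - 0.7) < 1e-9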
......
@@ -19,6 +19,7 @@ import unittest
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
+import paddle.fluid.core as core


 def exponential_decay(learning_rate,
@@ -81,6 +82,16 @@ def piecewise_decay(global_step, boundaries, values):
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for place in places:
+            self.check_decay_with_place(place, python_decay_fn, fluid_decay_fn,
+                                        kwargs)
+
+    def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
+                               kwargs):
         decayed_lr = fluid_decay_fn(**kwargs)
         place = fluid.CPUPlace()
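The refactor only changes where the test runs (CPU, plus GPU when available); the Python reference schedules it compares against are untouched. For context, the exponential schedule in this file follows the standard formula lr * decay_rate ** (global_step / decay_steps), with staircase mode flooring the exponent. A sketch consistent with that formula (assumed from the common definition, not copied from the hidden lines):

import math

def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                      staircase=False):
    # staircase mode floors the exponent so the rate drops in discrete steps
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * decay_rate ** exponent

# halves the rate every 1000 steps when staircase=True
assert exponential_decay(0.1, 2500, 1000, 0.5, staircase=True) == 0.1 * 0.5 ** 2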
......
@@ -19,12 +19,21 @@ import paddle.fluid.core as core
 from op_test import OpTest


-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
+             ) / strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in xrange(H_out):
         for j in xrange(W_out):
@@ -38,12 +47,21 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     return out


-def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def avg_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
     N, C, H, W = x.shape
     if global_pool == 1:
         ksize = [H, W]
-    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
+             ) / strides[0] + 1 if ceil_mode else (
+                 H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (
+                 W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     for i in xrange(H_out):
         for j in xrange(W_out):
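ceil_mode only changes the rounding of the output-size formula: floor((in + 2*pad - ksize) / stride) + 1 becomes ceil(...) + 1, which the code above obtains with the usual integer trick of adding stride - 1 before dividing. A minimal sketch of the same computation (hypothetical helper for illustration):

import math

def pool_out_size(in_size, ksize, pad, stride, ceil_mode=False):
    # floor mode: floor((in + 2*pad - k) / stride) + 1
    # ceil mode:  ceil((in + 2*pad - k) / stride) + 1
    span = in_size + 2 * pad - ksize
    if ceil_mode:
        return int(math.ceil(float(span) / stride)) + 1
    return span // stride + 1

assert pool_out_size(6, 3, 0, 2) == 2                  # floor: tail dropped
assert pool_out_size(6, 3, 0, 2, ceil_mode=True) == 3  # ceil: tail window kept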
@@ -61,16 +79,18 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
 class TestPool2d_Op(OpTest):
     def setUp(self):
         self.use_cudnn = False
+        self.use_mkldnn = False
         self.init_test_case()
         self.init_global_pool()
         self.init_op_type()
         self.init_pool_type()
+        self.init_ceil_mode()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype("float32")
         output = self.pool2D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings,
-                                           self.global_pool).astype("float32")
+                                           self.paddings, self.global_pool,
+                                           self.ceil_mode).astype("float32")
         self.inputs = {'X': input}

         self.attrs = {
@@ -80,6 +100,8 @@ class TestPool2d_Op(OpTest):
             'pooling_type': self.pool_type,
             'global_pooling': self.global_pool,
             'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn,
+            'ceil_mode': self.ceil_mode,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
@@ -116,6 +138,9 @@ class TestPool2d_Op(OpTest):
     def init_global_pool(self):
         self.global_pool = True

+    def init_ceil_mode(self):
+        self.ceil_mode = False
+

 class TestCase1(TestPool2d_Op):
     def init_test_case(self):
@@ -217,5 +242,62 @@ class TestCUDNNCase6(TestCase5):
         self.op_type = "pool2d"

+class TestCeilModeCase1(TestCUDNNCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase2(TestCUDNNCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase3(TestCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase4(TestCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+#--------------------test pool2d MKLDNN--------------------
+class TestMKLDNNCase1(TestPool2d_Op):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
+class TestMKLDNNCase2(TestCase1):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
+class TestMKLDNNCase3(TestCase2):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
+class TestMKLDNNCase4(TestCase3):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
+class TestMKLDNNCase5(TestCase4):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
+class TestMKLDNNCase6(TestCase5):
+    def init_op_type(self):
+        self.use_mkldnn = True
+        self.op_type = "pool2d"
+
+
 if __name__ == '__main__':
     unittest.main()
@@ -19,13 +19,24 @@ import paddle.fluid.core as core
 from op_test import OpTest


-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
+             ) / strides[0] + 1 if ceil_mode else (
+                 D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (
+                 H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+             ) / strides[2] + 1 if ceil_mode else (
+                 W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in xrange(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
@@ -42,13 +53,24 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
     return out


-def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+def avg_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=0,
+                             ceil_mode=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
-    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
-    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
-    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
+             ) / strides[0] + 1 if ceil_mode else (
+                 D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
+             ) / strides[1] + 1 if ceil_mode else (
+                 H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
+             ) / strides[2] + 1 if ceil_mode else (
+                 W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in xrange(D_out):
         d_start = np.max((k * strides[0] - paddings[0], 0))
@@ -73,13 +95,14 @@ class TestPool3d_Op(OpTest):
         self.init_global_pool()
         self.init_op_type()
         self.init_pool_type()
+        self.init_ceil_mode()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype("float32")
         output = self.pool3D_forward_naive(input, self.ksize, self.strides,
-                                           self.paddings,
-                                           self.global_pool).astype("float32")
+                                           self.paddings, self.global_pool,
+                                           self.ceil_mode).astype("float32")
         self.inputs = {'X': input}

         self.attrs = {
@@ -89,6 +112,7 @@ class TestPool3d_Op(OpTest):
             'pooling_type': self.pool_type,
             'global_pooling': self.global_pool,
             'use_cudnn': self.use_cudnn,
+            'ceil_mode': self.ceil_mode,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
@@ -125,6 +149,9 @@ class TestPool3d_Op(OpTest):
     def init_global_pool(self):
         self.global_pool = True

+    def init_ceil_mode(self):
+        self.ceil_mode = False
+

 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
@@ -227,5 +254,25 @@ class TestCUDNNCase6(TestCase5):
         self.op_type = "pool3d"

+class TestCeilModeCase1(TestCUDNNCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase2(TestCUDNNCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase3(TestCase1):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
+class TestCeilModeCase4(TestCase2):
+    def init_ceil_mode(self):
+        self.ceil_mode = True
+
+
 if __name__ == '__main__':
     unittest.main()
@@ -37,7 +37,9 @@ class TestProfiler(unittest.TestCase):
         label = fluid.layers.data(name='y', shape=[1], dtype='int64')
         cost = fluid.layers.cross_entropy(input=predict, label=label)
         avg_cost = fluid.layers.mean(cost)
-        accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+        batch_size = fluid.layers.create_tensor(dtype='int64')
+        batch_acc = fluid.layers.accuracy(
+            input=predict, label=label, total=batch_size)

         optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
         opts = optimizer.minimize(avg_cost, startup_program=startup_program)
@@ -46,7 +48,7 @@ class TestProfiler(unittest.TestCase):
         exe = fluid.Executor(place)
         exe.run(startup_program)
-        accuracy.reset(exe)
+        pass_acc_calculator = fluid.average.WeightedAverage()
         with profiler.profiler(state, 'total', profile_path) as prof:
             for iter in range(10):
                 if iter == 2:
@@ -57,9 +59,11 @@ class TestProfiler(unittest.TestCase):
                 outs = exe.run(main_program,
                                feed={'x': x,
                                      'y': y},
-                               fetch_list=[avg_cost] + accuracy.metrics)
+                               fetch_list=[avg_cost, batch_acc, batch_size])
                 acc = np.array(outs[1])
-                pass_acc = accuracy.eval(exe)
+                b_size = np.array(outs[2])
+                pass_acc_calculator.add(value=acc, weight=b_size)
+                pass_acc = pass_acc_calculator.eval()

     def test_cpu_profiler(self):
         self.net_profiler('CPU')
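This replaces the old fluid.evaluator.Accuracy with a per-batch accuracy op plus a host-side weighted average: pass-level accuracy is sum(acc_i * n_i) / sum(n_i) over batches. A minimal sketch of that accumulator (an illustrative stand-in for fluid.average.WeightedAverage, not its source):

class WeightedAverageSketch(object):
    def __init__(self):
        self.numerator = 0.0    # running sum of value * weight
        self.denominator = 0.0  # running sum of weights

    def add(self, value, weight):
        self.numerator += float(value) * float(weight)
        self.denominator += float(weight)

    def eval(self):
        return self.numerator / self.denominator

avg = WeightedAverageSketch()
avg.add(value=0.50, weight=32)  # batch 1: 50% accuracy over 32 samples
avg.add(value=0.75, weight=16)  # batch 2: 75% accuracy over 16 samples
print(avg.eval())               # 0.5833..., the pass-level accuracy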
......
@@ -60,8 +60,8 @@ class TestSpliteSelectedRows(unittest.TestCase):
         # expected output selected rows
         expected_out0_rows = [0, 4]
-        expected_out1_rows = [5, 7]
-        expected_out4_rows = [20]
+        expected_out1_rows = [0, 2]
+        expected_out4_rows = [0]

         op = Operator(
             "split_selected_rows",
@@ -101,7 +101,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
             out0_grad_tensor.set(np_array, place)

             out1_grad = scope.var("out1@GRAD").get_selected_rows()
-            rows1 = [7, 5]
+            rows1 = [2, 0]
             out1_grad.set_rows(rows1)
             out1_grad.set_height(height)
             out1_grad_tensor = out1_grad.get_tensor()
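The new expectations reflect a convention change: split_selected_rows now emits row indices relative to each output shard rather than global indices. Assuming even height sections of 5 (consistent with the expected values, but not stated in the visible hunks), the mapping is local = global - shard_id * shard_height:

def split_rows(global_rows, shard_height, num_shards):
    # Hedged sketch of the re-based index convention the test now expects.
    shards = [[] for _ in range(num_shards)]
    for r in sorted(global_rows):
        shard_id = r // shard_height
        shards[shard_id].append(r - shard_id * shard_height)  # local index
    return shards

# rows [0, 4, 5, 7, 20], shard height 5 -> [[0, 4], [0, 2], [], [], [0]]
print(split_rows([0, 4, 5, 7, 20], 5, 5))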
......
@@ -13,8 +13,10 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
 ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig

+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
 COPY build_scripts /build_scripts
-RUN bash build_scripts/build.sh && rm -r build_scripts
+RUN bash build_scripts/build.sh && \
+    bash build_scripts/install_nccl2.sh && rm -r build_scripts

 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
@@ -34,9 +36,6 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
     tar xzf protobuf-cpp-3.1.0.tar.gz && \
     cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz

-RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
-
 RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt

 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
@@ -47,10 +46,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python

-RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
+RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz

-RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
-    make -j `nproc` install <NCCL_MAKE_OPTS> && cd .. && rm -rf nccl
-
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
#!/bin/bash
DEB="nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb"
DIR="/nccl2"
mkdir -p $DIR
# we cached the nccl2 deb package in BOS, so we can download it with wget
# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down
wget -O $DIR/$DEB \
"http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb?responseContentDisposition=attachment"
cd $DIR && ar x $DEB && tar xf data.tar.xz
DEBS=$(find ./var/ -name "*.deb")
for sub_deb in $DEBS; do
echo $sub_deb
ar x $sub_deb && tar xf data.tar.xz
done
mv -f usr/include/nccl.h /usr/local/include/
mv -f usr/lib/libnccl* /usr/local/lib/
rm -rf $DIR
@@ -159,7 +159,7 @@ if args.timeline_path:
 with open(profile_path, 'r') as f:
     profile_s = f.read()
     profile_pb = profiler_pb2.Profile()
-    text_format.Merge(profile_s, profile_pb)
+    profile_pb.ParseFromString(profile_s)

 tl = Timeline(profile_pb)
 with open(timeline_path, 'w') as f:
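The profile file is now a serialized binary protobuf, so the timeline tool switches from the text-format parser to ParseFromString. Both are standard protobuf APIs; a hedged sketch of the two decoding paths with a generic message (helper name is mine, the real message type here is profiler_pb2.Profile):

from google.protobuf import text_format

def load_profile(path, pb_message, binary=True):
    # Binary wire format must be read as bytes; text protos as str.
    with open(path, 'rb' if binary else 'r') as f:
        data = f.read()
    if binary:
        pb_message.ParseFromString(data)     # wire-format bytes
    else:
        text_format.Merge(data, pb_message)  # human-readable text proto
    return pb_message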
......