diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 53e71998f1517a8f2cfa4509426231fb0f0177e8..786f224608f7d41c438411de0e09fedbcf2264b8 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -138,13 +138,14 @@ def main():
     avg_cost = fluid.layers.mean(x=cost)
 
     # Evaluator
-    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    batch_size = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size)
 
     # inference program
     inference_program = fluid.default_main_program().clone()
     with fluid.program_guard(inference_program):
-        test_target = accuracy.metrics + accuracy.states
-        inference_program = fluid.io.get_inference_program(test_target)
+        inference_program = fluid.io.get_inference_program(batch_acc)
 
     # Optimization
     optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
@@ -157,27 +158,30 @@
 
     # test
     def test(exe):
-        accuracy.reset(exe)
+        test_pass_acc = fluid.average.WeightedAverage()
         for batch_id, data in enumerate(test_reader()):
             img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                     data)).astype("float32")
             y_data = np.array(map(lambda x: x[1], data)).astype("int64")
             y_data = y_data.reshape([-1, 1])
-            exe.run(inference_program,
-                    feed={"pixel": img_data,
-                          "label": y_data})
+            outs = exe.run(inference_program,
+                           feed={"pixel": img_data,
+                                 "label": y_data},
+                           fetch_list=[batch_acc, batch_size])
+            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
 
-        return accuracy.eval(exe)
+        return test_pass_acc.eval()
 
     def train_loop(exe, trainer_prog):
         iters = 0
         ts = time.time()
+        train_pass_acc = fluid.average.WeightedAverage()
         for pass_id in range(args.num_passes):
             # train
             start_time = time.time()
             num_samples = 0
-            accuracy.reset(exe)
+            train_pass_acc.reset()
             with profiler.profiler("CPU", 'total') as prof:
                 for batch_id, data in enumerate(train_reader()):
                     ts = time.time()
@@ -187,13 +191,14 @@
                     y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                     y_data = y_data.reshape([-1, 1])
 
-                    loss, acc = exe.run(
+                    loss, acc, b_size = exe.run(
                         trainer_prog,
                         feed={"pixel": img_data,
                               "label": y_data},
-                        fetch_list=[avg_cost] + accuracy.metrics)
+                        fetch_list=[avg_cost, batch_acc, batch_size])
                     iters += 1
                     num_samples += len(data)
+                    train_pass_acc.add(value=acc, weight=b_size)
                     print(
                         "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
                         % (pass_id, iters, loss, acc,
@@ -201,7 +206,7 @@
                     )  # The accuracy is the accumulation of batches, but not the current batch.
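The change above replaces the stateful `fluid.evaluator.Accuracy` with explicit per-batch fetches: each `exe.run` now returns the batch accuracy together with the batch size, and `fluid.average.WeightedAverage` folds them into a pass-level metric. The bookkeeping amounts to a weighted mean; a minimal sketch, written in C++ for illustration (the struct and its names are hypothetical, not Paddle's):

```cpp
#include <cstdio>

// Hypothetical stand-in for fluid.average.WeightedAverage: accumulate
// per-batch metric values, each weighted by its batch size.
struct WeightedAverage {
  double weighted_sum = 0.0;  // sum of value_i * weight_i
  double total_weight = 0.0;  // sum of weight_i

  void Add(double value, double weight) {
    weighted_sum += value * weight;
    total_weight += weight;
  }
  void Reset() { weighted_sum = total_weight = 0.0; }
  double Eval() const {
    return total_weight > 0.0 ? weighted_sum / total_weight : 0.0;
  }
};

int main() {
  WeightedAverage pass_acc;
  pass_acc.Add(0.50, 100);  // batch of 100 samples, accuracy 0.50
  pass_acc.Add(1.00, 50);   // batch of 50 samples, accuracy 1.00
  // Weighted mean: (0.50*100 + 1.00*50) / 150 = 2/3, not (0.50 + 1.00) / 2.
  std::printf("pass accuracy = %f\n", pass_acc.Eval());
  return 0;
}
```

Weighting by batch size keeps a short final batch from skewing the pass metric, which is why `batch_size` is fetched alongside `batch_acc`.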
pass_elapsed = time.time() - start_time - pass_train_acc = accuracy.eval(exe) + pass_train_acc = train_pass_acc.eval() pass_test_acc = test(exe) print( "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n" diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index e2b7ef8d54748de46370e808e28f9b9fcc988a84..8af2765f58717408e3a1ef6b500bb01511bfd8d3 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND}) INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig UPDATE_COMMAND "" CONFIGURE_COMMAND "" ) @@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND}) \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" )" ) - INSTALL(CODE "execute_process( - COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake - ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig - )" - ) ENDIF() ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 2c109727cfa641b16ba01d6392613fa497b95902..71f54c425d4c38e271a8f1b78887d95a27252443 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -39,6 +39,7 @@ ExternalProject_Add( -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 356da582d1f8b6a8858af90ccdf5af2100e5db87..d0b5eaec2e2a50acf17e5dd1d1aeb0ec3e614fbf 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -186,7 +186,9 @@ function(cc_library TARGET_NAME) add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() + if(cc_library_DEPS) # Don't need link libwarpctc.so if("${cc_library_DEPS};" MATCHES "warpctc;") @@ -263,7 +265,8 @@ function(nv_library TARGET_NAME) if (nv_library_SHARED OR nv_library_shared) # build *.so cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4471df36b0717171da4dff92ca0ec98b4f981028..6b2237b858380f384be0aa3c6ae24a4c83ad646d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,9 +1,22 @@ +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + # make package for paddle fluid shared and static library function(copy 
TARGET)
   set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DSTS DEPS)
   cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
 
   list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
   list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
@@ -42,13 +55,21 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
 
-IF(NOT PROTOBUF_FOUND)
+if(NOT PROTOBUF_FOUND)
   set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
   copy(protobuf_lib
-    SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
+    SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
     DSTS ${dst_dir} ${dst_dir}/lib
   )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
+
+if(NOT CBLAS_FOUND)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+  copy(openblas_lib
+    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+    DSTS ${dst_dir} ${dst_dir}
+  )
+endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -66,8 +87,8 @@ copy(memory_lib
 )
 
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
@@ -83,6 +104,4 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
-add_custom_target(inference_lib_dist DEPENDS
-    inference_lib framework_lib memory_lib platform_lib string_lib
-    gflags_lib glog_lib protobuf_lib eigen3_lib)
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d9565a3e698a83ca465c5da83ff892360c33b8f
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline.md
@@ -0,0 +1,27 @@
+## How to use the timeline tool for profiling
+
+1. Add `with profiler.profiler(...)` around the main training loop. After the run, a profile record file `/tmp/profile` is generated. **Warning**: do not profile too many batches, because the profile record grows with the number of batches.
+
+   ```python
+   with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+       for pass_id in range(pass_num):
+           for batch_id, data in enumerate(train_reader()):
+               exe.run(fluid.default_main_program(),
+                       feed=feeder.feed(data),
+                       fetch_list=[],
+                       use_program_cache=True)
+               ...
+   ```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`; by default it generates another file, `/tmp/timeline`. The output path can be changed with a command-line parameter; see
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+1. Open Chrome, visit `chrome://tracing/`, and use the `load` button to load the generated `timeline` file.
+
+   ![chrome tracing](./tracing.jpeg)
+
+1. 
The resulting timeline should be like: + + + ![chrome timeline](./timeline.jpeg) diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5 Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 8e4165da6b8135d083766c650f1092158f9d01c2..ddcd42a0c6554469d702d3a9bbecd16643d6b7ed 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -39,7 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具 "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index 0d4c925b6e2731bdfd76582309aa7e8adbafa6ae..e08c84703bfa89352a79acbddd5d7f1bc88ce82e 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -42,7 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst index 0c534f107b6e047035c424ed2ea59f3982799b63..b0268907bceb11cd53a4630c3f8b8e0424abe247 100644 --- a/doc/v2/howto/index_cn.rst +++ b/doc/v2/howto/index_cn.rst @@ -1,11 +1,37 @@ 进阶使用 ======== 
+PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考: + .. toctree:: :maxdepth: 1 cmd_parameter/index_cn.rst + +PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考: + +.. toctree:: + :maxdepth: 1 + cluster/index_cn.rst + +PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南: + +.. toctree:: + :maxdepth: 1 + capi/index_cn.rst + +PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考: + +.. toctree:: + :maxdepth: 1 + rnn/index_cn.rst + +关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考: + +.. toctree:: + :maxdepth: 1 + optimization/gpu_profiling_cn.rst diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 82c7d4a2ec648449fc65ca2ae0de397b2f6fa120..48713f2c2ac62a37b7b7a4602f7f6a325aecb0b8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -if (WITH_GPU) +if(WITH_GPU) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) -endif () +endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) -if (WITH_GPU) +if(WITH_GPU) nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) else() cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) @@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry init math_function) -cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) -cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +if(WITH_GPU) + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) +else() + cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) + cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +endif() cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index bda1bfb23b18f8c6b9f1c3eded461a9322a154af..9f8fb12098d622058a86f83c1c42a1feb1cfb2e2 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -28,24 +28,19 @@ class Channel { virtual bool Send(T*) = 0; virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; + virtual void Lock() = 0; + virtual void Unlock() = 0; virtual void Close() = 0; virtual ~Channel() {} }; // Forward declaration of channel implementations. 
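With `details::Buffered` and `details::UnBuffered` removed, `MakeChannel` now returns a single `ChannelImpl` for both capacities, and `Lock`/`Unlock`/`Cap` join the public `Channel` interface. Below is a minimal usage sketch in the spirit of `channel_test.cc`, compiled against the framework header; the template arguments (`Channel<int>`, `MakeChannel<int>`) are written out explicitly here because the diff text above has lost its angle brackets:

```cpp
#include <thread>

#include "paddle/fluid/framework/channel.h"

using paddle::framework::Channel;
using paddle::framework::CloseChannel;
using paddle::framework::MakeChannel;

int main() {
  // buffer_size > 0 yields a buffered channel, 0 an unbuffered
  // (rendezvous) channel; both are ChannelImpl<int> underneath.
  Channel<int>* ch = MakeChannel<int>(10);

  std::thread sender([ch]() {
    for (int i = 0; i < 5; ++i) {
      int value = i;
      ch->Send(&value);  // returns false once the channel is closed
    }
  });

  for (int i = 0; i < 5; ++i) {
    int received = 0;
    ch->Receive(&received);  // blocks until a value (or close) arrives
  }

  CloseChannel(ch);  // wakes any senders/receivers still blocked
  sender.join();
  delete ch;
  return 0;
}
```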
-namespace details { template -class Buffered; -template -class UnBuffered; -} // namespace details +class ChannelImpl; template Channel* MakeChannel(size_t buffer_size) { - if (buffer_size > 0) { - return new details::Buffered(buffer_size); - } - return new details::UnBuffered(); + return new ChannelImpl(buffer_size); } template @@ -89,6 +84,19 @@ class ChannelHolder { if (IsInitialized()) holder_->Close(); } + size_t Cap() { + if (IsInitialized()) return holder_->Cap(); + return -1; + } + + void Lock() { + if (IsInitialized()) holder_->Lock(); + } + + void Unlock() { + if (IsInitialized()) holder_->Unlock(); + } + inline bool IsInitialized() const { return holder_ != nullptr; } inline const std::type_index Type() { @@ -106,6 +114,9 @@ class ChannelHolder { virtual const std::type_index Type() const = 0; virtual void* Ptr() const = 0; virtual void Close() = 0; + virtual void Lock() = 0; + virtual void Unlock() = 0; + virtual size_t Cap() = 0; }; template @@ -115,11 +126,28 @@ class ChannelHolder { } virtual const std::type_index Type() const { return type_; } + virtual void* Ptr() const { return static_cast(channel_.get()); } + virtual void Close() { if (channel_) channel_->Close(); } + virtual size_t Cap() { + if (channel_) + return channel_->Cap(); + else + return -1; + } + + virtual void Lock() { + if (channel_) channel_->Lock(); + } + + virtual void Unlock() { + if (channel_) channel_->Unlock(); + } + std::unique_ptr> channel_; const std::type_index type_; }; @@ -131,5 +159,4 @@ class ChannelHolder { } // namespace framework } // namespace paddle -#include "paddle/fluid/framework/details/buffered_channel.h" -#include "paddle/fluid/framework/details/unbuffered_channel.h" +#include "paddle/fluid/framework/channel_impl.h" diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..a4561031fd8c49613269e7008ce558f25f9765e4 --- /dev/null +++ b/paddle/fluid/framework/channel_impl.h @@ -0,0 +1,229 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include // for size_t +#include +#include +#include +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +class ChannelImpl : public paddle::framework::Channel { + friend Channel *paddle::framework::MakeChannel(size_t); + friend void paddle::framework::CloseChannel(Channel *); + + public: + virtual bool Send(T *); + virtual bool Receive(T *); + virtual size_t Cap() { return cap_; } + virtual void Lock(); + virtual void Unlock(); + virtual void Close(); + + ChannelImpl(size_t); + virtual ~ChannelImpl(); + + private: + struct QueueMessage { + T *data; + std::condition_variable_any cond; + bool chan_closed = false; + bool completed = false; + + QueueMessage(T *item) : data(item) {} + + void Wait(std::unique_lock &lock) { + cond.wait(lock, [this]() { return completed; }); + } + + void Notify() { + completed = true; + cond.notify_all(); + } + }; + + bool send_return(bool value) { + send_ctr--; + destructor_cond_.notify_all(); + return value; + } + + bool recv_return(bool value) { + recv_ctr--; + destructor_cond_.notify_all(); + return value; + } + + size_t cap_; + std::recursive_mutex mu_; + bool closed_; + std::deque buf_; + std::deque> recvq; + std::deque> sendq; + std::atomic send_ctr{0}; + std::atomic recv_ctr{0}; + std::condition_variable_any destructor_cond_; +}; + +template +ChannelImpl::ChannelImpl(size_t capacity) + : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) { + PADDLE_ENFORCE_GE(capacity, 0); +} + +template +bool ChannelImpl::Send(T *item) { + send_ctr++; + std::unique_lock lock{mu_}; + + // If channel is closed, do nothing + if (closed_) { + lock.unlock(); + // TODO(abhinavarora) Should panic on closed channel + return send_return(false); + } + + // If there is a receiver, directly pass the value we want + // to send to the receiver, bypassing the channel buffer if any + if (!recvq.empty()) { + std::shared_ptr m = recvq.front(); + recvq.pop_front(); + // Do the data transfer + *(m->data) = std::move(*item); + // Wake up the blocked process and unlock + m->Notify(); + lock.unlock(); + return send_return(true); + } + + // Unbuffered channel will always bypass this + // If buffered channel has space in buffer, + // write the element to the buffer. 
+ if (buf_.size() < cap_) { + // Copy to buffer + buf_.push_back(std::move(*item)); + // Release lock and return true + lock.unlock(); + return send_return(true); + } + + // Block on channel, because some receiver will complete + // the operation for us + auto m = std::make_shared(item); + sendq.push_back(m); + m->Wait(lock); + // TODO(abhinavarora) Should panic on closed channel + return send_return(!m->chan_closed); +} + +template +bool ChannelImpl::Receive(T *item) { + recv_ctr++; + std::unique_lock lock{mu_}; + + // If channel is closed and buffer is empty or + // channel is unbuffered + if (closed_ && buf_.empty()) { + lock.unlock(); + return recv_return(false); + } + + // If there is a sender, directly receive the value we want + // from the sender, bypassing the channel buffer if any + if (!sendq.empty()) { + std::shared_ptr m = sendq.front(); + sendq.pop_front(); + // Do the data transfer + *item = std::move(*(m->data)); + // Wake up the blocked process and unlock + m->Notify(); + lock.unlock(); + return recv_return(true); + } + + // If this is a buffered channel and there are items in buffer + if (buf_.size() > 0) { + // Directly read from buffer + *item = std::move(buf_.front()); + buf_.pop_front(); + // Release lock and return true + lock.unlock(); + return recv_return(true); + } + + // No sender available, block on this channel + // Some receiver will complete the option for us + auto m = std::make_shared(item); + recvq.push_back(m); + m->Wait(lock); + + return recv_return(!m->chan_closed); +} + +template +void ChannelImpl::Lock() { + mu_.lock(); +} + +template +void ChannelImpl::Unlock() { + mu_.unlock(); +} + +template +void ChannelImpl::Close() { + std::unique_lock lock{mu_}; + + if (closed_) { + // TODO(abhinavarora): closing an already closed channel should panic + lock.unlock(); + return; + } + + closed_ = true; + + // Empty the readers + while (!recvq.empty()) { + std::shared_ptr m = recvq.front(); + recvq.pop_front(); + m->chan_closed = true; + m->Notify(); + } + + // Empty the senders + while (!sendq.empty()) { + std::shared_ptr m = sendq.front(); + sendq.pop_front(); + m->chan_closed = true; + m->Notify(); + } +} + +template +ChannelImpl::~ChannelImpl() { + Close(); + // The destructor must wait for all readers and writers to complete their task + // The channel has been closed, so we will not accept new readers and writers + std::unique_lock lock{mu_}; + destructor_cond_.wait(lock, + [this]() { return send_ctr == 0 && recv_ctr == 0; }); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc index 695169fcb9e93b5e69d3d4ae6f63f8e4c2d1605f..edfb41c72489113d9803c2957baed1ce44f8296d 100644 --- a/paddle/fluid/framework/channel_test.cc +++ b/paddle/fluid/framework/channel_test.cc @@ -23,8 +23,19 @@ using paddle::framework::Channel; using paddle::framework::ChannelHolder; using paddle::framework::MakeChannel; using paddle::framework::CloseChannel; -using paddle::framework::details::Buffered; -using paddle::framework::details::UnBuffered; + +TEST(Channel, ChannelCapacityTest) { + const size_t buffer_size = 10; + auto ch = MakeChannel(buffer_size); + EXPECT_EQ(ch->Cap(), buffer_size); + CloseChannel(ch); + delete ch; + + ch = MakeChannel(0); + EXPECT_EQ(ch->Cap(), 0U); + CloseChannel(ch); + delete ch; +} void RecevingOrderEqualToSendingOrder(Channel *ch) { unsigned sum_send = 0; @@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel *ch) { } }); for (int i = 0; i < 5; i++) { 
- int recv; + int recv = 999; EXPECT_EQ(ch->Receive(&recv), true); EXPECT_EQ(recv, i); } - + std::this_thread::sleep_for(std::chrono::milliseconds(200)); CloseChannel(ch); t.join(); EXPECT_EQ(sum_send, 10U); delete ch; } -TEST(Channel, MakeAndClose) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - { - // MakeChannel should return a buffered channel is buffer_size > 0. - auto ch = MakeChannel(10); - EXPECT_NE(dynamic_cast *>(ch), nullptr); - EXPECT_EQ(dynamic_cast *>(ch), nullptr); - CloseChannel(ch); - delete ch; - } - { - // MakeChannel should return an un-buffered channel is buffer_size = 0. - auto ch = MakeChannel(0); - EXPECT_EQ(dynamic_cast *>(ch), nullptr); - EXPECT_NE(dynamic_cast *>(ch), nullptr); - CloseChannel(ch); - delete ch; - } -} - TEST(Channel, SufficientBufferSizeDoesntBlock) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); @@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); - size_t sum = 0; std::thread t([&]() { // Try to write more than buffer size. for (size_t i = 0; i < 2 * buffer_size; ++i) { @@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations else EXPECT_EQ(ch->Send(&i), false); - sum += i; } }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec - EXPECT_EQ(sum, 45U); - + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec CloseChannel(ch); t.join(); delete ch; @@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { @@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { // This should unblock all receivers CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { for (size_t i = 0; i < num_threads; i++) t[i].join(); } -void ChannelCloseUnblocksSendersTest(Channel *ch) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - +void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel *ch) { }, &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - if (dynamic_cast *>(ch)) { + if (isBuffered) { // If ch is Buffered, atleast 4 threads must be blocked. 
int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel *ch) { // This should unblock all senders CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], true); } - if (dynamic_cast *>(ch)) { + if (isBuffered) { // Verify that only 1 send was successful int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { // any senders waiting for channel to have write space TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(1); - ChannelCloseUnblocksSendersTest(ch); + ChannelCloseUnblocksSendersTest(ch, true); delete ch; } @@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { // unblocks any senders waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(0); - ChannelCloseUnblocksReceiversTest(ch); + ChannelCloseUnblocksSendersTest(ch, false); delete ch; } @@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { ch->Receive(&recv); EXPECT_EQ(recv, i); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec EXPECT_EQ(sum_send, 3U); CloseChannel(ch); @@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { ch->Send(&i); sum_send += i; } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec EXPECT_EQ(sum_send, 10U); EXPECT_EQ(sum_receive, 10U); // send three more elements @@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { // This tests that destroying a channel unblocks // any senders waiting for channel to have write space -void ChannelDestroyUnblockSenders(Channel *ch) { +void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel *ch) { &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - bool is_buffered_channel = false; - if (dynamic_cast *>(ch)) is_buffered_channel = true; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - if (is_buffered_channel) { + if (isBuffered) { // If channel is buffered, verify that atleast 4 threads are blocked int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel *ch) { EXPECT_EQ(thread_ended[i], true); } - // Count number of successfuld sends + // Count number of successful sends int ct = 0; for (size_t i = 0; i < num_threads; i++) { if (send_success[i]) ct++; } - if (is_buffered_channel) { + if (isBuffered) { // Only 1 send must be successful EXPECT_EQ(ct, 1); } else { @@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { size_t buffer_size = 1; auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockSenders(ch); + ChannelDestroyUnblockSenders(ch, true); } // This tests that destroying an unbuffered channel also unblocks @@ -507,7 +488,20 @@ TEST(Channel, 
UnbufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { auto ch = MakeChannel(0); - ChannelDestroyUnblockSenders(ch); + ChannelDestroyUnblockSenders(ch, false); +} + +TEST(ChannelHolder, ChannelHolderCapacityTest) { + const size_t buffer_size = 10; + ChannelHolder *ch = new ChannelHolder(); + ch->Reset(buffer_size); + EXPECT_EQ(ch->Cap(), buffer_size); + delete ch; + + ch = new ChannelHolder(); + ch->Reset(0); + EXPECT_EQ(ch->Cap(), 0U); + delete ch; } void ChannelHolderSendReceive(ChannelHolder *ch) { @@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { @@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { // This should unblock all receivers ch->close(); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { } void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { }, &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait if (isBuffered) { // If ch is Buffered, atleast 4 threads must be blocked. 
@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { // This should unblock all senders ch->close(); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec if (isBuffered) { // If channel is buffered, verify that atleast 4 threads are blocked int ct = 0; @@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads are blocked for (size_t i = 0; i < num_threads; i++) { diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 0475fc1d9aaeda39b3fe845b70461c434bcaafc4..bfad9ac1e9cad1936ed961ad1da55787d2faa23e 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type, PassTensorData(&out, &in); } + // do data type transform if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); transformed = true; diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 1dec766a345d8d6c6fdafc1b50450a9dde91fe5d..4c1b3e7581fe716271c62389c6053a24158913d2 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -16,13 +16,16 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace framework { inline proto::VarType::Type ToDataType(std::type_index type) { using namespace paddle::framework::proto; - if (typeid(float).hash_code() == type.hash_code()) { + if (typeid(platform::float16).hash_code() == type.hash_code()) { + return proto::VarType::FP16; + } else if (typeid(float).hash_code() == type.hash_code()) { return proto::VarType::FP32; } else if (typeid(double).hash_code() == type.hash_code()) { return proto::VarType::FP64; @@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) { inline std::type_index ToTypeIndex(proto::VarType::Type type) { using namespace paddle::framework::proto; switch (type) { + case proto::VarType::FP16: + return typeid(platform::float16); case proto::VarType::FP32: return typeid(float); case proto::VarType::FP64: @@ -59,6 +64,9 @@ template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { using namespace paddle::framework::proto; switch (type) { + case proto::VarType::FP16: + visitor.template operator()(); + break; case proto::VarType::FP32: visitor.template operator()(); break; diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 54cc1575d880242c01cd4752f522380a673b7c75..c0523f3c795b103c0c27081ec5dc717f6a0f11e0 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -47,9 +47,16 @@ struct CastDataType { auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, out_begin, CastDataTypeFunctor()); +#ifdef __NVCC__ + } else if (platform::is_gpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + context->Wait(); +#endif } else { - // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
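`VisitDataType` turns the runtime `proto::VarType` tag back into a static type by instantiating the visitor's `operator()` for the matching `T`, and `CastDataType` is such a visitor with the source type already fixed, so `TransDataType` effectively double-dispatches on (src_type, dst_type). A self-contained sketch of the same pattern, using only the standard library (the enum and all names are illustrative stand-ins, not Paddle's):

```cpp
#include <cstdio>
#include <stdexcept>
#include <vector>

// Illustrative stand-in for a subset of proto::VarType::Type.
enum class VarType { FP32, FP64, INT32 };

// Map a runtime type tag to a statically typed call on the visitor,
// mirroring framework::VisitDataType.
template <typename Visitor>
void VisitDataType(VarType type, Visitor visitor) {
  switch (type) {
    case VarType::FP32:
      visitor.template operator()<float>();
      break;
    case VarType::FP64:
      visitor.template operator()<double>();
      break;
    case VarType::INT32:
      visitor.template operator()<int>();
      break;
    default:
      throw std::runtime_error("unsupported data type");
  }
}

// Mirrors CastDataType: the source type InT is fixed when the visitor is
// built; the destination type OutT is chosen by VisitDataType.
template <typename InT>
struct CastDataType {
  std::vector<InT> in;

  template <typename OutT>
  void operator()() const {
    for (InT v : in) {
      OutT casted = static_cast<OutT>(v);  // what CastDataTypeFunctor does
      std::printf("%g -> %g\n", static_cast<double>(v),
                  static_cast<double>(casted));
    }
  }
};

int main() {
  CastDataType<float> cast_from_float{{0.5f, 1.5f, 2.5f}};
  VisitDataType(VarType::INT32, cast_from_float);  // 0.5->0, 1.5->1, 2.5->2
  return 0;
}
```

This shape is what makes the FP16 additions above mechanical: one new `case` in each `switch`, plus the `float16` instantiations it triggers.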
- PADDLE_THROW("Unsupport CPU <-> GPU!"); + PADDLE_THROW("Unsupported place!"); } } }; @@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, auto ctx = pool.Get(in.place()); switch (src_type) { + case proto::VarType::FP16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu new file mode 120000 index 0000000000000000000000000000000000000000..f46491293ef4ad688c1bce9327f5f28011dec809 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cu @@ -0,0 +1 @@ +data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 724c8c301f25cca21704de398fc416dff46c330c..6b9a8f5e28b372c45abfaa2c20575a55d9a9dd03 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) { auto place = CPUPlace(); - Tensor in; - Tensor out; - - float* ptr = in.mutable_data(make_ddim({2, 3}), place); - int data_number = 2 * 3; - - for (int i = 0; i < data_number; ++i) { - ptr[i] = i / 3; - } - + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, + DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, + DataLayout::kAnyLayout, LibraryType::kPlain); - TransDataType(kernel_fp32, kernel_fp64, in, &out); - double* out_data_double = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + // data type transform from float32 + { + Tensor in; + Tensor out; + + float* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(i / 3)); + } + + TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(i / 3)); + } } - TransDataType(kernel_fp32, kernel_int32, in, &out); - int* out_data_int = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + // data type transform from/to float16 + { + Tensor in; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in, &out); + double* out_data_double = out.data(); 
+ for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } } } diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..de389ddabcb86de0155757406a406e44086c5474 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, GPUTransform) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto cpu_place = CPUPlace(); + auto gpu_place = CUDAPlace(0); + CUDADeviceContext context(gpu_place); + + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + // data type transform from float32 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(in_ptr, arr, sizeof(arr)); + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(arr[i])); + } + + TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(arr[i])); + } + } + + // data type transform from/to float16 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float16 arr[6] = {float16(0), float16(1), float16(2), + float16(3), float16(4), float16(5)}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(ptr, arr, sizeof(arr)); + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in_gpu, 
&out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = + in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = + in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } + } +} diff --git a/paddle/fluid/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h deleted file mode 100644 index 88faf3acf7c17b0cb3770a8910e400a1f6688f5f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/buffered_channel.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace details { - -// Four of the properties of Buffered Channel: -// - A send to a full channel blocks temporarily until a receive from the -// channel or the channel is closed. -// - A receive from an empty channel blocks temporarily until a send to the -// channel or the channel is closed. -// - A send to a closed channel returns false immediately. -// - A receive from a closed channel returns false immediately. - -template -class Buffered : public paddle::framework::Channel { - friend Channel* paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel*); - - public: - virtual bool Send(T*); - virtual bool Receive(T*); - virtual size_t Cap() { return cap_; } - virtual void Close(); - virtual ~Buffered(); - - private: - size_t cap_; - std::mutex mu_; - std::condition_variable empty_cond_var_; - std::condition_variable full_cond_var_; - std::condition_variable destructor_cond_var_; - std::deque channel_; - std::atomic closed_{false}; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - - Buffered(size_t cap) : cap_(cap), closed_(false) { - PADDLE_ENFORCE_GT(cap, 0); - } - - void NotifyAllParticipants(std::unique_lock*); -}; - -template -bool Buffered::Send(T* item) { - bool ret = false; - if (closed_) { - return ret; - } - send_ctr++; - std::unique_lock lock(mu_); - full_cond_var_.wait(lock, - [this]() { return channel_.size() < cap_ || closed_; }); - if (!closed_) { - channel_.push_back(std::move(*item)); - lock.unlock(); - empty_cond_var_.notify_one(); - ret = true; - } - send_ctr--; - destructor_cond_var_.notify_one(); - return ret; -} - -template -bool Buffered::Receive(T* item) { - bool ret = false; - // Once the channel has been closed and all data has been consumed, - // just return false. Don't even try acquiring the mutex. 
- if (closed_ && channel_.empty()) { - return false; - } - recv_ctr++; - std::unique_lock lock(mu_); - empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); - if (!channel_.empty()) { - *item = std::move(channel_.front()); - channel_.pop_front(); - full_cond_var_.notify_one(); - ret = true; - } - recv_ctr--; - destructor_cond_var_.notify_one(); - return ret; -} - -template -void Buffered::Close() { - if (closed_) { - return; - } - std::unique_lock lock(mu_); - closed_ = true; - NotifyAllParticipants(&lock); -} - -template -Buffered::~Buffered() { - std::unique_lock lock(mu_); - closed_ = true; - channel_.clear(); - NotifyAllParticipants(&lock); - - // The destructor must wait for all readers and writers to complete their task - // The channel has been closed, so we will not accept new readers and writers - lock.lock(); - destructor_cond_var_.wait( - lock, [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -template -void Buffered::NotifyAllParticipants(std::unique_lock* lock) { - lock->unlock(); - full_cond_var_.notify_all(); - empty_cond_var_.notify_all(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h deleted file mode 100644 index 5c9424928cb7029aac813e7b2f29f81a0093f836..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/unbuffered_channel.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/channel.h" - -namespace paddle { -namespace framework { -namespace details { - -// Four of the properties of UnBuffered Channel: -// - A send to a channel blocks temporarily until a receive from the -// channel or the channel is closed. -// - A receive from a channel blocks temporarily until a send to the -// channel or the channel is closed. -// - A send to a closed channel returns false immediately. -// - A receive from a closed channel returns false immediately. 
-template -class UnBuffered : public paddle::framework::Channel { - friend Channel* paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel*); - - public: - virtual bool Send(T*); - virtual bool Receive(T*); - virtual size_t Cap() { return 0; } - virtual void Close(); - virtual ~UnBuffered(); - - private: - std::mutex mu_ch_; - // Mutex for readers and writers who are waiting for other reader - // and writer to complete execution - std::recursive_mutex mu_read_, mu_write_; - // reader_found_ is set true when a reader is ready to accept data - // writer_found_ is set true when a writer is ready to send data - // A transaction occurs only when both are true - std::atomic reader_found_{false}, writer_found_{false}; - std::condition_variable cv_channel_; - std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_; - T* item{nullptr}; - std::atomic closed_{false}; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - - UnBuffered() : closed_(false) {} - - void NotifyAllParticipants(std::unique_lock*); -}; - -// This function implements the concept of how data should -// be sent from a writer to a reader. -template -bool UnBuffered::Send(T* data) { - bool ret = false; - if (closed_) { - return ret; - } - send_ctr++; - // Prevent other writers from entering - std::unique_lock writer_lock(mu_write_); - writer_found_ = true; - std::unique_lock cv_lock(mu_write_); - // If writer comes first, it should wait till a reader arrives - cv_writer_.wait(cv_lock, - [this]() { return reader_found_ == true || closed_; }); - cv_reader_.notify_one(); - if (!closed_) { - std::unique_lock channel_lock(mu_ch_); - item = data; - channel_lock.unlock(); - cv_channel_.notify_one(); - channel_lock.lock(); - cv_channel_.wait(channel_lock, - [this]() { return item == nullptr || closed_; }); - ret = true; - } - writer_found_ = false; - send_ctr--; - cv_destructor_.notify_one(); - return ret; -} - -// This function implements the concept of how -// data that was sent by a writer is read from a reader. -template -bool UnBuffered::Receive(T* data) { - bool ret = false; - // If channel is closed, we don't even want any reader to enter. - // Unlike a buffered channel, an unbuffered channel does not allow - // readers to read after closing because there is no buffer to be consumed. - if (closed_) return ret; - recv_ctr++; - // Prevent other readers from entering - std::unique_lock read_lock{mu_read_}; - reader_found_ = true; - std::unique_lock cv_lock{mu_read_}; - // If reader comes first, it should wait till a writer arrives - cv_reader_.wait(cv_lock, - [this]() { return writer_found_ == true || closed_; }); - cv_writer_.notify_one(); - if (!closed_) { - std::unique_lock lock_ch{mu_ch_}; - // Reader should wait for the writer to first write its data - cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); - if (!closed_) { - *data = std::move(*item); - item = nullptr; - lock_ch.unlock(); - ret = true; - } - cv_channel_.notify_one(); - } - reader_found_ = false; - recv_ctr--; - cv_destructor_.notify_one(); - return ret; -} - -// This function implements the sequence of events -// that take place once the channel is closed. -template -void UnBuffered::Close() { - if (closed_) { - return; - } - std::unique_lock lock(mu_ch_); - item = nullptr; - closed_ = true; - NotifyAllParticipants(&lock); -} - -// This function implements the sequence of events -// that are executed once the object of an UnBuffered -// channel is destroyed. 
-template -UnBuffered::~UnBuffered() { - std::unique_lock lock(mu_ch_); - item = nullptr; - closed_ = true; - NotifyAllParticipants(&lock); - lock.lock(); - cv_destructor_.wait(lock, - [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -// This function notifies all the readers, writers and -// the channel condition variables. -template -void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { - lock->unlock(); - cv_writer_.notify_all(); - cv_channel_.notify_all(); - cv_reader_.notify_all(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index dc1caa72a4c09c7045c87575abab0faac7b5bbc8..91879d6d45868bb37ca44baafb8b0e8677cd6d1a 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const { return shapes_[idx]; } -void ShuffleReader::ReadNext(std::vector* out) { - if (iteration_pos_ >= buffer_.size()) { - // Reload buffer with new data - buffer_.clear(); - buffer_.reserve(buffer_size_); - for (int i = 0; i < buffer_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { - break; - } - } - // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be - // optimize. - std::random_shuffle(buffer_.begin(), buffer_.end()); - iteration_pos_ = 0; - } - out->clear(); - if (!buffer_.empty()) { - std::swap(*out, buffer_[iteration_pos_++]); - } - // if buffer_ is empty, the 'out' will return as an empty vector. -} - -void BatchReader::ReadNext(std::vector* out) { - buffer_.clear(); - buffer_.reserve(batch_size_); - for (int i = 0; i < batch_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { - break; - } - } - // Concat instances - out->clear(); - if (buffer_.empty()) { - // if buffer_ is empty, the 'out' will return as an empty vector. 
- return; - } - int out_num = buffer_[0].size(); - out->reserve(out_num); - for (int j = 0; j < out_num; ++j) { - // Merge shape and check date type - std::type_index batch_type = buffer_[0][j].type(); - DDim batch_shape = buffer_[0][j].dims(); - for (size_t i = 1; i < buffer_.size(); ++i) { - std::type_index ins_type = buffer_[i][j].type(); - DDim ins_shape = buffer_[i][j].dims(); - PADDLE_ENFORCE_EQ(batch_type, ins_type); - PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), - slice_ddim(ins_shape, 1, ins_shape.size())); - PADDLE_ENFORCE_GT(ins_shape[0], 0); - batch_shape[0] += ins_shape[0]; - } - - LoDTensor out_tensor; - out_tensor.Resize(batch_shape); - out_tensor.mutable_data(platform::CPUPlace(), batch_type); - int64_t dst_offset = 0; - - // Merge lod and data - LoD batch_lod; - for (size_t i = 0; i < buffer_.size(); ++i) { - DDim ins_shape = buffer_[i][j].dims(); - LoD ins_lod = buffer_[i][j].lod(); - if (i == 0) { - batch_lod = ins_lod; - } else { - PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); - for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { - auto& lod_level = batch_lod[level_idx]; - for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { - lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); - } - } - } - Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); - TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst); - dst_offset += ins_shape[0]; - } - out_tensor.set_lod(batch_lod); - out->push_back(out_tensor); - } -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 4a5eba5fb733b3e9da2b245b4dda18725c9b0895..27ab6e750c2e665fa5055a3ecfb2f315cb4000c0 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -60,83 +60,8 @@ class DecoratedReader : public ReaderBase { ReaderBase* reader_; }; -// file readers - -template -class RandomDataGenerator : public FileReader { - public: - RandomDataGenerator(const std::vector& shapes, float min, float max) - : FileReader(shapes), min_(min), max_(max) { - PADDLE_ENFORCE_LE( - min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); - unsigned int seed = std::random_device()(); - engine_.seed(seed); - dist_ = std::uniform_real_distribution(min_, max_); - } - - void ReadNext(std::vector* out) override { - out->clear(); - out->reserve(shapes_.size()); - for (const DDim& shape : shapes_) { - PADDLE_ENFORCE_GE( - shape.size(), 2, - "The rank of reader's output data should be 2 at least.(Now it's %d)", - shape.size()); - LoDTensor out_tensor; - out_tensor.Resize(shape); - T* data = out_tensor.mutable_data(platform::CPUPlace()); - int64_t numel = product(shape); - for (int64_t i = 0; i < numel; ++i) { - data[i] = dist_(engine_); - } - out->push_back(out_tensor); - } - } - - bool HasNext() const override { return true; } - - void ReInit() override { return; } - - private: - float min_; - float max_; - std::minstd_rand engine_; - std::uniform_real_distribution dist_; -}; - -// decorated readers - -class ShuffleReader : public DecoratedReader { - public: - ShuffleReader(ReaderBase* reader, int buffer_size) - : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { - buffer_.reserve(buffer_size); - } - - void ReadNext(std::vector* out) override; - - private: - int buffer_size_; - std::vector> buffer_; - size_t iteration_pos_; -}; - -class BatchReader : public DecoratedReader { - public: - BatchReader(ReaderBase* reader, int batch_size) - : 
DecoratedReader(reader), batch_size_(batch_size) { - buffer_.reserve(batch_size_); - } - - void ReadNext(std::vector* out) override; - - private: - int batch_size_; - std::vector> buffer_; -}; - -// The ReaderHolder is used as readers' unified wrapper, -// making it easier to access different type readers in Variables. +// The ReaderHolder is used as a reader's unified wrapper, +// making it easier to access different types of readers in Variables. class ReaderHolder { public: void Reset(ReaderBase* reader) { reader_.reset(reader); } diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8aebfcb3b624f49b747e0874e807782b8d4a9951..9687a86ca25be7886e67028a38e54b3065c8e4b5 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -235,27 +235,53 @@ TEST(TensorToVector, Tensor) { TEST(TensorContainsNAN, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - float* buf = src.mutable_data({3}, CPUPlace()); - buf[0] = 0.0; - buf[1] = NAN; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsNAN(src)); - buf[1] = 0.0; - ASSERT_FALSE(TensorContainsNAN(src)); + { + Tensor src; + float* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } + + { + Tensor src; + float16* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 0.0; + buf[1].x = 0x7fff; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } } TEST(TensorContainsInf, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - double* buf = src.mutable_data({3}, CPUPlace()); - buf[0] = 1.0; - buf[1] = INFINITY; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsInf(src)); - buf[1] = 1.0; - ASSERT_FALSE(TensorContainsInf(src)); + { + Tensor src; + double* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } + + { + Tensor src; + float16* buf = src.mutable_data({3}, CPUPlace()); + buf[0] = 1.0; + buf[1].x = 0x7c00; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } } TEST(Tensor, FromAndToStream) { diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index d630ec44a2aa6f2567eedbb58b9ae181700f5fc9..4766ec28aa3cff6be3259f258f1c9543ae471f5d 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) { buf[1] = 0.1; buf[2] = NAN; } + static __global__ void FillInf(float* buf) { buf[0] = 0.0; buf[1] = INFINITY; buf[2] = 0.5; } +static __global__ void FillNAN(platform::float16* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2].x = 0x7fff; +} + +static __global__ void FillInf(platform::float16* buf) { + buf[0] = 0.0; + buf[1].x = 0x7c00; + buf[2] = 0.5; +} + TEST(TensorContainsNAN, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data({3}, gpu); - FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); -
ASSERT_TRUE(TensorContainsNAN(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } } TEST(TensorContainsInf, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data({3}, gpu); - FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); - ASSERT_TRUE(TensorContainsInf(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } } } // namespace framework diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index bdb147955ca0700dc0854b54c38d961caf8845f3..17ccca8cdcbcaabaddbbc0ca1d3ca4fdf054b0fb 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -5,7 +5,8 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) # Create static library -cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 80eb9889670744ae527ea29609b33631a021bfa8..52e9c0baa64508f82d0a86a88c8c5f8d23f9f7f2 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -22,14 +22,14 @@ namespace paddle { namespace inference { void ReadBinaryFile(const std::string& filename, std::string& contents) { - VLOG(3) << "loading model from " << filename; - std::ifstream inputfs(filename, std::ios::in | std::ios::binary); - inputfs.seekg(0, std::ios::end); + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); contents.clear(); - contents.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - inputfs.read(&contents[0], contents.size()); - inputfs.close(); + contents.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&contents[0], contents.size()); + fin.close(); } bool IsPersistable(const framework::VarDesc* var) { @@ -97,6 +97,7 @@ std::unique_ptr Load(framework::Executor& executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; + VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index d6fc51301bc45d3ce01142c0f015e4bb69e519bf..e9a27171f1cd68e7b10c860fb4a1417b930ed565 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ 
b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Number of times to run the inference program"); TEST(inference, image_classification) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,13 +32,11 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` to initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normalized image pixels as input data, // which should be in the range [0.0, 1.0]. SetupTensor(input, - {batch_size, 3, 32, 32}, + {FLAGS_batch_size, 3, 32, 32}, static_cast(0), static_cast(1)); std::vector cpu_feeds; @@ -46,7 +47,9 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << "--- CPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -55,7 +58,9 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << "--- GPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc index 99bee94cb82633219df7a4a5c5bada15d2d3ce64..1fb0f9e77797cf6e61e918700763ee33a495cb96 100644 --- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Number of times to run the inference program"); TEST(inference, recognize_digits) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,77 +32,39 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` to initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normalized image pixels as input data, // which should be in the range [-1.0, 1.0].
SetupTensor(input, - {batch_size, 1, 28, 28}, + {FLAGS_batch_size, 1, 28, 28}, static_cast(-1), static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); + for (auto is_combined : {false, true}) { + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); - CheckError(output1, output2); + CheckError(output1, output2); #endif -} - -TEST(inference, recognize_digits_combine) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; - - // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - - paddle::framework::LoDTensor input; - // Use normilized image pixels as input data, - // which should be in the range [-1.0, 1.0]. - SetupTensor( - input, {1, 1, 28, 28}, static_cast(-1), static_cast(1)); - std::vector cpu_feeds; - cpu_feeds.push_back(&input); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); - -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); - - // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); - - CheckError(output1, output2); -#endif } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 49518e50d8541477234f17ac5b8709aeb57662ff..d0688445fe642d74b6bb7d5dd38a5ac3b4a719df 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/profiler.h" template void SetupTensor(paddle::framework::LoDTensor& input, @@ -87,31 +88,58 @@ void CheckError(paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, - std::vector& cpu_fetchs) { + std::vector& cpu_fetchs, + const int repeat = 1, + const bool is_combined = false) { // 1. 
Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kCUDA; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#endif + } + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - if (IsCombined) { - // All parameters are saved in a single file. - // Hard-coding the file names of program and parameters in unittest. - // The file names should be consistent with that used in Python API - // `fluid.io.save_inference_model`. - std::string prog_filename = "__model_combined__"; - std::string param_filename = "__params_combined__"; - inference_program = paddle::inference::Load(executor, - *scope, - dirname + "/" + prog_filename, - dirname + "/" + param_filename); - } else { - // Parameters are saved in separate files sited in the specified `dirname`. - inference_program = paddle::inference::Load(executor, *scope, dirname); + { + paddle::platform::RecordEvent record_event( + "init_program", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + if (is_combined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = + paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified + // `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } } // 3. Get the feed_target_names and fetch_target_names @@ -134,7 +162,21 @@ void TestInference(const std::string& dirname, } // 6. 
Run the inference program - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + { + // Run repeat times to profile the performance + for (int i = 0; i < repeat; ++i) { + paddle::platform::RecordEvent record_event( + "run_inference", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "profiler.txt"); + paddle::platform::ResetProfiler(); delete scope; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e1c02ec1613a7c31258f2d305b040ea627cebcdd..5d436a7e0c3752c889c19820507589f34d3bee94 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,5 +1,7 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") @@ -13,6 +15,8 @@ function(op_library TARGET) set(cu_cc_srcs) set(cudnn_cu_cc_srcs) set(CUDNN_FILE) + set(mkldnn_cc_srcs) + set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function) set(options "") set(oneValueArgs "") @@ -36,12 +40,20 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) endif() + if(WITH_MKLDNN) + string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") + list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") @@ -62,15 +74,15 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) endif() # Define operators that don't need pybind here. 
- foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -101,7 +113,8 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -112,6 +125,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN + if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") + endif() + # pybind USE_OP if (${pybind_flag} EQUAL 0) file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") @@ -128,8 +146,8 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +add_subdirectory(detail) if(WITH_DISTRIBUTE) - add_subdirectory(detail) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(send_op DEPS ${DISTRIBUTE_DEPS}) @@ -170,26 +188,31 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) -op_library(create_reader_op DEPS reader) if (WITH_GPU) - op_library(conv_op DEPS vol2col depthwise_conv) + op_library(conv_op DEPS vol2col depthwise_conv im2col) else() - op_library(conv_op DEPS vol2col) + op_library(conv_op DEPS vol2col im2col) endif() -op_library(conv_transpose_op DEPS vol2col) +op_library(conv_transpose_op DEPS vol2col im2col) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) +op_library(concat_op DEPS concat) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + +add_subdirectory(reader) +foreach(src ${READER_LIBRARY}) + set(OP_LIBRARY ${src} ${OP_LIBRARY}) +endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bdce8f0a6f5f817d2010e8b4de28a7d1be4cc08b..0eedd8ee51ebfff6f553d8e19e97c3a45a95fa6a 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, ops::ConcatOpGrad, false) -REGISTER_OP_CPU_KERNEL(concat, - ops::ConcatKernel) -REGISTER_OP_CPU_KERNEL(concat_grad, - 
ops::ConcatGradKernel) +REGISTER_OP_CPU_KERNEL( + concat, ops::ConcatKernel) +REGISTER_OP_CPU_KERNEL( + concat_grad, + ops::ConcatGradKernel) diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 208a4481c6afe1b8f62e8f675c951c3349639f46..92c8ab6d9ff11ec6acd46a39877eb67d624748a9 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); + framework::Tensor* out = ctx.Output("Out"); int64_t axis = static_cast(ctx.Attr("axis")); auto place = ctx.GetPlace(); out->mutable_data(place); - auto out_stride = framework::stride_numel(out->dims()); - - size_t output_offset = 0; - - // If axis >=1, copy to out immediately need to call many times - // of cuda memcpy. Copy the input to cpu and do the stride copy, - // then copy to gpu output. - - if (platform::is_gpu_place(place) && axis >= 1) { - platform::CPUPlace copy_place; - auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place); - framework::Tensor cpu_out; - cpu_out.Resize(out->dims()); - cpu_out.mutable_data(copy_place); - auto& dev_ctx = ctx.device_context(); - std::vector> cpu_ins; - for (auto* in : ins) { - std::unique_ptr cpu_in(new framework::Tensor); - framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get()); - cpu_ins.emplace_back(std::move(cpu_in)); - } - // TODO(dzhwinter): overlap copy and compute stream - // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ - dev_ctx.Wait(); - - for (auto& in : cpu_ins) { - auto& cpu_in = *in.get(); - auto in_stride = framework::stride_numel(cpu_in.dims()); - - StridedNumelCopyWithAxis( - cpu_ctx, axis, cpu_out.data() + output_offset, out_stride, - cpu_in.data(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; - } - framework::TensorCopy(cpu_out, place, dev_ctx, out); - } else { + // Sometimes direct copies are faster; this heuristic may need deeper analysis. + if (axis == 0 && ins.size() < 10) { + size_t output_offset = 0; for (auto* in : ins) { auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(out->dims()); StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data() + output_offset, out_stride, in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; } + } else { + std::vector inputs(ins.size()); + for (size_t j = 0; j < ins.size(); ++j) { + inputs[j] = *ins[j]; + } + auto& dev_ctx = ctx.template device_context(); + paddle::operators::math::ConcatFunctor concat_functor; + concat_functor(dev_ctx, inputs, static_cast(axis), out); } } };
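A note on the axis == 0 fast path used in the kernel above and its gradient counterpart below: concatenation along the leading axis of row-major tensors degenerates to back-to-back contiguous copies, so plain strided copies are cheap. A worked example with hypothetical shapes:

// Concatenating [2, 3] and [4, 3] along axis 0 yields [6, 3]:
//   stride_numel({2, 3}) == {6, 3} and stride_numel({4, 3}) == {12, 3},
// so StridedNumelCopyWithAxis moves in_stride[0] elements per input --
// one flat copy of 6 elements, then one of 12, at output offsets 0 and 6.
// For axis >= 1 every output row interleaves all inputs, which is where a
// single batched math::ConcatFunctor kernel beats many small copies; the
// ins.size() < 10 cutoff is this patch's empirical threshold, not a hard rule.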
@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel { auto* in = ctx.Input(framework::GradVarName("Out")); auto outs = ctx.MultiOutput(framework::GradVarName("X")); int64_t axis = static_cast(ctx.Attr("axis")); - size_t input_offset = 0; - auto in_stride = framework::stride_numel(in->dims()); - for (auto& out : outs) { - out->mutable_data(ctx.GetPlace()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), - out_stride, in->data() + input_offset, - in_stride, out_stride[axis]); - input_offset += out_stride[axis]; + // Sometimes direct copies are faster; this heuristic may need deeper analysis. + if (axis == 0 && outs.size() < 10) { + size_t input_offset = 0; + auto in_stride = framework::stride_numel(in->dims()); + + for (auto& out : outs) { + out->mutable_data(ctx.GetPlace()); + auto out_stride = framework::stride_numel(out->dims()); + StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), + out_stride, in->data() + input_offset, + in_stride, out_stride[axis]); + input_offset += out_stride[axis]; + } + } else { + std::vector outputs(outs.size()); + for (size_t j = 0; j < outs.size(); ++j) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs[j] = *outs[j]; + } + + auto& dev_ctx = ctx.template device_context(); + paddle::operators::math::ConcatGradFunctor + concat_grad_functor; + concat_grad_functor(dev_ctx, *in, static_cast(axis), outputs); } } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d59cc2c9d424f067ca638cb76e52c2e95ae75182 --- /dev/null +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -0,0 +1,313 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
*/ + +#include "mkldnn.hpp" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNMemDesc; + +using mkldnn::memory; // Note: paddle has also "memory" namespace +using mkldnn::primitive; +using mkldnn::convolution_forward; +using mkldnn::convolution_backward_weights; +using mkldnn::convolution_backward_data; +using mkldnn::convolution_direct; +using mkldnn::prop_kind; +using mkldnn::padding_kind; +using mkldnn::stream; + +namespace { +std::unique_ptr +ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine); + +convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( + const memory::desc& src, const memory::desc& diff_weights, + const memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine); + +convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( + const memory::desc& diff_src, const memory::desc& weights, + const memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine); +} // anonymous namespace + +template +class ConvOpMkldnnKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + // Get an unique name from "argument" name of "Output" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + // TODO(pzelazko-intel) add support for group convolution and dilation + PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + // allocate memory for output + T* output_data = output->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. 
OIHW"); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + // memory descriptors for convolution src/weight/dst + auto conv_src_md = + MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); + auto conv_weights_md = + MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); + auto conv_dst_md = + MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw); + + // create memory primitives + auto conv_src_memory = + memory({conv_src_md, mkldnn_engine}, (void*)input_data); + auto conv_weights_memory = + memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); + auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data); + + std::unique_ptr conv_pd = + ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides, + paddings, mkldnn_engine); + + // save p_conv_pd into dev_ctx to be referred in backward path + auto p_conv_pd = conv_pd.get(); + std::shared_ptr conv_pd_value = std::move(conv_pd); + dev_ctx.SetBlob(key_conv_pd, conv_pd_value); + + // create convolution op primitive + auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory, + conv_weights_memory, conv_dst_memory); + + // push op to stream and wait MKLDNN until it's executed + std::vector pipeline{conv_prim}; + stream(stream::kind::eager).submit(pipeline).wait(); + } +}; + +template +class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("Input"); + const Tensor* filter = ctx.Input("Filter"); + const Tensor* output = ctx.Input("Output"); + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + + if (!input_grad && !filter_grad) return; + + // Get an unique name from "argument" name of "Output" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Input("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = nullptr; + T* filter_grad_data = nullptr; + + // allocate memory for gradient of input/filter + if (input_grad) { + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + } + if (filter_grad) { + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + } + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto conv_src_md = + MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); + auto conv_diff_src_md = + MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw); + auto 
conv_weights_md = + MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); + auto conv_diff_weights_md = + MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw); + auto conv_diff_dst_md = + MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw); + + // create memory + auto conv_diff_dst_memory = + memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data); + // Retrieve conv_pd from device context + std::shared_ptr conv_pd; + convolution_forward::primitive_desc* p_conv_pd; + + conv_pd = dev_ctx.GetBlob(key_conv_pd); + PADDLE_ENFORCE(conv_pd != nullptr, + "Fail to find conv_pd in device context"); + p_conv_pd = + static_cast(conv_pd.get()); + + // create backward conv primitive for weights + if (filter_grad) { + // create primitive descriptor + convolution_backward_weights::primitive_desc conv_bwd_weights_pd = + ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md, + conv_diff_dst_md, strides, paddings, + *p_conv_pd, mkldnn_engine); + + // create memory + auto conv_diff_weights_memory = memory( + {conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); + auto conv_src_memory = + memory({conv_src_md, mkldnn_engine}, (void*)input_data); + + // create backward conv primitive for weights + auto conv_bwd_weights_prim = convolution_backward_weights( + conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory, + conv_diff_weights_memory); + + // push primitive and execute it + std::vector pipeline{conv_bwd_weights_prim}; + stream(stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + // create primitive descriptor + convolution_backward_data::primitive_desc conv_bwd_data_pd = + ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md, + conv_diff_dst_md, strides, paddings, + *p_conv_pd, mkldnn_engine); + + // create memory + auto conv_diff_src_memory = + memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data); + auto conv_weights_memory = + memory({conv_weights_md, mkldnn_engine}, (void*)filter_data); + + // create backward conv primitive for data + auto conv_bwd_data_prim = + convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory, + conv_weights_memory, conv_diff_src_memory); + + // push primitive and execute it + std::vector pipeline{conv_bwd_data_prim}; + stream(stream::kind::eager).submit(pipeline).wait(); + } + } // Compute() +}; + +namespace { +std::unique_ptr ConvFwdPrimitiveDesc( + const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine) { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine); + + return std::unique_ptr( + p_conv_pd); +} + +convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc( + const memory::desc& src, const memory::desc& diff_weights, + const memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) { + auto conv_bwd_weights_desc = convolution_backward_weights::desc( + convolution_direct, src, diff_weights, diff_dst, strides, paddings, + paddings, 
padding_kind::zero); + return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc, + engine, conv_pd); +} + +convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( + const memory::desc& diff_src, const memory::desc& weights, + const memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) { + auto conv_bwd_data_desc = convolution_backward_data::desc( + convolution_direct, diff_src, weights, diff_dst, strides, paddings, + paddings, padding_kind::zero); + return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine, + conv_pd); +} +} // anonymous namespace +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvOpMkldnnKernel); + +REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvGradOpMkldnnKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 83b7708bf337b70f97c5e9126efd142b9b957b00..4b02b80d7772fa15d2333692551da5e59d93765f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -13,6 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", 
"(string, default NCHW) Only used in " @@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc deleted file mode 100644 index 17ed7e24eca9ae923ef71cca7d4c0005eda271f1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/create_reader_op.cc +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/reader.h" - -namespace paddle { -namespace operators { - -static std::vector RestoreShapes( - const std::vector& shape_concat, const std::vector& ranks) { - std::vector res; - int offset = 0; - for (int len : ranks) { - auto start_it = shape_concat.begin() + offset; - auto end_it = start_it + len; - res.push_back(framework::make_ddim(std::vector(start_it, end_it))); - offset += len; - } - return res; -} - -// general infershape for file readers -class CreateFileReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output file reader should not be null."); - const auto shape_concat = - ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - if (ctx->IsRuntime()) { - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ( - lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); - } - } -}; - -// general infershape for decorated readers -class CreateDecoratedReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), - "Input(UnderlyingReader) should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output decorated reader should not be null."); - ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); - - if (ctx->IsRuntime()) { - framework::VarDesc* in_reader = boost::get( - ctx->GetInputVarPtrs("UnderlyingReader")[0]); - framework::VarDesc* out_reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - out_reader->SetLoDLevels(in_reader->GetLoDLevels()); - } - } -}; - -// general var type inference for file readers -class CreateFileReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - reader->SetType(framework::proto::VarType::READER); - } -}; - -// general var type inference for decorated readers -class CreateDecoratedReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; - framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); - std::string out_reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); - out_reader->SetType(framework::proto::VarType::READER); - out_reader->SetDataTypes(in_reader->GetDataTypes()); - } -}; - -template -class CreateRandomDataGeneratorOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - 
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - int(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); - std::vector shapes = RestoreShapes(shape_concat, ranks); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), - Attr("max"))); - } -}; - -class CreateRandomDataGeneratorOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddOutput("Out", "(ReaderHolder) The created random reader."); - AddAttr>("shape_concat", - "The concat of all data's shapes."); - AddAttr>( - "ranks", - "The ranks of each data." - "e.g." - "shape_concat = [2,3,4,5,6]" - "ranks = [3,2]" - "It means the reader will generate two data each time," - "whose shapes are [2,3,4] and [5,6] respectively."); - AddAttr>("lod_levels", "The LoD levels of each data."); - AddAttr("min", "The lower bound of reader's uniform distribution."); - AddAttr("max", "The upper bound of reader's uniform distribution."); - AddComment(R"DOC( - CreateRandomDataGenerator Operator - - This Op creates a random reader. - The reader generates random data instead of really reading from files. - Generated data follow an uniform distribution between 'min' and 'max'. - )DOC"); - } -}; - -class CreateShuffleReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::ShuffleReader(underlying_reader.Get(), - Attr("buffer_size"))); - } -}; - -class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddInput( - "UnderlyingReader", - "(ReaderHolder) The underlying reader for creating a shuffle reader."); - AddOutput("Out", "(ReaderHolder) The created shuffle reader."); - AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); - AddComment(R"DOC( - CreateShuffleReader Operator - - A shuffle reader takes another reader as its 'underlying reader' - and yields the underlying reader's outputs in a shuffled order. 
- )DOC"); - } -}; - -class CreateBatchReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::BatchReader(underlying_reader.Get(), - Attr("batch_size"))); - } -}; - -class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddInput( - "UnderlyingReader", - "(ReaderHolder) The underlying reader for creating a batch reader."); - AddOutput("Out", "(ReaderHolder) The created batch reader."); - AddAttr("batch_size", - "How many instances the batch reader yields each time.") - .GreaterThan(0); - AddComment(R"DOC( - CreateBatchReader Operator - - A batch reader takes another reader as its 'underlying reader', - gathers the underlying reader's outputs and then yields them in batches. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(create_random_data_generator, - ops::CreateRandomDataGeneratorOp, - ops::CreateFileReaderInferShape, - ops::CreateRandomDataGeneratorOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateFileReaderInferVarType); -REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, - ops::CreateDecoratedReaderInferShape, - ops::CreateShuffleReaderOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateDecoratedReaderInferVarType); -REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, - ops::CreateDecoratedReaderInferShape, - ops::CreateBatchReaderOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateDecoratedReaderInferVarType); diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 571a75c9dcd903672d460f192bf28ddbeaea7c78..0581bd2ac55218a2955fcb260d8b61cac0d210b5 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1 +1,3 @@ -grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) +endif() diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 9b8ca925373eb99df3fcb14ec7962768dd2eab04..73c84c2fe0155d21d7059938330e44fa3668c6df 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { return framework::OpKernelType( framework::ToDataType( ctx.Input("DetectRes")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h index 6bcc577456b13f7930e07c24564953e93c5339ed..95649ac46e6bd41b9e1a865794cdec3ae1e6e247 100644 --- a/paddle/fluid/operators/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel { }; template -struct ElementwiseDivGradFunctor { - template - 
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto y_e = framework::EigenVector::Flatten(*y); - auto z_e = framework::EigenVector::Flatten(*z); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = -1.0 * dz_e * z_e / y_e; - } - } -}; - -template -struct ElementwiseDivBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } +struct DivGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; template -struct ElementwiseDivBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct DivGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -dout * x / (y * y); } }; @@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, DivGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } }; diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h index ab3a3d58275e8a48a92aa0c8a50af072e83ac968..527a18ee3ba88a158a13266a7fbcdafe59ec69d9 100644 --- a/paddle/fluid/operators/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel { }; template -struct ElementwiseMaxGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; - } +struct MaxGradDx { + 
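The refactor above collapses three Eigen-based functors per op (plain, broadcast, and broadcast2) into one pair of scalar functors; the shared ElemwiseGradCompute takes care of flattening and broadcasting. A minimal standalone sketch of the idea follows: DivGradDX/DivGradDY mirror the patch, while the driver loop is a simplified stand-in for ElemwiseGradCompute that handles only the same-shape case.

```cpp
#include <cassert>
#include <cstddef>

// Scalar gradient functors, as in the patch: for out = x / y,
// d(out)/dx = dout / y and d(out)/dy = -dout * x / (y * y).
struct DivGradDX {
  float operator()(float x, float y, float out, float dout) const {
    return dout / y;
  }
};
struct DivGradDY {
  float operator()(float x, float y, float out, float dout) const {
    return -dout * x / (y * y);
  }
};

// Simplified stand-in for ElemwiseGradCompute (same-shape case only):
// one generic loop serves every elementwise op once it is handed the
// two scalar functors.
template <typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradNoBroadcast(const T* x, const T* y, const T* out,
                             const T* dout, size_t n, DX_OP dx_op,
                             DY_OP dy_op, T* dx, T* dy) {
  for (size_t i = 0; i < n; ++i) {
    if (dx != nullptr) dx[i] = dx_op(x[i], y[i], out[i], dout[i]);
    if (dy != nullptr) dy[i] = dy_op(x[i], y[i], out[i], dout[i]);
  }
}

int main() {
  float x[2] = {6.f, 8.f}, y[2] = {2.f, 4.f};
  float out[2] = {3.f, 2.f}, dout[2] = {1.f, 1.f};
  float dx[2], dy[2];
  ElemwiseGradNoBroadcast(x, y, out, dout, 2, DivGradDX{}, DivGradDY{}, dx, dy);
  assert(dx[0] == 0.5f && dy[0] == -1.5f);
  return 0;
}
```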
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x > y); } }; template -struct ElementwiseMaxBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMaxBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x <= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MaxGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h index f0eec9d2468b519473f813d71127d670adb08250..d4e5831463f3e54c72789b6876ea696cf1b4ef4b 100644 --- a/paddle/fluid/operators/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel { }; template -struct ElementwiseMinGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; - } +struct MinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x < y); } }; template -struct ElementwiseMinBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = 
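MaxGradDx/MaxGradDy (and the Min counterparts below) pick a subgradient at ties: the strict comparison routes the gradient to x and the non-strict one to y, so exactly one branch fires per element and dx + dy == dout pointwise. A quick standalone check of that invariant, assuming a float specialization:

```cpp
#include <cassert>

// For out = max(x, y): '>' sends the gradient to x, '<=' to y, so a tie
// sends the whole gradient to y and the two parts always sum to dout.
float MaxGradDx(float x, float y, float dout) { return dout * (x > y); }
float MaxGradDy(float x, float y, float dout) { return dout * (x <= y); }

int main() {
  const float pairs[3][2] = {{1.f, 2.f}, {2.f, 1.f}, {3.f, 3.f}};  // incl. tie
  for (const auto& p : pairs) {
    const float dout = 0.7f;
    assert(MaxGradDx(p[0], p[1], dout) + MaxGradDy(p[0], p[1], dout) == dout);
  }
  return 0;
}
```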
framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMinBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x >= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MinGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index 46d69ed87d4c2eeae674730b00858b6358b97773..dc73cb6f23614504640283af01981d3f69e89126 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -40,78 +40,13 @@ class ElementwiseMulKernel : public framework::OpKernel { }; template -struct ElementwiseMulGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = x_e * dz_e; - } - } -}; - -template -struct ElementwiseMulBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } +struct MulGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const 
{ return dout * y; } }; template -struct ElementwiseMulBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - } +struct MulGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; template @@ -127,12 +62,9 @@ class ElementwiseMulGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMulBroadCastGradFunctor, - ElementwiseMulBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MulGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index ffda53a383ced411415e528886a23f28f6a62648..0b4238436ffcc586fe8bc7abbe4cfbc1654dcb88 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast { dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); } if (dy_ != nullptr) { - dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); } } diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index a8fc242ed79ecee77dc5bb56882a1a15f872fccf..fe088b8203722a43b9aba7be3878b8f4ca68ba12 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel { }; template -struct ElementwiseSubGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * dz_e; - } - } +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } }; template -struct ElementwiseSubBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseSubBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - 
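The one-character fix in elementwise_op_function.h below is the load-bearing part of this patch: ElemwiseGradNoBroadcast was calling dx_op_ for both outputs, so dY silently received dX's formula. A minimal reproduction of the difference, using the subtraction functors from this patch as the example:

```cpp
#include <cassert>

// Minimal reproduction of the bug fixed in elementwise_op_function.h:
// feeding dy from dx_op gives dY the wrong formula whenever the two
// functors differ (e.g. subtraction: dX = dout, dY = -dout).
float dx_op(float dout) { return dout; }   // plays the role of SubGradDX
float dy_op(float dout) { return -dout; }  // plays the role of SubGradDY

int main() {
  const float dout = 1.f;
  float dy_buggy = dx_op(dout);  // old code: dy_[i] = dx_op_(...)
  float dy_fixed = dy_op(dout);  // new code: dy_[i] = dy_op_(...)
  assert(dy_fixed == -1.f && dy_buggy != dy_fixed);
  return 0;
}
```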
- if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - } +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } }; template @@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, SubGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 768106fadf355ea6fb148491e232dc0ef1453a75..a181d802262d15b188060dae4330cec0e24714ab 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,46 +1,59 @@ add_subdirectory(detail) -if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) - nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) - nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) - nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) - nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) - nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) - nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) - nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) - nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function) - nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) - nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) - nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) - nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) - nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) - nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) - nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) -else() - cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) - cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) - cc_library(softmax SRCS softmax.cc DEPS device_context) - cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) - cc_library(pooling SRCS pooling.cc DEPS device_context) - cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) - cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) - cc_library(context_project SRCS 
context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
-    cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
-    cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
-    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
-    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
-    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
-endif()
+function(math_library TARGET)
+    # math_library is a function to create a math library.
+    # The interface is the same as cc_library's, but it handles
+    # splitting GPU/CPU code and links some common libraries.
+    set(cc_srcs)
+    set(cu_srcs)
+    set(math_common_deps device_context framework_proto)
+    set(multiValueArgs DEPS)
+    cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+        list(APPEND cc_srcs ${TARGET}.cc)
+    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+        list(APPEND cu_srcs ${TARGET}.cu)
+    endif()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    elseif(${cc_srcs_len} GREATER 0)
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    endif()
+endfunction()

-cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+# Please add new math_library targets in alphabetical order.
+math_library(concat)
+math_library(context_project DEPS im2col math_function)
+math_library(cross_entropy)
+math_library(cos_sim_functor)
+math_library(depthwise_conv)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(im2col)
+math_library(lstm_compute DEPS activation_functions)
+math_library(math_function DEPS cblas)
+math_library(maxouting)
+math_library(pooling)
+math_library(selected_rows_functor DEPS selected_rows)
+math_library(sequence2batch)
+math_library(sequence_padding)
+math_library(sequence_pooling DEPS math_function)
+math_library(sequence_scale)
+math_library(softmax)
+math_library(unpooling)
+math_library(vol2col)
+
+cc_test(math_function_test SRCS math_function_test.cc)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
-cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
-cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
+cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
+if(WITH_GPU)
+    nv_test(math_function_gpu_test SRCS math_function_test.cu)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
+endif()
+cc_test(concat_test SRCS concat_test.cc DEPS concat)
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b542143419e05e9baf29e9a2322447f32ddd9829
--- /dev/null
+++ b/paddle/fluid/operators/math/concat.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/concat.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + auto& cpu_place = boost::get(context.GetPlace()); + + // computation + for (int k = 0; k < out_rows; ++k) { + T* dst_ptr = output->data() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + const T* src_prt = input[j].data() + k * col_len; + memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt, + sizeof(T) * col_len); + col_idx += col_len; + } + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. 
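The CPU ConcatFunctor reduces N-D concatenation to a 2-D problem: everything before `axis` is flattened into rows, everything from `axis` on into per-input column widths, and each output row is assembled by copying one column block per input. A standalone sketch of that flattening, with std::memcpy in place of memory::Copy and plain vectors in place of tensors:

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Concatenate 'inputs' (each stored row-major as rows x cols[i]) along the
// column dimension: the 2-D core of the CPU ConcatFunctor.
std::vector<float> ConcatRows(const std::vector<std::vector<float>>& inputs,
                              const std::vector<int>& cols, int rows) {
  int out_cols = 0;
  for (int c : cols) out_cols += c;
  std::vector<float> out(static_cast<size_t>(rows) * out_cols);
  for (int k = 0; k < rows; ++k) {  // assemble one output row at a time
    int col_idx = 0;
    for (size_t j = 0; j < inputs.size(); ++j) {
      const float* src = inputs[j].data() + k * cols[j];
      std::memcpy(out.data() + k * out_cols + col_idx, src,
                  sizeof(float) * cols[j]);
      col_idx += cols[j];
    }
  }
  return out;
}

int main() {
  // Two inputs with 2 rows each, widths 2 and 1 -> output width 3.
  std::vector<std::vector<float>> in = {{1, 2, 3, 4}, {5, 6}};
  auto out = ConcatRows(in, {2, 1}, 2);
  const std::vector<float> expect = {1, 2, 5, 3, 4, 6};
  assert(out == expect);
  return 0;
}
```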
+ */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + + int input_rows = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + int input_cols = 0; + + std::vector output_cols(outputs.size()); + for (int i = 0; i < num; ++i) { + int t_cols = outputs[i].numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + auto& cpu_place = boost::get(context.GetPlace()); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = output_cols[j]; + T* dst_ptr = outputs[j].data() + k * col_len; + memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, + sizeof(T) * col_len); + col_idx += col_len; + } + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu new file mode 100644 index 0000000000000000000000000000000000000000..60b266f08fb2d4217c5933902d69de96fc2abe22 --- /dev/null +++ b/paddle/fluid/operators/math/concat.cu @@ -0,0 +1,280 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
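ConcatGradFunctor is the exact inverse: each input row is sliced back into the per-output column widths. For completeness, the reverse of the sketch above:

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Split 'input' (rows x sum(cols)) back into per-output column blocks:
// the 2-D core of ConcatGradFunctor.
std::vector<std::vector<float>> SplitRows(const std::vector<float>& input,
                                          const std::vector<int>& cols,
                                          int rows) {
  int in_cols = 0;
  for (int c : cols) in_cols += c;
  std::vector<std::vector<float>> outs(cols.size());
  for (size_t j = 0; j < cols.size(); ++j)
    outs[j].resize(static_cast<size_t>(rows) * cols[j]);
  for (int k = 0; k < rows; ++k) {
    int col_idx = 0;
    for (size_t j = 0; j < cols.size(); ++j) {
      std::memcpy(outs[j].data() + k * cols[j],
                  input.data() + k * in_cols + col_idx,
                  sizeof(float) * cols[j]);
      col_idx += cols[j];
    }
  }
  return outs;
}

int main() {
  std::vector<float> in = {1, 2, 5, 3, 4, 6};
  auto outs = SplitRows(in, {2, 1}, 2);
  assert((outs[0] == std::vector<float>{1, 2, 3, 4}));
  assert((outs[1] == std::vector<float>{5, 6}));
  return 0;
}
```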
*/ + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__device__ T upper_bound(const T* first, T count, T val) { + const T* orig = first; + const T* it = nullptr; + T step = 0; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (!(val < *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first - orig; +} + +template +__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(input_cols, col_size, tid_x) - 1; + + int curr_offset = input_cols[segment]; + int curr_segment = segment; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__global__ void KernelConcat(T** inputs, const int input_col, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* input_ptr = inputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * input_col + in_offset]; + } + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int* output_cols, + int col_size, T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(output_cols, col_size, tid_x) - 1; + int curr_offset = output_cols[segment]; + int curr_segment = segment; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input[tid_y * input_col + tid_x]; + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int output_cols, + T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* output_ptr = outputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * output_cols + in_offset] = + 
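The device-side upper_bound above is a branch-light binary search that returns the index of the first element strictly greater than val; KernelConcat subtracts one to map a global column index onto its input segment. A host-side twin showing the same lookup (equivalent to std::upper_bound on a raw array):

```cpp
#include <cassert>

// Host-side twin of the device upper_bound: index of the first element of
// first[0..count) that is strictly greater than 'val'.
template <typename T>
T UpperBound(const T* first, T count, T val) {
  const T* orig = first;
  while (count > 0) {
    T step = count / 2;
    const T* it = first + step;
    if (!(val < *it)) {  // *it <= val: the answer lies to the right
      first = it + 1;
      count -= step + 1;
    } else {             // *it > val: the answer is at or left of 'it'
      count = step;
    }
  }
  return first - orig;
}

int main() {
  // Prefix sums of segment widths, as built for KernelConcat: segments
  // cover columns [0,2), [2,5), [5,9). Column 4 belongs to segment 1.
  const int cols[4] = {0, 2, 5, 9};
  int segment = UpperBound(cols, 4, 4) - 1;
  assert(segment == 1);
  return 0;
}
```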
input[tid_y * input_col + tid_x]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int cols = input[0].numel() / rows; + int out_rows = rows, out_cols = 0; + + framework::Vector inputs_data(num * sizeof(T*) / 2); + framework::Vector inputs_cols(num + 1); + inputs_cols[0] = 0; + T** inputs_ptr = reinterpret_cast(inputs_data.data()); + + bool sameShape = true; + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + if (sameShape) { + if (t_cols != cols) sameShape = false; + } + out_cols += t_cols; + inputs_cols[i + 1] = out_cols; + inputs_ptr[i] = const_cast(input[i].data()); + } + + T** ins_gpu = + reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); + const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); + + // computation + // set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + KernelConcat<<>>( + ins_gpu, cols, out_rows, out_cols, output->data()); + } else { + KernelConcat<<>>( + ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, + out_cols, output->data()); + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + int input_row = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_row *= dim_0[i]; + } + + int output_col_0 = outputs[0].numel() / input_row; + int input_col = 0; + bool sameShape = true; + + framework::Vector outputs_data(num * sizeof(T*) / 2); + framework::Vector outputs_cols(num + 1); + outputs_cols[0] = 0; + T** outputs_ptr = reinterpret_cast(outputs_data.data()); + + for (int i = 0; i < num; ++i) { + int t_col = outputs[i].numel() / input_row; + if (sameShape) { + if (t_col != output_col_0) sameShape = false; + } + input_col += t_col; + outputs_cols[i + 1] = input_col; + outputs_ptr[i] = outputs[i].data(); + } + + T** outs_gpu = + reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); + const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); + + // computation + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. 
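The launch configuration caps the block at 1024 threads, rounds the column dimension up to a multiple of 32 (the warp size) when the data is narrow, and bounds the grid by the device's physical thread count. The same arithmetic extracted as a pure host function; kThreadsPerBlock and the rounding match the patch, while max_threads stands in for what GetMaxPhysicalThreadCount would return:

```cpp
#include <algorithm>
#include <cassert>

struct LaunchConfig {
  int block_x, block_y, grid_x, grid_y;
};

// Mirrors the block/grid sizing in the CUDA ConcatFunctor.
LaunchConfig MakeLaunchConfig(int out_cols, int out_rows, int max_threads) {
  const int kThreadsPerBlock = 1024;
  int block_cols = kThreadsPerBlock;
  if (out_cols < kThreadsPerBlock) {  // align narrow data to the warp size
    block_cols = ((out_cols + 31) >> 5) << 5;
  }
  int block_rows = kThreadsPerBlock / block_cols;
  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
  int grid_cols =
      std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
  int grid_rows =
      std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
  return {block_cols, block_rows, grid_cols, grid_rows};
}

int main() {
  // 100 columns -> block_cols rounded up to 128, so 8 rows per block.
  LaunchConfig cfg = MakeLaunchConfig(/*out_cols=*/100, /*out_rows=*/64,
                                      /*max_threads=*/65536);
  assert(cfg.block_x == 128 && cfg.block_y == 8);
  return 0;
}
```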
+ block_cols = ((input_col + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((input_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + KernelConcatGrad<<>>( + input.data(), input_row, input_col, output_col_0, outs_gpu); + } else { + KernelConcatGrad<<>>( + input.data(), input_row, input_col, outs_col_gpu, + static_cast(outputs_cols.size()), outs_gpu); + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..22147d79e4b1eeee76f7445dd963bf5062049a34 --- /dev/null +++ b/paddle/fluid/operators/math/concat.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ +template +class ConcatFunctor { + public: + void operator()(const DeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output); +}; + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +class ConcatGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const int axis, std::vector& outputs); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1741af8148bb90863f294ba4930006a58b5ddbf9 --- /dev/null +++ b/paddle/fluid/operators/math/concat_test.cc @@ -0,0 +1,336 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/concat.h" +#include +#include +#include "paddle/fluid/framework/tensor_util.h" + +using namespace paddle::framework; +using namespace paddle::platform; + +template +void testConcat() { + Tensor input_a_cpu; + Tensor input_b_cpu; + Tensor out_cpu; + Tensor input_a; + Tensor input_b; + Tensor out; + + DeviceContext* context = new DeviceContext(Place()); + // DeviceContext context(Place()); + + /** + * cast1: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [3, 3, 4] + * output: + * out.shape: [5, 3, 4] + */ + auto dim_a = make_ddim({2, 3, 4}); + auto dim_b = make_ddim({3, 3, 4}); + auto dim_out = make_ddim({5, 3, 4}); + + input_a.mutable_data(dim_a, Place()); + input_b.mutable_data(dim_b, Place()); + out.mutable_data(dim_out, Place()); + + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.mutable_data(dim_a, CPUPlace()); + input_b_cpu.mutable_data(dim_b, CPUPlace()); + out_cpu.mutable_data(dim_out, CPUPlace()); + } + + int* a_ptr; + int* b_ptr; + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data(); + b_ptr = input_b_cpu.data(); + } else { + a_ptr = input_a.data(); + b_ptr = input_b.data(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 3 * 3 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + std::vector input; + input.push_back(input_a); + input.push_back(input_b); + + paddle::operators::math::ConcatFunctor concat_functor; + concat_functor(*context, input, 0, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + int* out_ptr; + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data(); + } else { + out_ptr = out.data(); + } + + int cols = 2 * 3 * 4; + int idx_a = 0, idx_b = 0; + for (int j = 0; j < 5 * 3 * 4; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]); + ++idx_a; + } + } + // + /** + * cast2: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 4, 4] + * output: + * out.shape: [2, 7, 4] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 4, 4}); + dim_out = make_ddim({2, 7, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data(); + b_ptr = input_b_cpu.data(); + } else { + a_ptr = input_a.data(); + b_ptr = input_b.data(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 4 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + 
input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 1, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data(); + } else { + out_ptr = out.data(); + } + + cols = 3 * 4; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 28; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } + + /** + * cast3: + * inputs: + * t_a.shape: [2, 3, 5] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 3, 9] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 3, 5}); + dim_out = make_ddim({2, 3, 9}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data(); + b_ptr = input_b_cpu.data(); + } else { + a_ptr = input_a.data(); + b_ptr = input_b.data(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 3 * 5; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 2, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data(); + } else { + out_ptr = out.data(); + } + + // check the data + cols = 4; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 9; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } + + /** + * cast4: + * inputs: + * axis = 1 + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 6, 4] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 3, 4}); + dim_out = make_ddim({2, 6, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data(); + b_ptr = input_b_cpu.data(); + } else { + a_ptr = input_a.data(); + b_ptr = input_b.data(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 3 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 1, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, 
&out_cpu); + out_ptr = out_cpu.data(); + } else { + out_ptr = out.data(); + } + + // check the data + cols = 12; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 24; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } +} + +TEST(math, concat) { + testConcat(); +#ifdef PADDLE_WITH_CUDA + testConcat(); +#endif +} diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 41eab3ade207a592e1a72cd22580f18723167eff..f7f33917d7ef5bbcc7fb5d6e3d0a7f3ae63cde34 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -245,11 +245,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index 72bf2ab17016157f22bcf180090660346ed4f60d..8899abff360ea867872d3433722cdb37ef358500 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..06489f32d64d69030c084a038acb78ac2bac6200 --- /dev/null +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) +op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry) +op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry) +op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry) +set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE) diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bac043a5529d877dba79c03f07b9d43c9b71d7aa --- /dev/null +++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class BatchReader : public framework::DecoratedReader { + public: + BatchReader(ReaderBase* reader, int batch_size) + : DecoratedReader(reader), batch_size_(batch_size) { + buffer_.reserve(batch_size_); + } + + void ReadNext(std::vector* out) override; + + private: + int batch_size_; + std::vector> buffer_; +}; + +class CreateBatchReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset( + new BatchReader(underlying_reader.Get(), Attr("batch_size"))); + } +}; + +class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { + public: + CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : DecoratedReaderMakerBase(op_proto, op_checker) { + AddAttr("batch_size", + "How many instances the batch reader yields each time.") + .GreaterThan(0); + AddComment(R"DOC( + CreateBatchReader Operator + + A batch reader takes another reader as its 'underlying reader', + gathers the underlying reader's outputs and then yields them in batches. + )DOC"); + } +}; + +void BatchReader::ReadNext(std::vector* out) { + buffer_.clear(); + buffer_.reserve(batch_size_); + for (int i = 0; i < batch_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // Concat instances + out->clear(); + if (buffer_.empty()) { + // if buffer_ is empty, the 'out' will return as an empty vector. 
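BatchReader follows the same decorator pattern as ShuffleReader further down: it holds the underlying ReaderBase, pulls up to batch_size instances per ReadNext, and stops early once the source is exhausted. A standalone sketch of the pattern with a toy reader interface; the names mirror the framework but one "instance" is reduced to a single int:

```cpp
#include <cassert>
#include <memory>
#include <vector>

// Toy reader interface; an "instance" is one int here.
struct ReaderBase {
  virtual bool HasNext() const = 0;
  virtual int ReadNext() = 0;
  virtual ~ReaderBase() = default;
};

struct CountingReader : ReaderBase {  // yields 0, 1, ..., n-1
  int i = 0, n;
  explicit CountingReader(int n) : n(n) {}
  bool HasNext() const override { return i < n; }
  int ReadNext() override { return i++; }
};

// Decorator over an underlying reader, as BatchReader decorates any
// ReaderBase: gather up to batch_size instances, stop early at end of data.
struct BatchingDecorator {
  std::unique_ptr<ReaderBase> reader_;
  int batch_size_;
  BatchingDecorator(ReaderBase* r, int bs) : reader_(r), batch_size_(bs) {}
  std::vector<int> ReadNext() {
    std::vector<int> batch;
    batch.reserve(batch_size_);
    for (int i = 0; i < batch_size_ && reader_->HasNext(); ++i)
      batch.push_back(reader_->ReadNext());
    return batch;  // an empty batch signals exhaustion, as in the patch
  }
};

int main() {
  BatchingDecorator batches(new CountingReader(5), 2);
  assert(batches.ReadNext().size() == 2);  // {0,1}
  assert(batches.ReadNext().size() == 2);  // {2,3}
  assert(batches.ReadNext().size() == 1);  // {4}: short final batch
  assert(batches.ReadNext().empty());      // exhausted
  return 0;
}
```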
+ return; + } + int out_num = buffer_[0].size(); + out->reserve(out_num); + for (int j = 0; j < out_num; ++j) { + // Merge shape and check date type + std::type_index batch_type = buffer_[0][j].type(); + framework::DDim batch_shape = buffer_[0][j].dims(); + for (size_t i = 1; i < buffer_.size(); ++i) { + std::type_index ins_type = buffer_[i][j].type(); + framework::DDim ins_shape = buffer_[i][j].dims(); + PADDLE_ENFORCE_EQ(batch_type, ins_type); + PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), + slice_ddim(ins_shape, 1, ins_shape.size())); + PADDLE_ENFORCE_GT(ins_shape[0], 0); + batch_shape[0] += ins_shape[0]; + } + + framework::LoDTensor out_tensor; + out_tensor.Resize(batch_shape); + out_tensor.mutable_data(platform::CPUPlace(), batch_type); + int64_t dst_offset = 0; + + // Merge lod and data + framework::LoD batch_lod; + for (size_t i = 0; i < buffer_.size(); ++i) { + framework::DDim ins_shape = buffer_[i][j].dims(); + framework::LoD ins_lod = buffer_[i][j].lod(); + if (i == 0) { + batch_lod = ins_lod; + } else { + PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); + for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { + auto& lod_level = batch_lod[level_idx]; + for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { + lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); + } + } + } + auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); + TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst); + dst_offset += ins_shape[0]; + } + out_tensor.set_lod(batch_lod); + out->push_back(out_tensor); + } +} + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::reader; +REGISTER_DECORATED_READER_OPERATOR(create_batch_reader, + ops::CreateBatchReaderOp, + ops::CreateBatchReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f77ab8ab196dae4cf9351cee9bc5566ec2c04e4b --- /dev/null +++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
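The LoD merge in BatchReader::ReadNext above appends each instance's level offsets shifted by the running end of the batch (lod_level.back()). The same bookkeeping isolated with plain vectors, where an LoD level is a list of cumulative offsets starting at 0:

```cpp
#include <cassert>
#include <vector>

// Append 'ins' (one instance's LoD level, cumulative offsets starting at 0)
// onto 'batch', shifting by the current end, as in BatchReader::ReadNext.
void MergeLoDLevel(std::vector<size_t>* batch,
                   const std::vector<size_t>& ins) {
  for (size_t k = 1; k < ins.size(); ++k)
    batch->push_back(ins[k] + batch->back());
}

int main() {
  std::vector<size_t> batch = {0, 2, 5};  // two sequences: lengths 2 and 3
  MergeLoDLevel(&batch, {0, 1, 4});       // two more: lengths 1 and 3
  const std::vector<size_t> expect = {0, 2, 5, 6, 9};
  assert(batch == expect);
  return 0;
}
```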
+ +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +template +class RandomDataGenerator : public framework::FileReader { + public: + RandomDataGenerator(const std::vector& shapes, float min, + float max) + : FileReader(shapes), min_(min), max_(max) { + PADDLE_ENFORCE_LE( + min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); + unsigned int seed = std::random_device()(); + engine_.seed(seed); + dist_ = std::uniform_real_distribution(min_, max_); + } + + void ReadNext(std::vector* out) override { + out->clear(); + out->reserve(shapes_.size()); + for (const framework::DDim& shape : shapes_) { + PADDLE_ENFORCE_GE( + shape.size(), 2, + "The rank of reader's output data should be 2 at least.(Now it's %d)", + shape.size()); + framework::LoDTensor out_tensor; + out_tensor.Resize(shape); + T* data = out_tensor.mutable_data(platform::CPUPlace()); + int64_t numel = framework::product(shape); + for (int64_t i = 0; i < numel; ++i) { + data[i] = dist_(engine_); + } + out->push_back(out_tensor); + } + } + + bool HasNext() const override { return true; } + + void ReInit() override { return; } + + private: + float min_; + float max_; + std::minstd_rand engine_; + std::uniform_real_distribution dist_; +}; + +template +class CreateRandomDataGeneratorOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& shape_concat = Attr>("shape_concat"); + const auto& ranks = Attr>("ranks"); + PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); + PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), + int(shape_concat.size()), + "The accumulate of all ranks should be equal to the " + "shape concat's length."); + std::vector shapes = RestoreShapes(shape_concat, ranks); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset(new RandomDataGenerator(shapes, Attr("min"), + Attr("max"))); + } +}; + +class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase { + public: + CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : FileReaderMakerBase(op_proto, op_checker) { + AddAttr("min", "The lower bound of reader's uniform distribution."); + AddAttr("max", "The upper bound of reader's uniform distribution."); + AddComment(R"DOC( + CreateRandomDataGenerator Operator + + This Op creates a random reader. + The reader generates random data instead of really reading from files. + Generated data follow an uniform distribution between 'min' and 'max'. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::reader; +REGISTER_FILE_READER_OPERATOR(create_random_data_generator, + ops::CreateRandomDataGeneratorOp, + ops::CreateRandomDataGeneratorOpMaker); diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e8b463efc99e4a962e5ae14ab133cf634548756 --- /dev/null +++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class ShuffleReader : public framework::DecoratedReader { + public: + ShuffleReader(ReaderBase* reader, int buffer_size) + : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { + buffer_.reserve(buffer_size); + } + + void ReadNext(std::vector* out) override; + + private: + int buffer_size_; + std::vector> buffer_; + size_t iteration_pos_; +}; + +void ShuffleReader::ReadNext(std::vector* out) { + if (iteration_pos_ >= buffer_.size()) { + // Reload buffer with new data + buffer_.clear(); + buffer_.reserve(buffer_size_); + for (int i = 0; i < buffer_size_; ++i) { + if (reader_->HasNext()) { + buffer_.push_back(std::vector()); + reader_->ReadNext(&buffer_.back()); + } else { + break; + } + } + // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be + // optimize. + std::random_shuffle(buffer_.begin(), buffer_.end()); + iteration_pos_ = 0; + } + out->clear(); + if (!buffer_.empty()) { + std::swap(*out, buffer_[iteration_pos_++]); + } + // if buffer_ is empty, the 'out' will return as an empty vector. +} + +class CreateShuffleReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) + ->Get(); + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + out->Reset( + new ShuffleReader(underlying_reader.Get(), Attr("buffer_size"))); + } +}; + +class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase { + public: + CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) + : DecoratedReaderMakerBase(op_proto, op_checker) { + AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); + AddComment(R"DOC( + CreateShuffleReader Operator + + A shuffle reader takes another reader as its 'underlying reader' + and yields the underlying reader's outputs in a shuffled order. + )DOC"); + } +}; +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::reader; +REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader, + ops::CreateShuffleReaderOp, + ops::CreateShuffleReaderOpMaker); diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ea4f4b8d9feecac5bc2d0338bbbe9ab7a532040 --- /dev/null +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
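ShuffleReader trades memory for randomness: it refills a fixed-size buffer from the underlying reader, shuffles it once, then serves instances by index until the buffer is drained. A standalone sketch, using std::shuffle with an explicit engine in place of std::random_shuffle (which the TODO flags and which is deprecated since C++14):

```cpp
#include <algorithm>
#include <cassert>
#include <numeric>
#include <random>
#include <vector>

// Buffered shuffling as in ShuffleReader::ReadNext: reload the buffer when
// drained, shuffle it, then hand out one element per call via pos_.
struct ShuffleBuffer {
  std::vector<int> buffer_;
  size_t pos_ = 0;
  std::mt19937 engine_{std::random_device{}()};

  int Next(std::vector<int>* source, size_t buffer_size) {
    if (pos_ >= buffer_.size()) {  // buffer drained: reload and reshuffle
      buffer_.clear();
      while (buffer_.size() < buffer_size && !source->empty()) {
        buffer_.push_back(source->back());
        source->pop_back();
      }
      std::shuffle(buffer_.begin(), buffer_.end(), engine_);
      pos_ = 0;
    }
    return buffer_[pos_++];
  }
};

int main() {
  std::vector<int> source(10);
  std::iota(source.begin(), source.end(), 0);
  ShuffleBuffer reader;
  std::vector<int> seen;
  for (int i = 0; i < 10; ++i) seen.push_back(reader.Next(&source, 4));
  std::sort(seen.begin(), seen.end());
  for (int i = 0; i < 10; ++i) assert(seen[i] == i);  // a permutation of 0..9
  return 0;
}
```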
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ea4f4b8d9feecac5bc2d0338bbbe9ab7a532040
--- /dev/null
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
+  std::vector<framework::DDim> res;
+  int offset = 0;
+  for (int len : ranks) {
+    auto start_it = shape_concat.begin() + offset;
+    auto end_it = start_it + len;
+    res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
+    offset += len;
+  }
+  return res;
+}
+
+FileReaderMakerBase::FileReaderMakerBase(
+    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
+    framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  AddOutput("Out", "(ReaderHolder) The created file reader.");
+  AddAttr<std::vector<int>>("shape_concat",
+                            "The concat of all data's shapes.");
+  AddAttr<std::vector<int>>(
+      "ranks",
+      "The ranks of each data item, "
+      "e.g. shape_concat = [2,3,4,5,6] and ranks = [3,2] "
+      "mean the reader yields two data items each time, "
+      "whose shapes are [2,3,4] and [5,6] respectively.");
+  AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+}
+
+void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "The output file reader should not be null.");
+  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
+  const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
+  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+  ctx->SetReaderDims("Out", shapes);
+
+  if (ctx->IsRuntime()) {
+    const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
+    PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
+                      "The number of 'lod_levels'(%d) doesn't match the number "
+                      "of 'shapes'(%d).",
+                      lod_levels.size(), shapes.size());
+    framework::VarDesc* reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    reader->SetLoDLevels(lod_levels);
+  }
+}
+
+void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
+                                        framework::BlockDesc* block) const {
+  std::string reader_name = op_desc.Output("Out")[0];
+  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+  reader->SetType(framework::proto::VarType::READER);
+}
+
+void DecoratedReaderInferShape::operator()(
+    framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
+                 "Input(UnderlyingReader) should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "The output decorated reader should not be null.");
+  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
+
+  if (ctx->IsRuntime()) {
+    framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
+        ctx->GetInputVarPtrs("UnderlyingReader")[0]);
+    framework::VarDesc* out_reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    out_reader->SetLoDLevels(in_reader->GetLoDLevels());
+  }
+}
+
+void DecoratedReaderInferVarType::operator()(
+    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
+  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
+  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
+  std::string out_reader_name = op_desc.Output("Out")[0];
+  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
+  out_reader->SetType(framework::proto::VarType::READER);
+  out_reader->SetDataTypes(in_reader->GetDataTypes());
+}
+
+DecoratedReaderMakerBase::DecoratedReaderMakerBase(
+    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
+    framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  AddInput("UnderlyingReader",
+           "(ReaderHolder) The underlying reader to be decorated.");
+  AddOutput("Out", "(ReaderHolder) The created decorated reader.");
+}
+
+}  // namespace reader
+
+}  // namespace operators
+}  // namespace paddle
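To make the 'shape_concat'/'ranks' encoding concrete: RestoreShapes slices the flattened vector into consecutive runs whose lengths are given by 'ranks'. The same logic on plain vectors, so it runs without the framework headers (names are illustrative):

    #include <vector>

    std::vector<std::vector<int>> SplitShapes(const std::vector<int>& concat,
                                              const std::vector<int>& ranks) {
      std::vector<std::vector<int>> res;
      int offset = 0;
      for (int len : ranks) {
        res.emplace_back(concat.begin() + offset,
                         concat.begin() + offset + len);
        offset += len;
      }
      return res;
    }

    // SplitShapes({2, 3, 4, 5, 6}, {3, 2}) yields {{2, 3, 4}, {5, 6}},
    // matching the example in the 'ranks' attribute documentation.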
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1f0498f4692247cda72fbcbdd5070ddfaa11553
--- /dev/null
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+extern std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int>& shape_concat, const std::vector<int>& ranks);
+
+class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+
+class FileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+
+class FileReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+
+// general infershape for decorated reader
+class DecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+
+// general var type inference for decorated reader
+class DecoratedReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+
+class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                   \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                             \
+                    paddle::operators::reader::FileReaderInferShape,  \
+                    paddle::framework::EmptyGradOpMaker,              \
+                    paddle::operators::reader::FileReaderInferVarType)
+
+#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                   \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                  \
+                    paddle::operators::reader::DecoratedReaderInferShape,  \
+                    paddle::framework::EmptyGradOpMaker,                   \
+                    paddle::operators::reader::DecoratedReaderInferVarType)
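With these macros, wiring up a new reader op reduces to its op and maker classes plus one registration line. A hypothetical decorated reader (the batch-reader names below are invented for illustration and are not part of this patch) would register as:

    namespace ops = paddle::operators::reader;
    REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
                                       ops::CreateBatchReaderOp,
                                       ops::CreateBatchReaderOpMaker);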
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 3580932356fd5f29d5e4d00a70e64c207c64e41e..832509641cc3d5178ff090e05437484d395bfe51 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>("shape",
                               "(vector<int>) "
                               "Target shape of reshape operator.");
+    AddAttr<bool>("inplace",
+                  "Change the source tensor's shape without copying memory.")
+        .SetDefault(true);
     AddComment(R"DOC(
 Reshape Operator.
 
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 1357bce4b7e7dcde616ec15df6617db5573df1d3..eacb0a0cf21a60ffbdef5787434859ac549388bc 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* in = ctx.Input<framework::Tensor>("X");
+    bool inplace = ctx.Attr<bool>("inplace");
     auto out_dims = out->dims();
-    out->mutable_data<T>(ctx.GetPlace());
-    framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-    out->Resize(out_dims);
+    if (!inplace) {
+      out->mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
   }
 };
 
@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     d_x->mutable_data<T>(ctx.GetPlace());
+    bool inplace = ctx.Attr<bool>("inplace");
     auto in_dims = d_x->dims();
-    framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-    d_x->Resize(in_dims);
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
   }
 };
 }  // namespace operators
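The new 'inplace' path trades a copy for aliasing: after ShareDataWith, the output and input share one buffer and only the shape metadata differs. A minimal sketch of that behaviour, with illustrative names and assuming the usual Tensor API:

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/place.h"

    void InplaceReshapeSketch() {
      namespace fw = paddle::framework;
      fw::Tensor x, out;
      x.Resize(fw::make_ddim({2, 3}));
      x.mutable_data<float>(paddle::platform::CPUPlace());
      out.ShareDataWith(x);               // inplace path: no allocation, no copy
      out.Resize(fw::make_ddim({3, 2}));  // only the shape metadata changes
      // 'out' and 'x' now share storage, so writes through one are visible
      // through the other; that aliasing is the trade-off behind 'inplace'.
    }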
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 48c967de1155a705b76f0285a4baed52d7ce1f58..1842ecd745e3f5cb75600ce00d89018f81682632 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <cudnn.h>
+
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
 
+inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (use_cudnn) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  return use_cudnn;
+}
+
 }  // namespace platform
 }  // namespace paddle
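CanCUDNNBeUsed is presumably meant to be consulted during kernel selection. A hedged sketch of how an operator's GetExpectedKernelType override might use it (the surrounding op class, the "Input" name, and the exact OpKernelType constructor are assumptions to verify against the framework):

    // Inside a hypothetical operator class:
    framework::OpKernelType GetExpectedKernelType(
        const framework::ExecutionContext& ctx) const override {
      auto library = framework::LibraryType::kPlain;
      if (platform::CanCUDNNBeUsed(ctx)) {
        library = framework::LibraryType::kCUDNN;
      }
      return framework::OpKernelType(
          framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
          ctx.GetPlace(), framework::DataLayout::kAnyLayout, library);
    }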
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 7da6e04d0a8b81bcb5fb6b105ebdd5b908cf8f1d..bb9fbd468f38fffc94107e321e777fc0e772fbe6 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
   PADDLE_ENFORCE_GT(places.size(), 0);
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
+#ifdef PADDLE_WITH_MKLDNN
+      device_contexts_.emplace(places[i],
+                               new platform::MKLDNNDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+#else
       device_contexts_.emplace(places[i],
                                new platform::CPUDeviceContext(
                                    boost::get<platform::CPUPlace>(places[i])));
+#endif
     } else if (platform::is_gpu_place(places[i])) {
 #ifdef PADDLE_WITH_CUDA
       device_contexts_.emplace(places[i],
@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   SetDeviceId(place_.device);
+  multi_process = GetCUDAMultiProcessors(place_.device);
+  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaGetLastError());
 }
 
+int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
+  return multi_process * max_threads_per_mp;
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), ready_(false) {
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
+  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
 }
 
-template <typename T>
-void MKLDNNDeviceContext::AddElement(const std::string& op_key,
-                                     const T& value) {
-  if (GetElement<T>(op_key)) {
-    return;
-  }
-  GetElementPool<T>().emplace(op_key, std::move(value));
-}
+void MKLDNNDeviceContext::SetBlob(const std::string& name,
+                                  std::shared_ptr<void> data) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
 
-template <typename T>
-const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
-  auto it = GetElementPool<T>().find(op_key);
-  return it == GetElementPool<T>().end() ? nullptr : it->second;
-}
+  auto it = p->find(name);
 
-template <>
-const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
-  return memory_pool_;
-}
+  if (it == p->end()) {
+    (*p)[name] = data;  // create new blob
+  } else {
+    it->second = data;  // set data to existing blob
+  }
 
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
-  return primitive_pool_;
+  return;
 }
 
-template <>
-const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                         std::hash<std::string>>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
-  return primitive_desc_pool_;
-}
+std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
+    const std::string& name) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
 
-void MKLDNNDeviceContext::Execute(bool block) {
-  if (pipeline_.empty()) {
-    return;
-  }
-  ResetStream();
-  stream_->submit(pipeline_).wait(block);
-  ready_ = false;
-  pipeline_.clear();
-}
+  auto it = p->find(name);
 
-void MKLDNNDeviceContext::ResetStream() {
-  if (ready_) {
-    return;
+  if (it != p->end()) {
+    return it->second;
   }
-  // TODO(TJ): change me when mkldnn have specific method to reset this state
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  ready_ = true;
+
+  return nullptr;
 }
 
 #endif
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a294ba5101528c9ac0007bdcfc5255a0c2674aad..e779644190de1246cd650fbf91eeaeb03494643f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <mkldnn.hpp>
 #endif
 
 #include "paddle/fluid/platform/enforce.h"
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
+  /*! \brief  Return the max physical thread count in the device context */
+  int GetMaxPhysicalThreadCount() const;
+
   /*! \brief  Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
+
+  int multi_process;
+  int max_threads_per_mp;
 };
 
 template <>
@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);
 
-  /* \brief  Add new element: memory, primitive or primitive desc */
-  template <typename T>
-  void AddElement(const std::string& op_key, const T& value);
-
-  /* \brief  Get existed element: memory, primitive or primitive desc */
-  template <typename T>
-  const T& GetElement(const std::string& op_key) const;
-
-  /* \brief  Get element pool: memory, primitive or primitive desc pool */
-  template <typename T>
-  const std::unordered_map<const std::string, const T,
-                           std::hash<std::string>>&
-  GetElementPool() const;
-
   /* \brief  Get the active engine */
-  const MKLDNNEngine& engine() const { return *engine_; }
-
-  /* \brief  Submit primitive to pipeline */
-  void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
+  const mkldnn::engine& GetEngine() const { return engine_; }
 
-  /*! \brief  Execute all submitted primitives in pipeline */
-  void Execute(bool block = true);
+  // Set data to a blob (i.e. name/data pair). Create the blob if it does
+  // not exist.
+  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
 
- protected:
-  /*! \brief  Reset the stream to prepare next exectue */
-  void ResetStream();
+  // Find a saved blob. Return nullptr if not found.
+  std::shared_ptr<void> GetBlob(const std::string& name) const;
 
  private:
-  std::unordered_map<const std::string, const MKLDNNMemoryPtr,
-                     std::hash<std::string>>
-      memory_pool_;
-  std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
-                     std::hash<std::string>>
-      primitive_pool_;
-  std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
-                     std::hash<std::string>>
-      primitive_desc_pool_;
-  std::vector<MKLDNNPrimitive> pipeline_;
-  MKLDNNStreamPtr stream_;
-  MKLDNNEnginePtr engine_;
-  bool ready_;
+  mkldnn::engine engine_;
+  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
+      p_blobs_;
 };
 
 #endif
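The blob map replaces the typed element pools with a single string-keyed cache of shared_ptr<void>, leaving ownership and type recovery to the caller. A sketch of the intended round trip (the key and payload below are invented for illustration):

    #include <memory>
    #include <string>
    #include <vector>

    #include "paddle/fluid/platform/device_context.h"

    void BlobCacheSketch(const paddle::platform::MKLDNNDeviceContext& dev_ctx) {
      auto payload = std::make_shared<std::vector<float>>(16, 0.0f);
      dev_ctx.SetBlob("my_op_workspace", payload);  // create or overwrite
      auto found = dev_ctx.GetBlob("my_op_workspace");
      // GetBlob returns shared_ptr<void>; the caller restores the type.
      auto* vec = static_cast<std::vector<float>*>(found.get());
      (void)vec;  // a nullptr result would mean the key was never set
    }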
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 5832bd9ce303d48496e8c15324fd7e4a39484eb4..52fb8c2531357ad7a2b2f8613e5c7fbcef52c6bb 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -20,10 +20,6 @@ limitations under the License. */
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
 
-#include "unsupported/Eigen/CXX11/Tensor"
-
-#include "paddle/fluid/platform/hostdevice.h"
-
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
 #else
@@ -64,6 +60,18 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+// Forward declare float16 for eigen.h
+struct float16;
+
+}  // namespace platform
+}  // namespace paddle
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace platform {
+
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 }
 #endif
 
+HOSTDEVICE inline bool(isnan)(const float16& a) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(half(a));
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return !((isnan)(a)) && !((isinf)(a));
+}
+
 }  // namespace platform
 }  // namespace paddle
 
@@ -750,3 +774,27 @@ struct is_pod {
 };
 
 }  // namespace std
+
+namespace Eigen {
+namespace numext {
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+}  // namespace numext
+}  // namespace Eigen
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 05e1eae853e20b3fd86438c03f52628179a311ca..da4041bad0d82fe1c8c7a12fd0c7177e6dbddef3 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
   return count;
 }
 
+int GetCUDAMultiProcessors(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    "id must be less than GPU count");
+  int count;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMultiProcessors");
+  return count;
+}
+
+int GetCUDAMaxThreadsPerMultiProcessor(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    "id must be less than GPU count");
+  int count;
+  PADDLE_ENFORCE(cudaDeviceGetAttribute(
+                     &count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
+                 "cudaDeviceGetAttribute failed in "
+                 "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
+  return count;
+}
+
 int GetCurrentDeviceId() {
   int device_id;
   PADDLE_ENFORCE(
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 3d4883d8078daa2b55d8ea792b47e93e4f4feec8..c38ccf0f2ade1d2405177b541b33fd84283726ff 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
 
+//! Get the number of multiprocessors of the ith GPU.
+int GetCUDAMultiProcessors(int i);
+
+//! Get the max number of threads per multiprocessor of the ith GPU.
+int GetCUDAMaxThreadsPerMultiProcessor(int i);
+
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
 
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 6d71f352c6eda12e2bc032ecbc26cfefe320f703..90b78142b845e7e12c0c7dfb391f6aa3bd848436 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,12 +16,15 @@ limitations under the License. */
 
 #include <mkldnn.hpp>
 
+#include "paddle/fluid/framework/operator.h"
+
 namespace paddle {
 namespace platform {
 
 using MKLDNNStream = mkldnn::stream;
 using MKLDNNEngine = mkldnn::engine;
 using MKLDNNMemory = mkldnn::memory;
+using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
 using MKLDNNPrimitive = mkldnn::primitive;
 using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
 
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
 typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
 typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
 
+inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
+                                          mkldnn::memory::data_type data_type,
+                                          mkldnn::memory::format format) {
+  mkldnn::memory::dims tz = dims;
+  return mkldnn::memory::desc({tz}, data_type, format);
+}
+
+inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
+  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 094f9224f719f6656fd0c44aa4a620730432ccac..28ef3e04b1c50e0d42eeb27608259c6449429da5 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
   }
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
-    // Generate some dummy evenets first to reduce the startup overhead.
+    // Generate some dummy events first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
         DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 06319fc638984f8f8ed897c362f516e1534bc8db..6be2bd8fad9e33cf4e1dcafdd6b8f39111bdbe88 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() {
     if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
         cat <