diff --git a/Dockerfile b/Dockerfile index 164fe84904947bfc3cf71132b5fba04744460b26..ea39efd00bb5c0a7deb3f6d57083d83a673b883c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,7 +70,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip install --upgrade pip==9.0.3 && \ +RUN easy_install -U pip && \ pip install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md index 8266dec3c6125a09b90ac0ccd4aa5464f5c7db31..198a05a79e19227e90eaafe116217a164cd51a7d 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_cn.md +++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md @@ -1,3 +1,5 @@ +# CPU性能调优 + 此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 @@ -8,7 +10,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大 * Python 与 C++ 混合代码的性能分析 -# Python代码的性能分析 +## Python代码的性能分析 ### 生成性能分析文件 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md index e95556dd608b7ff0a3eb18873df0015a2da94e7c..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -1,3 +1,5 @@ +# Tune CPU performance + This tutorial introduces techniques we use to profile and tune the CPU performance of PaddlePaddle. We will use Python packages `cProfile` and `yep`, and Google's `perftools`. @@ -14,7 +16,7 @@ the profiling and tuning of 1. the Python code and 1. the mixture of Python and C++ code. 
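(For context: the two CPU-profiling guides retitled above teach exactly this workflow. Below is a minimal sketch using only the Python standard library; `main()` is a placeholder workload, not part of this patch.)

```python
import cProfile
import pstats


def main():
    # Placeholder workload; in the guides this is a PaddlePaddle training loop.
    return sum(i * i for i in range(10 ** 6))


# Step 1: generate the performance profiling file the guides describe.
cProfile.run('main()', filename='profile.out')

# Step 2: find hotspots, sorted by cumulative time (top 10 entries).
stats = pstats.Stats('profile.out')
stats.sort_stats('cumulative').print_stats(10)
```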
-# Profiling the Python Code +## Profiling the Python Code ### Generate the Performance Profiling File diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index b3d882743785e8ee301b71b696230531d2b7ba58..aa1dc6ee2cc9a3a528e54ce2da07746158735f56 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -37,12 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具 :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" .. 
_pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index 1e409d86b9775094998f72f92954f4bbc1013ea1..a70821eb487be841060e6b5f7fc8b014634ac5ba 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -40,12 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star :header: "version", "cp27-cp27mu", "cp27-cp27m" :widths: 1, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `_" .. 
_pip_dependency: diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index c0523f3c795b103c0c27081ec5dc717f6a0f11e0..5a57ec20585c26dbcd4251464718fc819148a7a5 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -91,6 +91,12 @@ void TransDataType(const OpKernelType& kernel_type_for_var, case proto::VarType::BOOL: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; + case proto::VarType::INT16: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::UINT8: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; default: PADDLE_THROW("Not support type %d", src_type); } diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 45bad58145a1144dfabdd3e15b38d972d57b105e..6b0c0a6b9fb29e641449f0c21109611cccd4e5a9 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -98,7 +98,7 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op, return false; }; - if (op.Type() == "split") { + if (op.Type() == "split" || op.Type() == "split_byref") { return checker(op.OutputArgumentNames(), send_op->InputArgumentNames()); } else if (op.Type() == "concat") { return checker(op.InputArgumentNames(), send_op->OutputArgumentNames()); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index e33eb7f170ea04b998283b24d9210a9bb7538675..cc1589514aab3b973b4909159748bc4223cdce46 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -149,7 +149,7 @@ void TestInference(const std::string& dirname, state = paddle::platform::ProfilerState::kCPU; } else { #ifdef PADDLE_WITH_CUDA - state = paddle::platform::ProfilerState::kCUDA; + state = paddle::platform::ProfilerState::kAll; // The default device_id of paddle::platform::CUDAPlace is 0. // Users can get the device_id using: // int device_id = place.GetDeviceId(); @@ -172,7 +172,7 @@ void TestInference(const std::string& dirname, } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, - "load_program_profiler.txt"); + "load_program_profiler"); paddle::platform::ResetProfiler(); // 3. 
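(The `TransDataType` hunk above adds INT16 and UINT8 branches; the op is an elementwise cast that preserves shape. A NumPy analogue of the new `uint8` path, for illustration only — this is not the framework's code path.)

```python
import numpy as np

# What the new UINT8 branch of TransDataType computes, in NumPy terms:
# an elementwise cast that keeps the shape and changes only the dtype.
src = np.array([[0, 127, 255]], dtype=np.uint8)
dst = src.astype(np.float32)

assert dst.shape == src.shape
assert dst.dtype == np.float32
print(dst)  # [[   0.  127.  255.]]
```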
Get the feed_target_names and fetch_target_names @@ -236,8 +236,7 @@ void TestInference(const std::string& dirname, // Disable the profiler and print the timing information paddle::platform::DisableProfiler( - paddle::platform::EventSortingKey::kDefault, - "run_inference_profiler.txt"); + paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); paddle::platform::ResetProfiler(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ac1f3f44ae8703c3e0c792bd9a2e658f1341ec15..7fce138e3f47e0eb485afb4d5a665eb41f68e286 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -204,6 +204,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor) if(WITH_GPU) + set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h index 9b51db8a45186c2a90cf8b2eb7966d0aaea04028..46bc4f6f936929050276e8b3b93f1eddd62ac638 100644 --- a/paddle/fluid/operators/beam_search_op.h +++ b/paddle/fluid/operators/beam_search_op.h @@ -14,10 +14,6 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_TESTING -#include "gtest/gtest.h" -#endif - #include #include #include "paddle/fluid/framework/lod_tensor.h" diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index d3f3ad92442cafdd8d4cdc396d89721863d069c2..29b73951bbddd9bfd73c932d7801797590de5e8e 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,45 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/is_empty_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { -constexpr char kInput[] = "X"; -constexpr char kOutput[] = "Out"; - -class IsEmptyOp : public framework::OperatorBase { +class IsEmptyOp : public framework::OperatorWithKernel { public: - IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} + using framework::OperatorWithKernel::OperatorWithKernel; - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get input - auto *var = scope.FindVar(Input(kInput)); - PADDLE_ENFORCE_NOT_NULL(var); - auto &tensor = var->Get(); - // get output - auto *out = scope.FindVar(Output(kOutput)); - PADDLE_ENFORCE_NOT_NULL(out); - auto *out_tensor = out->GetMutable(); + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IsEmptyOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of IsEmptyOp should not be null."); + ctx->SetOutputDim("Out", {1}); + } - out_tensor->Resize({1}); - out_tensor->mutable_data(platform::CPUPlace())[0] = - framework::product(tensor.dims()) == 0; + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + return kt; } }; -class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { +class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput(kInput, "(Tensor) Tensor which is to be checked."); - AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); + AddInput("X", "(LoDTensor) Tensor which is to be checked."); + AddOutput("Out", + "(LoDTensor) a boolean Tensor that indicates whether the tensor is empty."); AddComment(R"DOC( IsEmpty Operator which checks whether a tensor is empty. @@ -62,5 +58,12 @@ It will just return product(tensor.ddims()) > 0; } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp, - paddle::operators::IsEmptyOpProtoMaker); +namespace ops = paddle::operators; + +REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + is_empty, ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel, + ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3e3af22fa8d842b6a1e67418446f1a40949e046b --- /dev/null +++ b/paddle/fluid/operators/is_empty_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +template +class IsEmptyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get input + auto* input_tensor = context.Input("X"); + // get output + auto* output_tensor = context.Output("Out"); + + output_tensor->mutable_data(platform::CPUPlace())[0] = + framework::product(input_tensor->dims()) == 0; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 63eaaedcd5fc3df17902511dc02b25bf43ccd241..60e936298defe7c6ce8a33bdc7de05b52eb950e7 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,6 +18,26 @@ limitations under the License. */ namespace paddle { namespace operators { +using mkldnn::memory; // Note: paddle also has a "memory" namespace +using mkldnn::pooling_forward; +using mkldnn::pooling_backward; + +// Generate keys for storing/retrieving primitives for this operator +// TODO(jczaja): Make hashing function more optimal +static std::string gethash(memory::dims& input_dims, std::string& pooling_type, + std::vector& ksize, std::vector& strides, + std::vector& paddings, std::string suffix) { + auto dims2str = [](memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + }; + return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) + + dims2str(paddings) + pooling_type + suffix; +} + template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -34,10 +54,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Out" variable // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Out"); - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -63,37 +79,71 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, - pooling_type, mkldnn_engine); - - // save pool_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_pool_pd, pool_pd); - - std::shared_ptr workspace_memory = - CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); - - // save pool_workspace_memory to be referred in backward path - dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - auto dst_memory = - mkldnn::memory({dst_md, mkldnn_engine}, -
static_cast(const_cast(output_data))); + const std::string key = gethash(src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Output("Out")); + const std::string key_pool_p = key + "@pool_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; - auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, - *workspace_memory); + auto pool_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); + if (pool_p == nullptr) { + // TODO(pzelazko-intel): support more formats + + auto src_md = + platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, + pooling_type, mkldnn_engine); + + // save pool_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_pool_pd, pool_pd); + + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + auto pool_src_memory_p = std::make_shared( + memory::primitive_desc{src_md, mkldnn_engine}, + static_cast(const_cast(input_data))); + dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + + auto pool_dst_memory_p = std::make_shared( + memory::primitive_desc{dst_md, mkldnn_engine}, + static_cast(output_data)); + dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + + pool_p = std::make_shared( + *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), + *workspace_memory); + dev_ctx.SetBlob(key_pool_p, pool_p); + } else { + // Primitives already exist + auto pool_src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(pool_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(pool_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_src_memory_p->set_data_handle( + reinterpret_cast(const_cast(input_data))); + pool_dst_memory_p->set_data_handle(output_data); + } // push primitive to stream and wait until it's executed - std::vector pipeline{pool_prim}; + std::vector pipeline{*(pool_p.get())}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } @@ -120,9 +170,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::primitive_desc workspace_md = pooling_type == "max" ? 
pool_pd->workspace_primitive_desc() - : mkldnn::memory::primitive_desc( - {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw}, - engine); + : mkldnn::memory::primitive_desc({{}, + platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw}, + engine); auto p_workspace_memory = new mkldnn::memory(workspace_md); return std::unique_ptr(p_workspace_memory); @@ -140,13 +191,6 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when referring info from device context - const std::string key = ctx.op().Input("Out"); - const std::string key_pool_pd = key + "@pool_pd"; - const std::string key_pool_workspace_memory = - key + "@pool_workspace_memory"; - std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -171,43 +215,76 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector diff_dst_tz = paddle::framework::vectorize2int(out_grad->dims()); - auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32, - mkldnn::memory::format::nchw); - - // Retrieve pool_pd/pool_workspace_memory from device context - auto pool_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_pd)); - PADDLE_ENFORCE(pool_pd != nullptr, - "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( - dev_ctx.GetBlob(key_pool_workspace_memory)); - PADDLE_ENFORCE(workspace_memory != nullptr, - "Fail to find workspace_memory in device context"); - - auto pool_bwd_desc = mkldnn::pooling_backward::desc( - pooling_type == "max" ? 
mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, - mkldnn::padding_kind::zero); - auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( - pool_bwd_desc, mkldnn_engine, *pool_pd); - - auto diff_src_memory = - mkldnn::memory({diff_src_md, mkldnn_engine}, - static_cast(const_cast(in_x_grad_data))); - auto diff_dst_memory = - mkldnn::memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data))); + // Get a unique name from "argument" name of "Out" variable + // This name will be used as a key when referring to info from the device context + const std::string key = gethash(diff_src_tz, pooling_type, ksize, strides, + paddings, ctx.op().Input("Out")); + const std::string key_pool_bwd_p = key + "@pool_bwd_p"; + const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; + const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; - auto bwd_prim = mkldnn::pooling_backward( - pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); + auto pool_bwd_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_bwd_p)); + if (pool_bwd_p == nullptr) { + auto diff_src_md = + platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + auto diff_dst_md = + platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), + mkldnn::memory::format::nchw); + // Retrieve pool_pd/pool_workspace_memory from device context + auto pool_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_pd)); + PADDLE_ENFORCE(pool_pd != nullptr, + "Fail to find pool_pd in device context"); + + auto workspace_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_workspace_memory)); + PADDLE_ENFORCE(workspace_memory != nullptr, + "Fail to find workspace_memory in device context"); + + auto pool_diff_src_memory_p = std::make_shared(memory( + {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); + dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); + + auto pool_diff_dst_memory_p = std::make_shared( + memory({diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data)))); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + + auto pool_bwd_desc = mkldnn::pooling_backward::desc( + pooling_type == "max" ?
mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); + auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( + pool_bwd_desc, mkldnn_engine, *pool_pd); + + pool_bwd_p = std::make_shared( + pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, + *(pool_diff_src_memory_p)); + dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { + // Primitives already exist + auto pool_diff_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_diff_src_mem_p)); + PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + "Fail to find pooling src mem_p in device context"); + auto pool_diff_dst_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); + PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + "Fail to find pooling dst mem_p in device context"); + pool_diff_src_memory_p->set_data_handle( + reinterpret_cast(in_x_grad_data)); + pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + } // push primitive to stream and wait until it's executed - std::vector pipeline{bwd_prim}; + std::vector pipeline{*(pool_bwd_p.get())}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } // Compute() }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 79e3c26fef51b4d27520a8079de1074d72f89617..b29035bafd34fa81dc6b59691142fe74439202b8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) -cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) +cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 23f1d615daab91f0e4b353bc7d9a3ca7f5cec5ae..56ed5912a15437b72b769610912c7493d77e5964 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -71,5 +71,15 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } +template +mkldnn::memory::data_type MKLDNNGetDataType() { + return mkldnn::memory::data_undef; +} + +template <> +inline mkldnn::memory::data_type MKLDNNGetDataType() { + return mkldnn::memory::f32; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 50bc0aba6aa0f056dc0b2d49f6b3b745433e0756..2fb5c6dc6b8ad25fa1ad5fcf7c2acfedd5be4a83 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { } RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) - : start_ns_(PosixInNsec()) { + : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; PushEvent(name_, dev_ctx_); @@ -183,7 +184,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* 
dev_ctx) } RecordEvent::~RecordEvent() { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), @@ -193,14 +194,16 @@ RecordEvent::~RecordEvent() { PopEvent(name_, dev_ctx_); } -RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) { +RecordBlock::RecordBlock(int block_id) + : is_enabled_(false), start_ns_(PosixInNsec()) { if (g_state == ProfilerState::kDisabled) return; + is_enabled_ = true; SetCurBlock(block_id); name_ = string::Sprintf("block_%d", block_id); } RecordBlock::~RecordBlock() { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 61b98143e41abb9e47d2c717c7876f1bab7f5077..643bb6183d144ec11a4890d9ea1ca970acb08b4c 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -74,6 +74,7 @@ struct RecordEvent { ~RecordEvent(); + bool is_enabled_; uint64_t start_ns_; // The device context is used by Event to get the current cuda stream. const DeviceContext* dev_ctx_; @@ -89,6 +90,7 @@ struct RecordBlock { ~RecordBlock(); private: + bool is_enabled_; std::string name_; uint64_t start_ns_; }; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 6471eb3ab7bf05365c0bb2bf68bb74ef9044c527..bcf6d4dd3087060c016e53722cde80704ef2e834 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -238,6 +238,7 @@ void BindVarDsec(pybind11::module *m) { pybind11::enum_(var_desc, "VarType", "") .value("BOOL", pd::proto::VarType::BOOL) + .value("UINT8", pd::proto::VarType::UINT8) .value("INT16", pd::proto::VarType::INT16) .value("INT32", pd::proto::VarType::INT32) .value("INT64", pd::proto::VarType::INT64) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 7e00bd38487902227c3b4521db20cdbe314059be..92b8b90880bc78dbc281a959a7472c2822f76fc3 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -198,7 +198,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \ + apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e5af42ed5703f2b277cb6e8b5c81b49656107ecf..fbe219a1c9cf85f19ae2ab991ae7e4207858f204 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -405,17 +405,19 @@ EOF function gen_dockerfile() { # Set BASE_IMAGE according to env variables + CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)" + CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' 
-f 1) if [[ ${WITH_GPU} == "ON" ]]; then - BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04" + BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04" else - BASE_IMAGE="ubuntu:16.04" + BASE_IMAGE="ubuntu:16.04" fi DOCKERFILE_GPU_ENV="" DOCKERFILE_CUDNN_DSO="" if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}" - DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so" + DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so" fi cat < 0 and max_sizes[0] > 0: + attrs['max_sizes'] = max_sizes + + box = helper.create_tmp_variable(dtype) + var = helper.create_tmp_variable(dtype) + helper.append_op( + type="prior_box", + inputs={"Input": input, + "Image": image}, + outputs={"Boxes": box, + "Variances": var}, + attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + def multi_box_head(inputs, image, base_size, @@ -660,47 +753,6 @@ def multi_box_head(inputs, clip=True) """ - def _prior_box_(input, - image, - min_sizes, - max_sizes, - aspect_ratios, - variance, - flip=False, - clip=False, - step_w=0.0, - step_h=0.0, - offset=0.5, - name=None): - helper = LayerHelper("prior_box", **locals()) - dtype = helper.input_dtype() - - attrs = { - 'min_sizes': min_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': step_w, - 'step_h': step_h, - 'offset': offset - } - if len(max_sizes) > 0 and max_sizes[0] > 0: - attrs['max_sizes'] = max_sizes - - box = helper.create_tmp_variable(dtype) - var = helper.create_tmp_variable(dtype) - helper.append_op( - type="prior_box", - inputs={"Input": input, - "Image": image}, - outputs={"Boxes": box, - "Variances": var}, - attrs=attrs, ) - box.stop_gradient = True - var.stop_gradient = True - return box, var - def _reshape_with_axis_(input, axis=1): if not (axis > 0 and axis < len(input.shape)): raise ValueError("The axis should be smaller than " @@ -777,11 +829,10 @@ def multi_box_head(inputs, aspect_ratio = aspect_ratios[i] if not _is_list_or_tuple_(aspect_ratio): aspect_ratio = [aspect_ratio] + step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0] - box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio, - variance, flip, clip, step_w[i] - if step_w else 0.0, step_h[i] - if step_w else 0.0, offset) + box, var = prior_box(input, image, min_size, max_size, aspect_ratio, + variance, flip, clip, step, offset) box_results.append(box) var_results.append(var) diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index c2a15bdb3b17b65fe861dd429f548074c13e2f09..da76747f82d1ab51af07c2e942d1ea893e149b7e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach() add_subdirectory(fit_a_line) add_subdirectory(recognize_digits) +add_subdirectory(image_classification) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt @@ 
-0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py new file mode 100644 index 0000000000000000000000000000000000000000..7fed6d914f75b690e34411aa154359c93b6ca989 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py @@ -0,0 +1,82 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +CIFAR dataset. + +This module will download dataset from +https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into +paddle reader creators. + +The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes, +with 6000 images per class. There are 50000 training images and 10000 test +images. + +The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes +containing 600 images each. There are 500 training images and 100 testing +images per class. + +""" + +import cPickle +import itertools +import numpy +import paddle.v2.dataset.common +import tarfile + +__all__ = ['train10'] + +URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' +CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' +CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a' + + +def reader_creator(filename, sub_name, batch_size=None): + def read_batch(batch): + data = batch['data'] + labels = batch.get('labels', batch.get('fine_labels', None)) + assert labels is not None + for sample, label in itertools.izip(data, labels): + yield (sample / 255.0).astype(numpy.float32), int(label) + + def reader(): + with tarfile.open(filename, mode='r') as f: + names = (each_item.name for each_item in f + if sub_name in each_item.name) + + batch_count = 0 + for name in names: + batch = cPickle.load(f.extractfile(name)) + for item in read_batch(batch): + if isinstance(batch_size, int) and batch_count > batch_size: + break + batch_count += 1 + yield item + + return reader + + +def train10(batch_size=None): + """ + CIFAR-10 training set creator. + + It returns a reader creator, each sample in the reader is image pixels in + [0, 1] and label in [0, 9]. 
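(The `cifar10_small_test_set` module added above is consumed by the rewritten image-classification tests further down. A sketch of typical use follows; note that `batch_size` in `train10` caps the number of *samples* yielded, which is what keeps CI runs short.)

```python
import paddle
import cifar10_small_test_set  # the helper module added in this patch

# Wrap the capped reader the same way the tests below do; train10(batch_size=10)
# stops after roughly ten samples instead of the full 50000-image train set.
reader = paddle.batch(
    paddle.reader.shuffle(
        cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
    batch_size=128)

for batch in reader():
    image, label = batch[0]    # image: float32 vector of 3072 pixels in [0, 1]
    print(image.shape, label)  # (3072,) and an int label in [0, 9]
    break
```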
+ + :return: Training reader creator + :rtype: callable + """ + return reader_creator( + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch', + batch_size=batch_size) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py similarity index 77% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 17db38797cf19ae387f69f66daa42fc78cfcb7d5..1160e500dbd6db784eeb81b72968386347fec59a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid import numpy +import cifar10_small_test_set def resnet_cifar10(input, depth=32): @@ -81,46 +82,50 @@ def train_network(): cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) accuracy = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, accuracy + return [avg_cost, accuracy] -def train(use_cuda, save_path): +def train(use_cuda, train_program, save_dirname): BATCH_SIZE = 128 EPOCH_NUM = 1 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), + cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), batch_size=BATCH_SIZE) test_reader = paddle.batch( paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) def event_handler(event): - if isinstance(event, fluid.EndIteration): - if (event.batch_id % 10) == 0: - avg_cost, accuracy = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) - print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format( - event.batch_id + 1, avg_cost, accuracy)) + print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy)) - if accuracy > 0.01: # Low threshold for speeding up CI - trainer.params.save(save_path) - return + if accuracy > 0.01: # Low threshold for speeding up CI + if save_dirname is not None: + trainer.save_params(save_dirname) + return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_network, + train_func=train_program, optimizer=fluid.optimizer.Adam(learning_rate=0.001), - place=place, - event_handler=event_handler) - trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + place=place) + trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=['pixel', 'label']) -def infer(use_cuda, save_path): - params = fluid.Params(save_path) + +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer(inference_network, params, place=place) + inferencer = fluid.Inferencer( + infer_func=inference_program, param_path=save_dirname, place=place) # The input's dimension of conv should be 4-D or 5-D. 
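(All of the test rewrites in this patch converge on the same `fluid.Trainer` calling convention: a `train_func` that builds the program and returns the loss, an explicit `feed_order`, and `EndStepEvent` in the event handler. Here is a self-contained toy sketch of that convention; the regression program and reader are invented for illustration, and only the Trainer calls mirror the patch.)

```python
import numpy as np
import paddle
import paddle.fluid as fluid


def train_program():
    # Toy linear regression, standing in for train_network above.
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    return fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))


def reader():
    for _ in range(64):
        x = np.random.random((1, )).astype('float32')
        yield x, 2.0 * x + 1.0


def event_handler(event):
    if isinstance(event, fluid.EndStepEvent):  # replaces the old EndIteration
        pass  # test/save/stop decisions go here, as in the tests above


trainer = fluid.Trainer(
    train_func=train_program,
    optimizer=fluid.optimizer.SGD(learning_rate=0.01),
    place=fluid.CPUPlace())
trainer.train(
    reader=paddle.batch(reader, batch_size=8),
    num_epochs=1,
    event_handler=event_handler,
    feed_order=['x', 'y'])
```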
# Use normilized image pixels as input data, which should be in the range @@ -135,8 +140,14 @@ def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_resnet.inference.model" - train(use_cuda, save_path) - infer(use_cuda, save_path) + + train( + use_cuda=use_cuda, train_program=train_network, save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=inference_network, + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py similarity index 72% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py rename to python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index e83afeed2f72635a40aa2ac21dc0c8611c309de4..1e3e955ba0299f2cc0fcc02d79ae6fd8ff4c1171 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid import numpy +import cifar10_small_test_set def vgg16_bn_drop(input): @@ -60,46 +61,48 @@ def train_network(): cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) accuracy = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, accuracy + return [avg_cost, accuracy] -def train(use_cuda, save_path): +def train(use_cuda, train_program, save_dirname): BATCH_SIZE = 128 - EPOCH_NUM = 1 - train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), + cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), batch_size=BATCH_SIZE) test_reader = paddle.batch( paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) def event_handler(event): - if isinstance(event, fluid.EndIteration): - if (event.batch_id % 10) == 0: - avg_cost, accuracy = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + avg_cost, accuracy = trainer.test( + reader=test_reader, feed_order=['pixel', 'label']) - print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format( - event.batch_id + 1, avg_cost, accuracy)) + print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy)) - if accuracy > 0.01: # Low threshold for speeding up CI - trainer.params.save(save_path) - return + if accuracy > 0.01: # Low threshold for speeding up CI + if save_dirname is not None: + trainer.save_params(save_dirname) + return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() trainer = fluid.Trainer( - train_network, - optimizer=fluid.optimizer.Adam(learning_rate=0.001), + train_func=train_program, place=place, - event_handler=event_handler) - trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler) + optimizer=fluid.optimizer.Adam(learning_rate=0.001)) + + trainer.train( + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['pixel', 'label']) -def infer(use_cuda, save_path): - params = fluid.Params(save_path) +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer(inference_network, params, place=place) + inferencer = 
fluid.Inferencer( + infer_func=inference_program, param_path=save_dirname, place=place) # The input's dimension of conv should be 4-D or 5-D. # Use normilized image pixels as input data, which should be in the range @@ -114,8 +117,14 @@ def main(use_cuda): if use_cuda and not fluid.core.is_compiled_with_cuda(): return save_path = "image_classification_vgg.inference.model" - train(use_cuda, save_path) - infer(use_cuda, save_path) + + train( + use_cuda=use_cuda, train_program=train_network, save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=inference_network, + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py similarity index 77% rename from python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py rename to python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index 6d7495e4dd1cd4ae1486fb9a6dc5264bb26a5940..bf86cd9acf8da940fcc2fb5b594e33f9b6965acb 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -90,7 +90,7 @@ def train_program(is_sparse): return avg_cost -def train(use_cuda, train_program, save_path): +def train(use_cuda, train_program, save_dirname): train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) test_reader = paddle.batch( @@ -99,27 +99,36 @@ def train(use_cuda, train_program, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): - outs = trainer.test(reader=test_reader) + if isinstance(event, fluid.EndStepEvent): + outs = trainer.test( + reader=test_reader, + feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) avg_cost = outs[0] print("loss= ", avg_cost) - if avg_cost < 5.0: - trainer.save_params(save_path) - return + if avg_cost < 10.0: + trainer.save_params(save_dirname) + trainer.stop() + if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( - train_program, fluid.optimizer.SGD(learning_rate=0.001), place=place) + train_func=train_program, + optimizer=fluid.optimizer.SGD(learning_rate=0.001), + place=place) + trainer.train( - reader=train_reader, num_epochs=1, event_handler=event_handler) + reader=train_reader, + num_epochs=1, + event_handler=event_handler, + feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) -def infer(use_cuda, inference_program, save_path): +def infer(use_cuda, inference_program, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() inferencer = fluid.Inferencer( - infer_func=inference_program, param_path=save_path, place=place) + infer_func=inference_program, param_path=save_dirname, place=place) lod = [0, 1] first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) @@ -127,12 +136,14 @@ def infer(use_cuda, inference_program, save_path): third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - result = inferencer.infer({ - 'firstw': first_word, - 'secondw': second_word, - 'thirdw': third_word, - 'forthw': fourth_word - }) + result = inferencer.infer( + { + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'forthw': fourth_word 
+ }, + return_numpy=False) print(np.array(result[0])) @@ -140,9 +151,17 @@ def main(use_cuda, is_sparse): if use_cuda and not fluid.core.is_compiled_with_cuda(): return - save_path = "word2vec.params" - train(use_cuda, partial(train_program, is_sparse), save_path) - infer(use_cuda, partial(inference_program, is_sparse), save_path) + save_path = "word2vec.inference.model" + + train( + use_cuda=use_cuda, + train_program=partial(train_program, is_sparse), + save_dirname=save_path) + + infer( + use_cuda=use_cuda, + inference_program=partial(inference_program, is_sparse), + save_dirname=save_path) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 921260ef3f4b1f9e4c65b3ffb440dc34cb0a9376..8569d838bdd414eb84c6c87674990a25a2fdcdf9 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -109,6 +109,24 @@ class TestDetection(unittest.TestCase): print(str(program)) +class TestPriorBox(unittest.TestCase): + def test_prior_box(self): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 + + class TestMultiBoxHead(unittest.TestCase): def test_multi_box_head(self): data_shape = [3, 224, 224] diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py index 4d11cf226be2ba4ffbe015198fed3191f1e02f72..11121d9b65351eab639b7618fac0e54714cf4680 100644 --- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py +++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py @@ -14,42 +14,24 @@ import unittest import numpy as np -from paddle.fluid.op import Operator -import paddle.fluid.core as core +from op_test import OpTest -def create_tensor(scope, name, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(np_data.shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class TestIsEmptyOp(unittest.TestCase): +class TestEmpty(OpTest): def setUp(self): - self.scope = core.Scope() - # create input variables - np_data0 = np.array([0, 1, 2]) - create_tensor(self.scope, "X0", np_data0) - - np_data1 = np.array([1]) - t = create_tensor(self.scope, "X1", np_data1) - t.set_dims([0]) + self.op_type = "is_empty" + self.inputs = {'X': np.array([1, 2, 3])} + self.outputs = {'Out': np.array([False])} - # create output variables - self.scope.var("out") + def test_check_output(self): + self.check_output() - def test_no_empty(self): - self.one_case("X0", False) - def test_empty(self): - self.one_case("X1", True) - - def one_case(self, input, target): - op = Operator(type="is_empty", X=input, Out="out") - op.run(self.scope, core.CPUPlace()) - out = self.scope.var("out").get_tensor() - self.assertEqual(np.array(out)[0], target) +class TestNotEmpty(TestEmpty): + def setUp(self): + self.op_type = "is_empty" + self.inputs = {'X': np.array([])} + self.outputs = {'Out': np.array([True])} if __name__ == "__main__": diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 49034b47b2d184e4027bcebc29413a163340fdaa..80a8f7c09cfe521f8f94a27e85fc8d86c02b3e97 100644 --- 
a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -24,7 +24,8 @@ dtype_to_size = { core.VarDesc.VarType.INT16: 2, core.VarDesc.VarType.INT32: 4, core.VarDesc.VarType.INT64: 8, - core.VarDesc.VarType.BOOL: 1 + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, } SUB_BLOCK_OPS = [
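(Closing note on the last hunk: the new `UINT8: 1` entry lets the memory-optimization transpiler cost `uint8` variables at one byte per element. A plain-Python sketch of that bookkeeping; `tensor_bytes` is an illustrative helper, not a transpiler API, and the float entries are assumed from the rest of the table.)

```python
# Byte width per element, mirroring the transpiler's dtype_to_size table
# (uint8 is the entry added in this patch; float sizes assumed).
dtype_to_size = {
    'float16': 2, 'float32': 4, 'float64': 8,
    'int16': 2, 'int32': 4, 'int64': 8,
    'bool': 1, 'uint8': 1,
}


def tensor_bytes(shape, dtype):
    """Bytes needed by one dense tensor of the given shape and dtype."""
    n = 1
    for d in shape:
        n *= d
    return n * dtype_to_size[dtype]


print(tensor_bytes([32, 3, 224, 224], 'uint8'))    # 4816896
print(tensor_bytes([32, 3, 224, 224], 'float32'))  # 19267584, i.e. 4x
```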