Merge branch 'develop' of github.com:PaddlePaddle/Paddle into checkpoint

06aa23b0 · tangwei12 · be050565 · 35e55636 · 06aa23b0 · 06aa23b0
32 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -70,7 +70,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip==9.0.3 && \
+RUN easy_install -U pip && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
+# CPU性能调优
 此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
 Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
@@ -8,7 +10,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大
 * Python 与 C++ 混合代码的性能分析
-# Python代码的性能分析
+## Python代码的性能分析
 ### 生成性能分析文件

--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
+# Tune CPU performance
 This tutorial introduces techniques we use to profile and tune the
 CPU performance of PaddlePaddle.  We will use Python packages
 `cProfile` and `yep`, and Google's `perftools`.
@@ -14,7 +16,7 @@ the profiling and tuning of
 1. the Python code and
 1. the mixture of Python and C++ code.
-# Profiling the Python Code
+## Profiling the Python Code
 ### Generate the Performance Profiling File

--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -37,12 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具
    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
    :widths: 1, 3, 3
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
 .. _pip_dependency:

--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -40,12 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
    :header: "version", "cp27-cp27mu", "cp27-cp27m"
    :widths: 1, 3, 3
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
 .. _pip_dependency:

--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -91,6 +91,12 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
    case proto::VarType::BOOL:
      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
      break;
+    case proto::VarType::INT16:
+      // Cast from the actual source element type (as the BOOL case does).
+      framework::VisitDataType(dst_type, CastDataType<int16_t>(in, out, ctx));
+      break;
+    case proto::VarType::UINT8:
+      // Cast from the actual source element type (as the BOOL case does).
+      framework::VisitDataType(dst_type, CastDataType<uint8_t>(in, out, ctx));
+      break;
    default:
      PADDLE_THROW("Not support type %d", src_type);
  }

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -98,7 +98,7 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(const OpDesc &op,
    return false;
  };
-  if (op.Type() == "split") {
+  if (op.Type() == "split" || op.Type() == "split_byref") {
    return checker(op.OutputArgumentNames(), send_op->InputArgumentNames());
  } else if (op.Type() == "concat") {
    return checker(op.InputArgumentNames(), send_op->OutputArgumentNames());

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -149,7 +149,7 @@ void TestInference(const std::string& dirname,
    state = paddle::platform::ProfilerState::kCPU;
  } else {
 #ifdef PADDLE_WITH_CUDA
-    state = paddle::platform::ProfilerState::kCUDA;
+    state = paddle::platform::ProfilerState::kAll;
    // The default device_id of paddle::platform::CUDAPlace is 0.
    // Users can get the device_id using:
    //   int device_id = place.GetDeviceId();
@@ -172,7 +172,7 @@ void TestInference(const std::string& dirname,
  }
  // Disable the profiler and print the timing information
  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
-                                    "load_program_profiler.txt");
+                                    "load_program_profiler");
  paddle::platform::ResetProfiler();
  // 3. Get the feed_target_names and fetch_target_names
@@ -236,8 +236,7 @@ void TestInference(const std::string& dirname,
    // Disable the profiler and print the timing information
    paddle::platform::DisableProfiler(
-        paddle::platform::EventSortingKey::kDefault,
+        paddle::platform::EventSortingKey::kDefault, "run_inference_profiler");
-        "run_inference_profiler.txt");
    paddle::platform::ResetProfiler();
  }

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -204,6 +204,7 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
    if(WITH_GPU)
+        set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
        op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -14,10 +14,6 @@ limitations under the License. */
 #pragma once
-#ifdef PADDLE_WITH_TESTING
-#include "gtest/gtest.h"
-#endif
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"

--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -12,45 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/operators/is_empty_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 namespace paddle {
 namespace operators {
-constexpr char kInput[] = "X";
+class IsEmptyOp : public framework::OperatorWithKernel {
-constexpr char kOutput[] = "Out";
-class IsEmptyOp : public framework::OperatorBase {
 public:
-  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
+  using framework::OperatorWithKernel::OperatorWithKernel;
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
- private:
+ protected:
-  void RunImpl(const framework::Scope &scope,
+  void InferShape(framework::InferShapeContext *ctx) const override {
-               const platform::Place &place) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
-    // get input
+                   "Input(X) of IsEmptyOp should not be null.");
-    auto *var = scope.FindVar(Input(kInput));
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-    PADDLE_ENFORCE_NOT_NULL(var);
+                   "Output(Out) of IsEmptyOp should not be null.");
-    auto &tensor = var->Get<framework::LoDTensor>();
+    ctx->SetOutputDim("Out", {1});
-    // get output
+  }
-    auto *out = scope.FindVar(Output(kOutput));
-    PADDLE_ENFORCE_NOT_NULL(out);
-    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
-    out_tensor->Resize({1});
+  framework::OpKernelType GetExpectedKernelType(
-    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+      const framework::ExecutionContext &ctx) const override {
-        framework::product(tensor.dims()) == 0;
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        platform::CPUPlace());
+    return kt;
  }
 };
-class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
+    AddInput("X", "(LoDTensor) Tensor which is to be checked.");
-    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
+    AddOutput("Out",
+              "(LoDTensor) a boolean Tensor that indicate empty or not.");
    AddComment(R"DOC(
 IsEmpty Operator which checks whether a tensor is empty.
@@ -62,5 +58,12 @@ It will just return product(tensor.ddims()) > 0;
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
+namespace ops = paddle::operators;
-                             paddle::operators::IsEmptyOpProtoMaker);
+REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    is_empty, ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/is_empty_op.h
+++ b/paddle/fluid/operators/is_empty_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class IsEmptyOpKernel : public framework::OpKernel<T> {
+ public:
+  // Stores one bool in "Out": true when input "X" has zero elements.
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    // The result buffer is requested on the CPU place.
+    bool* out_data = out->mutable_data<bool>(platform::CPUPlace());
+    const bool has_no_elements = framework::product(x->dims()) == 0;
+    out_data[0] = has_no_elements;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -18,6 +18,26 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+// Generate keys for storing/retrieving primitives for this operator.
+// The key is the concatenation of the input dims, kernel size, strides and
+// paddings (every element followed by '-'), the pooling type and the suffix.
+// TODO(jczaja): Make hashing function more optimal
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  // All arguments are read-only, so they are taken by const reference; this
+  // also lets callers pass const objects and temporaries without a copy.
+  auto dims2str = [](const memory::dims& operand_dims) {
+    std::string dstr;
+    for (const auto dim : operand_dims) {
+      dstr += std::to_string(dim) + "-";
+    }
+    return dstr;
+  };
+  return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) +
+         dims2str(paddings) + pooling_type + suffix;
+}
 template <typename T>
 class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
@@ -34,10 +54,6 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    // Get an unique name from "argument" name of "Out" variable
    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -63,13 +79,28 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    const std::string key = gethash(src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Output("Out"));
+    const std::string key_pool_p = key + "@pool_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
+    const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
+    auto pool_p =
+        std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
+    if (pool_p == nullptr) {
      // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+      auto src_md =
+          platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType<T>(),
                                  mkldnn::memory::format::nchw);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
+      auto dst_md =
+          platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType<T>(),
                                  mkldnn::memory::format::nchw);
-    std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
+      std::shared_ptr<pooling_forward::primitive_desc> pool_pd =
          CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
                              pooling_type, mkldnn_engine);
@@ -82,18 +113,37 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      // save pool_workspace_memory to be referred in backward path
      dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
-    auto src_memory =
+      auto pool_src_memory_p = std::make_shared<memory>(
-        mkldnn::memory({src_md, mkldnn_engine},
+          memory::primitive_desc{src_md, mkldnn_engine},
          static_cast<void*>(const_cast<T*>(input_data)));
-    auto dst_memory =
+      dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p);
-        mkldnn::memory({dst_md, mkldnn_engine},
-                       static_cast<void*>(const_cast<T*>(output_data)));
-    auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
+      auto pool_dst_memory_p = std::make_shared<memory>(
+          memory::primitive_desc{dst_md, mkldnn_engine},
+          static_cast<void*>(output_data));
+      dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p);
+      pool_p = std::make_shared<pooling_forward>(
+          *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()),
          *workspace_memory);
+      dev_ctx.SetBlob(key_pool_p, pool_p);
+    } else {
+      // Primitives already exist
+      auto pool_src_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
+      PADDLE_ENFORCE(pool_src_memory_p != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      auto pool_dst_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
+      PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+      pool_src_memory_p->set_data_handle(
+          reinterpret_cast<void*>(const_cast<T*>(input_data)));
+      pool_dst_memory_p->set_data_handle(output_data);
+    }
    // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{pool_prim};
+    std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
  }
@@ -120,8 +170,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    mkldnn::memory::primitive_desc workspace_md =
        pooling_type == "max"
            ? pool_pd->workspace_primitive_desc()
-            : mkldnn::memory::primitive_desc(
+            : mkldnn::memory::primitive_desc({{},
-                  {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
+                                              platform::MKLDNNGetDataType<T>(),
+                                              mkldnn::memory::format::nchw},
                                             engine);
    auto p_workspace_memory = new mkldnn::memory(workspace_md);
@@ -140,13 +191,6 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when referring info from device context
-    const std::string key = ctx.op().Input("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
@@ -171,11 +215,26 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> diff_dst_tz =
        paddle::framework::vectorize2int(out_grad->dims());
-    auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
+    // Build a unique key from the tensor dims, the pooling attributes and the
+    // "argument" name of "Out"; it is used to look up blobs in device context
+    const std::string key = gethash(diff_src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Input("Out"));
+    const std::string key_pool_bwd_p = key + "@pool_bwd_p";
+    const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
+    const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
+    auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
+        dev_ctx.GetBlob(key_pool_bwd_p));
+    if (pool_bwd_p == nullptr) {
+      auto diff_src_md =
+          platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType<T>(),
                                  mkldnn::memory::format::nchw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
+      auto diff_dst_md =
+          platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType<T>(),
                                  mkldnn::memory::format::nchw);
      // Retrieve pool_pd/pool_workspace_memory from device context
      auto pool_pd =
          std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
@@ -188,6 +247,15 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
      PADDLE_ENFORCE(workspace_memory != nullptr,
                     "Fail to find workspace_memory in device context");
+      auto pool_diff_src_memory_p = std::make_shared<memory>(memory(
+          {diff_src_md, mkldnn_engine}, static_cast<void*>(in_x_grad_data)));
+      dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p);
+      auto pool_diff_dst_memory_p = std::make_shared<memory>(
+          memory({diff_dst_md, mkldnn_engine},
+                 static_cast<void*>(const_cast<T*>(out_grad_data))));
+      dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p);
      auto pool_bwd_desc = mkldnn::pooling_backward::desc(
          pooling_type == "max" ? mkldnn::algorithm::pooling_max
                                : mkldnn::algorithm::pooling_avg,
@@ -196,18 +264,27 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
      auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
          pool_bwd_desc, mkldnn_engine, *pool_pd);
-    auto diff_src_memory =
+      pool_bwd_p = std::make_shared<pooling_backward>(
-        mkldnn::memory({diff_src_md, mkldnn_engine},
+          pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory,
-                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
+          *(pool_diff_src_memory_p));
-    auto diff_dst_memory =
+      dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
-        mkldnn::memory({diff_dst_md, mkldnn_engine},
+    } else {
-                       static_cast<void*>(const_cast<T*>(out_grad_data)));
+      // Primitives already exist
+      auto pool_diff_src_memory_p = std::static_pointer_cast<memory>(
-    auto bwd_prim = mkldnn::pooling_backward(
+          dev_ctx.GetBlob(key_pool_diff_src_mem_p));
-        pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
+      PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      auto pool_diff_dst_memory_p = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
+      PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+      pool_diff_src_memory_p->set_data_handle(
+          reinterpret_cast<void*>(in_x_grad_data));
+      pool_diff_dst_memory_p->set_data_handle(const_cast<T*>(out_grad_data));
+    }
    // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{bwd_prim};
+    std::vector<mkldnn::primitive> pipeline{*(pool_bwd_p.get())};
    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
  }  // Compute()
 };

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
-cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -71,5 +71,15 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
 }
+// Maps a C++ element type to the matching mkldnn::memory::data_type.
+// The primary template is the fallback used for every type that has no
+// explicit specialization; it returns data_undef.
+template <typename Type>
+mkldnn::memory::data_type MKLDNNGetDataType() {
+  return mkldnn::memory::data_undef;
+}
+// Specialization for float: returns mkldnn::memory::f32.
+template <>
+inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
+  return mkldnn::memory::f32;
+}
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
 }
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
-    : start_ns_(PosixInNsec()) {
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
  dev_ctx_ = dev_ctx;
  name_ = name;
  PushEvent(name_, dev_ctx_);
@@ -183,7 +184,7 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
 }
 RecordEvent::~RecordEvent() {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
@@ -193,14 +194,16 @@ RecordEvent::~RecordEvent() {
  PopEvent(name_, dev_ctx_);
 }
-RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) {
+RecordBlock::RecordBlock(int block_id)
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
  SetCurBlock(block_id);
  name_ = string::Sprintf("block_%d", block_id);
 }
 RecordBlock::~RecordBlock() {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
  DeviceTracer* tracer = GetDeviceTracer();
  if (tracer) {
    // We try to put all blocks at the same nested depth in the

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -74,6 +74,7 @@ struct RecordEvent {
  ~RecordEvent();
+  bool is_enabled_;
  uint64_t start_ns_;
  // The device context is used by Event to get the current cuda stream.
  const DeviceContext* dev_ctx_;
@@ -89,6 +90,7 @@ struct RecordBlock {
  ~RecordBlock();
 private:
+  bool is_enabled_;
  std::string name_;
  uint64_t start_ns_;
 };

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -238,6 +238,7 @@ void BindVarDsec(pybind11::module *m) {
  pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
      .value("BOOL", pd::proto::VarType::BOOL)
+      .value("UINT8", pd::proto::VarType::UINT8)
      .value("INT16", pd::proto::VarType::INT16)
      .value("INT32", pd::proto::VarType::INT32)
      .value("INT64", pd::proto::VarType::INT64)

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -198,7 +198,7 @@ EOF
    # run paddle version to install python packages first
    RUN apt-get update &&\
        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
+        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -405,8 +405,10 @@ EOF
 function gen_dockerfile() {
    # Set BASE_IMAGE according to env variables
+    CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)"
+    CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' -f 1)
    if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04"
    else
        BASE_IMAGE="ubuntu:16.04"
    fi
@@ -415,7 +417,7 @@ function gen_dockerfile() {
    DOCKERFILE_CUDNN_DSO=""
    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
    fi
    cat <<EOF
@@ -449,7 +451,7 @@ EOF
    # run paddle version to install python packages first
    RUN apt-get update &&\
        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
+        apt-get install -y wget python-pip dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \
@@ -490,7 +492,7 @@ function gen_fluid_inference_lib() {
    Deploying fluid inference library ...
    ========================================
 EOF
-        make inference_lib_dist
+        make -j `nproc` inference_lib_dist
    fi
 }

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -72,6 +72,8 @@ def convert_np_dtype_to_dtype_(np_dtype):
        return core.VarDesc.VarType.INT64
    elif dtype == np.bool:
        return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint8:
+        return core.VarDesc.VarType.UINT8
    else:
        raise ValueError("Not supported numpy dtype " + str(dtype))

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -49,6 +49,7 @@ __all__ = [
    'reorder_lod_tensor_by_rank',
    'ParallelDo',
    'Print',
+    'is_empty',
 ]
@@ -1562,3 +1563,40 @@ def reorder_lod_tensor_by_rank(x, rank_table):
                'RankTable': [rank_table]},
        outputs={'Out': [out]})
    return out
+def is_empty(x, cond=None, **ignored):
+    """
+    **Is Empty**
+    This layer appends an *is_empty* op and returns the variable holding
+    the truth value of whether the variable `x` is empty.
+    Args:
+        x(Variable): Operand of *is_empty*
+        cond(Variable|None): Optional output variable to store the result
+                             of *is_empty*. When None, a temporary bool
+                             variable with `stop_gradient = True` is
+                             created and used as the output.
+        **ignored: Extra keyword arguments. They are not read by this
+                   function; they only reach `LayerHelper` as part of
+                   `locals()`.
+    Returns:
+        Variable: The tensor variable storing the output of *is_empty*.
+    Raises:
+        TypeError: If input cond is not a variable, or cond's dtype is
+                   not bool
+    Examples:
+        .. code-block:: python
+          res = fluid.layers.is_empty(x=input)
+    """
+    helper = LayerHelper("is_empty", **locals())
+    if cond is None:
+        # No output variable supplied: create a temporary bool result that
+        # is excluded from gradient computation.
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+    elif not isinstance(cond, Variable):
+        raise TypeError("cond takes a variable")
+    # NOTE(review): this compares cond.dtype with the string 'bool'. Confirm
+    # that Variable.dtype can compare equal to a string; if it is an enum
+    # value, this branch raises for every user-supplied cond.
+    elif cond.dtype != 'bool':
+        raise TypeError("The data type of cond must be bool")
+    helper.append_op(
+        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
+    return cond
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -23,6 +23,7 @@ import nn
 import math
 __all__ = [
+    'prior_box',
    'multi_box_head',
    'bipartite_match',
    'target_assign',
@@ -564,6 +565,98 @@ def ssd_loss(location,
    return loss
+def prior_box(input,
+              image,
+              min_sizes,
+              max_sizes=None,
+              aspect_ratios=None,
+              variance=[0.1, 0.1, 0.2, 0.2],
+              flip=False,
+              clip=False,
+              steps=[0.0, 0.0],
+              offset=0.5,
+              name=None):
+    """
+    **Prior box operator**
+    Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+    Each position of the input produces N prior boxes, N is determined by
+    the count of min_sizes, max_sizes and aspect_ratios. The size of the
+    box is in range(min_size, max_size) interval, which is generated in
+    sequence according to the aspect_ratios.
+    Args:
+       input(Variable): The Input Variables, the format is NCHW.
+       image(Variable): The input image data of PriorBoxOp,
+            the layout is NCHW.
+       min_sizes(list|tuple): min sizes of generated prior boxes.
+       max_sizes(list|tuple|None): max sizes of generated prior boxes.
+            It is only passed to the op when it is not None, not empty
+            and its first element is greater than 0. Default: None.
+       aspect_ratios(list|tuple|None): the aspect ratios of generated
+            prior boxes. Default: None.
+       variance(list|tuple): the variances to be encoded in prior boxes.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       flip(bool): Whether to flip aspect ratios. Default:False.
+       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+       steps(list|tuple): Prior boxes step across width and height, given
+            as [step_w, step_h]. If steps[0] == 0.0/steps[1] == 0.0, the
+            prior boxes step across width/height of the input will be
+            automatically calculated. Default: [0.0, 0.0].
+       offset(float): Prior boxes center offset. Default: 0.5
+       name(str): Name of the prior box op. Default: None.
+    Returns:
+        boxes(Variable): the output prior boxes of PriorBox.
+             The layout is [H, W, num_priors, 4].
+             H is the height of input, W is the width of input,
+             num_priors is the total
+             box count of each position of input.
+        Variances(Variable): the expanded variances of PriorBox.
+             The layout is [H, W, num_priors, 4].
+             H is the height of input, W is the width of input,
+             num_priors is the total
+             box count of each position of input.
+        Both variables are returned as the tuple (boxes, Variances) and
+        have stop_gradient set to True.
+    Examples:
+        .. code-block:: python
+            box, var = prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.],
+                flip=True,
+                clip=True)
+    """
+    # NOTE: the list defaults of `variance` and `steps` are shared between
+    # calls; this function only reads them and never mutates them.
+    helper = LayerHelper("prior_box", **locals())
+    dtype = helper.input_dtype()
+    attrs = {
+        'min_sizes': min_sizes,
+        'aspect_ratios': aspect_ratios,
+        'variances': variance,
+        'flip': flip,
+        'clip': clip,
+        'step_w': steps[0],
+        'step_h': steps[1],
+        'offset': offset
+    }
+    # max_sizes is optional: the attribute is left out unless a non-empty
+    # list whose first element is positive was given.
+    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+        attrs['max_sizes'] = max_sizes
+    box = helper.create_tmp_variable(dtype)
+    var = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="prior_box",
+        inputs={"Input": input,
+                "Image": image},
+        outputs={"Boxes": box,
+                 "Variances": var},
+        attrs=attrs, )
+    # Neither output takes part in back-propagation.
+    box.stop_gradient = True
+    var.stop_gradient = True
+    return box, var
 def multi_box_head(inputs,
                   image,
                   base_size,
@@ -660,47 +753,6 @@ def multi_box_head(inputs,
            clip=True)
    """
-    def _prior_box_(input,
-                    image,
-                    min_sizes,
-                    max_sizes,
-                    aspect_ratios,
-                    variance,
-                    flip=False,
-                    clip=False,
-                    step_w=0.0,
-                    step_h=0.0,
-                    offset=0.5,
-                    name=None):
-        helper = LayerHelper("prior_box", **locals())
-        dtype = helper.input_dtype()
-        attrs = {
-            'min_sizes': min_sizes,
-            'aspect_ratios': aspect_ratios,
-            'variances': variance,
-            'flip': flip,
-            'clip': clip,
-            'step_w': step_w,
-            'step_h': step_h,
-            'offset': offset
-        }
-        if len(max_sizes) > 0 and max_sizes[0] > 0:
-            attrs['max_sizes'] = max_sizes
-        box = helper.create_tmp_variable(dtype)
-        var = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="prior_box",
-            inputs={"Input": input,
-                    "Image": image},
-            outputs={"Boxes": box,
-                     "Variances": var},
-            attrs=attrs, )
-        box.stop_gradient = True
-        var.stop_gradient = True
-        return box, var
    def _reshape_with_axis_(input, axis=1):
        if not (axis > 0 and axis < len(input.shape)):
            raise ValueError("The axis should be smaller than "
@@ -777,11 +829,10 @@ def multi_box_head(inputs,
            aspect_ratio = aspect_ratios[i]
            if not _is_list_or_tuple_(aspect_ratio):
                aspect_ratio = [aspect_ratio]
+        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
-        box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio,
+        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
-                               variance, flip, clip, step_w[i]
+                             variance, flip, clip, step, offset)
-                               if step_w else 0.0, step_h[i]
-                               if step_w else 0.0, offset)
        box_results.append(box)
        var_results.append(var)

--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -8,3 +8,4 @@ endforeach()
 add_subdirectory(fit_a_line)
 add_subdirectory(recognize_digits)
+add_subdirectory(image_classification)
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+"""
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+__all__ = ['train10']
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+def reader_creator(filename, sub_name, batch_size=None):
+    """
+    Create a reader over the pickled CIFAR batches stored in a tar archive.
+    Args:
+        filename(str): path of the tar archive to read.
+        sub_name(str): only archive members whose name contains this
+            substring are loaded.
+        batch_size(int|None): limit on the number of samples. The reader
+            stops once more than `batch_size` samples have been yielded,
+            so at most `batch_size + 1` samples are produced. Any non-int
+            value, including None, disables the limit.
+    Returns:
+        callable: a reader; calling it returns a generator of
+        (pixels, label) tuples, where pixels is the sample divided by
+        255.0 and cast to float32, and label is an int.
+    """
+    def read_batch(batch):
+        # Yield one (pixels, label) pair per sample of an unpickled batch.
+        data = batch['data']
+        # The label list is stored under 'labels' or under 'fine_labels'.
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+            batch_count = 0
+            for name in names:
+                # NOTE: unpickling executes arbitrary code; `filename` must
+                # come from a trusted, integrity-checked source.
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    # NOTE(review): `>` lets batch_size + 1 samples through;
+                    # confirm whether `>=` was intended.
+                    if isinstance(batch_size, int) and batch_count > batch_size:
+                        # Limit reached: end the generator here instead of
+                        # only leaving the inner loop, so the remaining
+                        # archive members are not extracted and unpickled
+                        # for nothing. The yielded samples are unchanged.
+                        return
+                    batch_count += 1
+                    yield item
+    return reader
+def train10(batch_size=None):
+    """
+    CIFAR-10 training set creator.
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+    :param batch_size: passed unchanged to reader_creator, which uses it as
+        a limit on the number of samples; None (the default) means no limit
+    :type batch_size: int|None
+    :return: Training reader creator
+    :rtype: callable
+    """
+    # The archive is fetched through paddle.v2.dataset.common.download with
+    # CIFAR10_MD5; only members whose name contains 'data_batch' are read.
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch',
+        batch_size=batch_size)
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_resnet.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy
+import cifar10_small_test_set
 def resnet_cifar10(input, depth=32):
@@ -81,46 +82,50 @@ def train_network():
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return avg_cost, accuracy
+    return [avg_cost, accuracy]
-def train(use_cuda, save_path):
+def train(use_cuda, train_program, save_dirname):
    BATCH_SIZE = 128
    EPOCH_NUM = 1
    train_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
    def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
+        if isinstance(event, fluid.EndStepEvent):
-            if (event.batch_id % 10) == 0:
+            avg_cost, accuracy = trainer.test(
-                avg_cost, accuracy = trainer.test(reader=test_reader)
+                reader=test_reader, feed_order=['pixel', 'label'])
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
-                    event.batch_id + 1, avg_cost, accuracy))
            if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
+                if save_dirname is not None:
+                    trainer.save_params(save_dirname)
                return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    trainer = fluid.Trainer(
-        train_network,
+        train_func=train_program,
        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
-        place=place,
+        place=place)
-        event_handler=event_handler)
-    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
-def infer(use_cuda, save_path):
-    params = fluid.Params(save_path)
+def infer(use_cuda, inference_program, save_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(inference_network, params, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_dirname, place=place)
    # The input's dimension of conv should be 4-D or 5-D.
    # Use normilized image pixels as input data, which should be in the range
@@ -135,8 +140,14 @@ def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    save_path = "image_classification_resnet.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
+    train(
+        use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        save_dirname=save_path)
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/notest_image_classification_vgg.py
@@ -17,6 +17,7 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy
+import cifar10_small_test_set
 def vgg16_bn_drop(input):
@@ -60,46 +61,48 @@ def train_network():
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    return avg_cost, accuracy
+    return [avg_cost, accuracy]
-def train(use_cuda, save_path):
+def train(use_cuda, train_program, save_dirname):
    BATCH_SIZE = 128
-    EPOCH_NUM = 1
    train_reader = paddle.batch(
        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE)
    def event_handler(event):
-        if isinstance(event, fluid.EndIteration):
+        if isinstance(event, fluid.EndStepEvent):
-            if (event.batch_id % 10) == 0:
+            avg_cost, accuracy = trainer.test(
-                avg_cost, accuracy = trainer.test(reader=test_reader)
+                reader=test_reader, feed_order=['pixel', 'label'])
-                print('BatchID {1:04}, Loss {2:2.2}, Acc {3:2.2}'.format(
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
-                    event.batch_id + 1, avg_cost, accuracy))
            if accuracy > 0.01:  # Low threshold for speeding up CI
-                    trainer.params.save(save_path)
+                if save_dirname is not None:
+                    trainer.save_params(save_dirname)
                return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    trainer = fluid.Trainer(
-        train_network,
+        train_func=train_program,
-        optimizer=fluid.optimizer.Adam(learning_rate=0.001),
        place=place,
-        event_handler=event_handler)
+        optimizer=fluid.optimizer.Adam(learning_rate=0.001))
-    trainer.train(train_reader, EPOCH_NUM, event_handler=event_handler)
+    trainer.train(
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
-def infer(use_cuda, save_path):
+def infer(use_cuda, inference_program, save_dirname=None):
-    params = fluid.Params(save_path)
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(inference_network, params, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_dirname, place=place)
    # The input's dimension of conv should be 4-D or 5-D.
    # Use normilized image pixels as input data, which should be in the range
@@ -114,8 +117,14 @@ def main(use_cuda):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    save_path = "image_classification_vgg.inference.model"
-    train(use_cuda, save_path)
-    infer(use_cuda, save_path)
+    train(
+        use_cuda=use_cuda, train_program=train_network, save_dirname=save_path)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        save_dirname=save_path)
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py
@@ -90,7 +90,7 @@ def train_program(is_sparse):
    return avg_cost
-def train(use_cuda, train_program, save_path):
+def train(use_cuda, train_program, save_dirname):
    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
    test_reader = paddle.batch(
@@ -99,27 +99,36 @@ def train(use_cuda, train_program, save_path):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    def event_handler(event):
-        if isinstance(event, fluid.EndEpochEvent):
+        if isinstance(event, fluid.EndStepEvent):
-            outs = trainer.test(reader=test_reader)
+            outs = trainer.test(
+                reader=test_reader,
+                feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
            avg_cost = outs[0]
            print("loss= ", avg_cost)
-            if avg_cost < 5.0:
+            if avg_cost < 10.0:
-                trainer.save_params(save_path)
+                trainer.save_params(save_dirname)
-                return
+                trainer.stop()
            if math.isnan(avg_cost):
                sys.exit("got NaN loss, training failed.")
    trainer = fluid.Trainer(
-        train_program, fluid.optimizer.SGD(learning_rate=0.001), place=place)
+        train_func=train_program,
+        optimizer=fluid.optimizer.SGD(learning_rate=0.001),
+        place=place)
    trainer.train(
-        reader=train_reader, num_epochs=1, event_handler=event_handler)
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
-def infer(use_cuda, inference_program, save_path):
+def infer(use_cuda, inference_program, save_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=save_path, place=place)
+        infer_func=inference_program, param_path=save_dirname, place=place)
    lod = [0, 1]
    first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
@@ -127,12 +136,14 @@ def infer(use_cuda, inference_program, save_path):
    third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
    fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
-    result = inferencer.infer({
+    result = inferencer.infer(
+        {
            'firstw': first_word,
            'secondw': second_word,
            'thirdw': third_word,
            'forthw': fourth_word
-    })
+        },
+        return_numpy=False)
    print(np.array(result[0]))
@@ -140,9 +151,17 @@ def main(use_cuda, is_sparse):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
-    save_path = "word2vec.params"
+    save_path = "word2vec.inference.model"
-    train(use_cuda, partial(train_program, is_sparse), save_path)
-    infer(use_cuda, partial(inference_program, is_sparse), save_path)
+    train(
+        use_cuda=use_cuda,
+        train_program=partial(train_program, is_sparse),
+        save_dirname=save_path)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=partial(inference_program, is_sparse),
+        save_dirname=save_path)
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -109,6 +109,24 @@ class TestDetection(unittest.TestCase):
        print(str(program))
+# Builds a prior_box layer on top of a conv2d feature map and checks the
+# shapes of the two returned variables.
+class TestPriorBox(unittest.TestCase):
+    def test_prior_box(self):
+        data_shape = [3, 224, 224]
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+        # max_sizes, variance, steps and offset keep their defaults.
+        box, var = layers.prior_box(
+            input=conv1,
+            image=images,
+            min_sizes=[100.0],
+            aspect_ratios=[1.],
+            flip=True,
+            clip=True)
+        # Boxes and variances are both 4-D, share one shape, and have a
+        # last dimension of size 4.
+        assert len(box.shape) == 4
+        assert box.shape == var.shape
+        assert box.shape[3] == 4
 class TestMultiBoxHead(unittest.TestCase):
    def test_multi_box_head(self):
        data_shape = [3, 224, 224]

--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
@@ -14,42 +14,24 @@
 import unittest
 import numpy as np
-from paddle.fluid.op import Operator
+from op_test import OpTest
-import paddle.fluid.core as core
-def create_tensor(scope, name, np_data):
+class TestEmpty(OpTest):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(np_data.shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-class TestIsEmptyOp(unittest.TestCase):
    def setUp(self):
-        self.scope = core.Scope()
+        self.op_type = "is_empty"
-        # create input variables
+        self.inputs = {'X': np.array([1, 2, 3])}
-        np_data0 = np.array([0, 1, 2])
+        self.outputs = {'Out': np.array([False])}
-        create_tensor(self.scope, "X0", np_data0)
-        np_data1 = np.array([1])
-        t = create_tensor(self.scope, "X1", np_data1)
-        t.set_dims([0])
-        # create output variables
+    def test_check_output(self):
-        self.scope.var("out")
+        self.check_output()
-    def test_no_empty(self):
-        self.one_case("X0", False)
-    def test_empty(self):
+class TestNotEmpty(TestEmpty):
-        self.one_case("X1", True)
+    def setUp(self):
+        self.op_type = "is_empty"
-    def one_case(self, input, target):
+        self.inputs = {'X': np.array([])}
-        op = Operator(type="is_empty", X=input, Out="out")
+        self.outputs = {'Out': np.array([True])}
-        op.run(self.scope, core.CPUPlace())
-        out = self.scope.var("out").get_tensor()
-        self.assertEqual(np.array(out)[0], target)
 if __name__ == "__main__":

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -24,7 +24,8 @@ dtype_to_size = {
    core.VarDesc.VarType.INT16: 2,
    core.VarDesc.VarType.INT32: 4,
    core.VarDesc.VarType.INT64: 8,
-    core.VarDesc.VarType.BOOL: 1
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
 }
 SUB_BLOCK_OPS = [