diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 7d53554358497762b1cd91c39bdd23c5807af2bc..df186637726f60ee1b69cec7291477f3efcd059c 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -16,12 +16,10 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
-        if(IS_DIRECTORY ${src})
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
-        else()
-            add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
-        endif()
+        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+          COMMAND mkdir -p "${dst}"  # make sure the destination directory exists
+          COMMAND cp -r "${src}" "${dst}"  # -r copies plain files and directories alike
+          COMMENT "copying ${src} -> ${dst}")
     endforeach()
 endfunction()
 
@@ -53,11 +51,11 @@ IF(NOT PROTOBUF_FOUND)
 ENDIF(NOT PROTOBUF_FOUND)
 
 # paddle fluid module
-set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
+set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
+  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
 )
 
@@ -69,7 +67,7 @@ copy(memory_lib
 
 set(module "inference")
 copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
diff --git a/doc/build_and_install/build_cn.md b/doc/build_and_install/build_cn.md
deleted file mode 100644
index 4a80a5245102fb992f513a749f6a02e1130188af..0000000000000000000000000000000000000000
--- a/doc/build_and_install/build_cn.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# 用Docker编译和测试PaddlePaddle
-
-## 需要的软硬件
-
-为了开发PaddlePaddle，我们需要
-
-1. 一台电脑，可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统，以及
-1. Docker。
-
-不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker image 里。
-
-## 总体流程
-
-1. 获取源码
-
-   ```bash
-   git clone https://github.com/paddlepaddle/paddle
-   ```
-
-2. 安装开发工具到 Docker image 里
-
-   ```bash
-   cd paddle; docker build -t paddle:dev .
-   ```
-
-   请注意这个命令结尾处的 `.`；它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile)，按照其内容创建一个名为 `paddle:dev` 的 Docker image，并且把各种开发工具安装进去。
-
-3. 编译
-
-   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image，同时把当前目录（源码树根目录）映射为 container 里的 `/paddle` 目录，并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码，结果输出到 `/paddle/build`，也就是本地的源码树根目录里的 `build` 子目录。
-
-   ```bash
-   docker run --rm -v $PWD:/paddle paddle:dev
-   ```
-
-   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本，可以用
-
-   ```bash
-   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
-   ```
-
-4. 运行单元测试
-
-   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试：
-
-   ```bash
-   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   如果编译的时候我们用了 `WITH_GPU=OFF` 选项，那么编译过程只会产生 CPU-based 单元测试，那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要：
-
-   ```bash
-   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   有时候我们只想运行一个特定的单元测试，比如 `memory_test`，我们可以
-
-   ```bash
-   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
-   ```
-
-5. 清理
-
-   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要：
-
-   ```bash
-   rm -rf build
-   ```
-
-## 为什么要 Docker 呀？
-
-- 什么是 Docker?
-
-  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
-
-- Docker 还是虚拟机？
-
-  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
-
-- 为什么用 Docker?
-
-  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
-
-  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
-
-- 我可以选择不用Docker吗？
-
-  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
-
-- 学习 Docker 有多难？
-
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
-
-- 我可以用 IDE 吗？
-
-  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
-
-  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
-
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
-
-  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
-
-- 可以并行编译吗？
-
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
-
-## 可能碰到的问题
-
-- Docker 需要 sudo
-
-  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
-
-- 在 Windows/MacOS 上编译很慢
-
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
-
-- 磁盘不够
-
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
diff --git a/doc/build_and_install/build_en.md b/doc/build_and_install/build_en.md
deleted file mode 100644
index 91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663..0000000000000000000000000000000000000000
--- a/doc/build_and_install/build_en.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Build using Docker
-
-## What Developers Need
-
-To contribute to PaddlePaddle, you need
-
-1. A computer -- Linux, BSD, Windows, MacOS, and
-1. Docker.
-
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
-
-## General Process
-
-1. Retrieve source code.
-
-   ```bash
-   git clone https://github.com/paddlepaddle/paddle
-   ```
-
-2. Install build tools into a Docker image.
-
-   ```bash
-   cd paddle; docker build -t paddle:dev .
-   ```
-
-   Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).  `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
-
-3. Build from source.
-
-   This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile.  `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
-
-   ```bash
-   docker run -v $PWD:/paddle paddle:dev
-   ```
-
-   Above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
-
-   ```bash
-   docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
-   ```
-
-4. Run unit tests.
-
-   To run all unit tests using the first GPU of a node:
-
-   ```bash
-   NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them.  We can just run
-
-   ```bash
-   docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
-   ```
-
-   Sometimes we want to run a specific unit test, say `memory_test`, we can run
-
-   ```bash
-   nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
-   ```
-
-5. Clean Build.
-
-   Sometimes, we might want to clean all thirt-party dependents and built binaries.  To do so, just
-
-   ```bash
-   rm -rf build
-   ```
-
-## Docker, Or Not?
-
-- What is Docker?
-
-  If you haven't heard of it, consider it something like Python's virtualenv.
-
-- Docker or virtual machine?
-
-  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
-
-- Why Docker?
-
-  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
-
-  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
-
-- Can I choose not to use Docker?
-
-  Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer.  This document exists because Docker would make the development way easier.
-
-- How difficult is it to learn Docker?
-
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
-
-- Can I use my favorite IDE?
-
-  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
-
-  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
-
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
-
-  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
-
-- Does Docker do parallel building?
-
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
-
-## Some Gotchas
-
-- Docker requires sudo
-
-  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
-
-- Docker on Windows/MacOS builds slowly
-
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
-
-- Not enough disk space
-
-  Examples in this article uses option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
diff --git a/doc/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
index ff904b1022a41612c9680dce92d3fc2c69ad7e93..fec2d412f03f6b94422f0463d1985decd0c1bf99 100644
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -1,14 +1,26 @@
 从源码编译
 ======================
 
+.. _requirements:
+
+需要的软硬件
+----------------
+
+为了编译PaddlePaddle，我们需要
+
+1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
+2. Docker
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
+
 .. _build_step:
 
 编译方法
 ----------------
 
-PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者
+参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
 
 如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
 
@@ -16,15 +28,19 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
 
 .. code-block:: bash
 
+   # 1. 获取源码
    git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
-   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
+   # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
+   docker build -t paddle:dev .
+   # 3. 执行下面的命令编译CPU-Only的二进制
    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-   # 如果不使用Docker编译环境，执行下面的命令
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
+   # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+
+注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
+构建的镜像（上述第4步），会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` ，因此可以省略步骤3中
+最后的执行脚本的命令。
 
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 
@@ -50,28 +66,83 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
 
 如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
 
-使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
 开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
 
 .. code-block:: bash
 
    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
 
-如果不使用Docker，可以执行ctest命令即可：
+如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
 
 .. code-block:: bash
 
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
-   ctest
-   # 指定执行其中一个单元测试 test_mul_op
-   ctest -R test_mul_op
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   bash /paddle/paddle/scripts/docker/build.sh
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+常见问题
+----------------
+
+- 什么是 Docker?
+
+  如果您没有听说 Docker，可以把它想象为一个类似 virtualenv 的系统，但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机？
+
+  有人用虚拟机来类比 Docker。需要强调的是：Docker 不会虚拟任何硬件，Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的，性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题，其他人可以复现问题以便帮助。
+
+  另外，对于习惯使用Windows和MacOS的开发者来说，使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗？
+
+  当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式，把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程，是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难？
+
+  理解 Docker 并不难，大概花十分钟看一下 `这篇文章 <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗？
+
+  当然可以，因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码，我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 ``~/.emacs`` 配置文件里加两行
+
+  .. code-block:: emacs
+
+     (global-set-key "\C-cc" 'compile)
+     (setq compile-command
+      "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  就可以按 ``Ctrl-C`` 和 ``c`` 键来启动编译了。
+
+- 可以并行编译吗？
+
+  是的。我们的 Docker image 运行一个 `Bash 脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 ``make -j$(nproc)`` 来启动和 CPU 核一样多的进程来并行编译。
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发，自然也就有管理员权限（sudo）了。如果用公用的电脑开发，需要请管理员安装和配置好 Docker。此外，PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术，比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `这个issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
+
+- 磁盘不够
+
+  本文中的例子里， ``docker run`` 命令里都用了 ``--rm`` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 ``docker ps -a`` 命令看到停止后但是没有删除的 containers。 ``docker build`` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `这篇文章 <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
+
 
 .. _compile_deps:
 
-编译依赖
+附录：编译依赖
 ----------------
 
 PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其他的依赖软件，会自动在编译时下载。
@@ -91,7 +162,7 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
 
 .. _build_options:
 
-编译选项
+附录：编译选项
 ----------------
 
 PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。
diff --git a/doc/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
index 718fb869c23a1f7be82c87c726282bded9dad516..29a1439e4cec50c15cb965a788070f21c704caad 100644
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -1,32 +1,45 @@
 Build from Sources
 ==========================
 
-.. _build_step:
+.. _requirements:
 
-How To Build
+Requirements
 ----------------
 
-PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
-tools. We recommend you to use our pre-built Docker image to run the build
-to avoid installing dependencies by yourself. We have several build environment
-Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+To build PaddlePaddle, you need
+
+1. A computer running Linux, Windows, or MacOS.
+2. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+We run all the tools by running this image.
+
+.. _build_step:
 
-If you choose not to use Docker image for your build, you need to install the
-below `Compile Dependencies`_ before run the build.
+How To Build
+----------------
 
-Then run:
+Use Docker to build PaddlePaddle so that you do not have to install
+the build dependencies yourself. We have several pre-built
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+or you can build your own image from source, as shown in the optional step below:
 
 .. code-block:: bash
 
+   # 1. clone the source code
    git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
-   # run the following command to build a CPU-Only binaries if you are using docker
+   # 2. Optional: build development docker image from source
+   docker build -t paddle:dev .
+   # 3. Run the following command to build CPU-only binaries
    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
-   # else run these commands
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
-   make
+   # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+
+NOTE: The above commands mount the current working directory (root directory of source code)
+into the :code:`/paddle` directory inside the Docker container. If you are using your own image
+(Step 4), it runs the default entry-point :code:`build.sh` , so you can omit the trailing
+script invocation that step 3 passes explicitly.
 
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
@@ -61,22 +74,75 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
 
    docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
 
-If you don't use Docker, just run ctest will start the tests:
+If you wish to run only one unit test, like :code:`test_sum_op`:
 
 .. code-block:: bash
 
-   mkdir build
-   cd build
-   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
-   make
-   ctest
-   # run a single test like test_mul_op
-   ctest -R test_mul_op
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   bash /paddle/paddle/scripts/docker/build.sh
+   cd /paddle/build
+   ctest -R test_sum_op -V
+
+.. _faq_docker:
+
+Frequently Asked Questions
+--------------------------
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker neither virtualizes any hardware nor runs a guest OS, which means there is no compromise on performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
 
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer.  This document exists because Docker would make the development way easier.
+
+- How difficult is it to learn Docker?
+
+  It takes about ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ , and that saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require new tools.  Not to mention the time saved when other people try to reproduce the issue you have.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their ``~/.emacs`` configuration file:
+
+  .. code-block:: emacs
+
+     (global-set-key "\C-cc" 'compile)
+     (setq compile-command
+      "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+
+  so they can type ``Ctrl-C`` and ``c`` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our building Docker image runs a `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls ``make -j$(nproc)`` to start as many processes as the number of your CPU cores.
+
+- Docker requires sudo
+
+  An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs to make the build efficient.  Please refer to `this issue <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
+
+- Not enough disk space
+
+  Examples in this article use option ``--rm`` with the ``docker run`` command.  This option ensures that stopped containers do not remain on hard disks.  We can use ``docker ps -a`` to list all containers, including stopped ones.  Sometimes ``docker build`` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
 
 .. _compile_deps:
 
-Compile Dependencies
+Appendix: Compile Dependencies
 ----------------
 
 PaddlePaddle need the following dependencies when compiling, other dependencies
@@ -97,17 +163,13 @@ will be downloaded automatically.
 
 .. _build_options:
 
-Build Options
+Appendix: Build Options
 ----------------
 
 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
 For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
 
-.. _build_options_bool:
-
-Bool Type Options
-----------------
 
 You can add :code:`-D` argument to pass such options, like:
 
diff --git a/doc/build_and_install/index_cn.rst b/doc/build_and_install/index_cn.rst
index 4220ff2279333f25eb644227100308428bf72362..c0b60f55895f11bcbaa06bc65c973180b3661cfc 100644
--- a/doc/build_and_install/index_cn.rst
+++ b/doc/build_and_install/index_cn.rst
@@ -13,7 +13,6 @@ PaddlePaddle提供pip和Docker的安装方式：
 
    pip_install_cn.rst
    docker_install_cn.rst
-   build_cn.md
 
 编译流程
 ++++++++
diff --git a/doc/build_and_install/index_en.rst b/doc/build_and_install/index_en.rst
index db6b5be742be1619c52f5f7000bec013e818693d..7e0ca5bcbdbad0a3c97c0045bb57b51137668161 100644
--- a/doc/build_and_install/index_en.rst
+++ b/doc/build_and_install/index_en.rst
@@ -13,8 +13,6 @@ You can choose either pip or Docker to complete your install:
 
    pip_install_en.rst
    docker_install_en.rst
-   build_en.md
-
 
 Build from Source
 -----------------
diff --git a/doc/design/switch_kernel.md b/doc/design/kernel_selection.md
similarity index 100%
rename from doc/design/switch_kernel.md
rename to doc/design/kernel_selection.md
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 4dc3de54deef71e42d73e0943021691c5e39f7c7..6e5ceefadd7c35d743df454909beaef37d9343d5 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -106,9 +106,11 @@ class Vector {
   // std::vector iterator methods. Based on CPU data access method
   size_t size() const { return size_; }
 
-  T* begin() { return &this->operator[](0); }
+  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  T* end() { return &this->operator[](size()); }
+  T* end() {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
 
   T& front() { return *begin(); }
 
@@ -118,8 +120,13 @@ class Vector {
     return *it;
   }
 
-  const T* begin() const { return &this->operator[](0); }
-  const T* end() const { return &this->operator[](size()); }
+  const T* begin() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+  }
+
+  const T* end() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }
 
   const T* cbegin() const { return begin(); }
 
@@ -358,6 +365,11 @@ class Vector {
     }
   }
 
+  static T& EmptyDummy() {  // used by begin()/end() when capacity() == 0
+    static T dummy = T();  // function-local static, value-initialized once
+    return dummy;
+  }
+
   mutable int flag_;
   mutable Tensor cpu_vec_;
   mutable Tensor cuda_vec_;
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index 0d5a914eac78096bd41814080bf4b2105d25e187..8ea574b31cc5825cc2f788377d66c4c45e723898 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -98,3 +98,9 @@ TEST(mixed_vector, InitWithCount) {
     ASSERT_EQ(vec[i], 10);
   }
 }
+
+TEST(mixed_vector, ForEach) {
+  vec<int> tmp;  // default-constructed; presumably holds no elements
+  for (auto& v : tmp) {  // passes as long as iterating tmp does not fail
+  }
+}
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index bfbb2cfc2c57c705cf42c65825edcc6dea08cf41..2746168f1dda493368b81820bde2f093d06d7b4e 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -25,7 +25,10 @@ namespace framework {
 class CosineOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -44,7 +47,10 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 61529fe38b15fe2a4bfa0d64159994d6b62fb086..8effbf1bc6298bdcc381e2176411a79da134653f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -64,6 +64,18 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  if (platform::is_gpu_place(place)) {  // GPU places get extra handling
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("Cannot run operator on place %s", place);  // no CUDA built in
+#else
+    auto dev_id = boost::get<platform::CUDAPlace>(place).device;
+    platform::SetDeviceId(dev_id);  // switch to the place's device first
+#endif
+  }
+  RunImpl(scope, place);  // delegate the actual work to RunImpl
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -479,8 +491,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
-void OperatorWithKernel::Run(const Scope& scope,
-                             const platform::Place& place) const {
+void OperatorWithKernel::RunImpl(const Scope& scope,
+                                 const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 52300abeb7df346d610d2363335dc9d3330ee39e..708f87dc8632ac500e1050122c5fd5412071fd22 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -89,8 +89,9 @@ class OperatorBase {
 
   std::string DebugString() const { return DebugStringEx(nullptr); }
 
-  /// Net will call this function to Run an op.
-  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
+  /// Net will call this interface function to Run an op.
+  //  The implementation should be written at RunImpl
+  void Run(const Scope& scope, const platform::Place& place);
 
   // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
   virtual void Stop() {}
@@ -144,6 +145,8 @@ class OperatorBase {
  private:
   void GenerateTemporaryNames();
   void CheckAllInputOutputSet() const;
+  virtual void RunImpl(const Scope& scope,
+                       const platform::Place& place) const = 0;
 };
 
 // Macro for define a clone method.
@@ -168,10 +171,13 @@ class OperatorBase {
 class NOP : public OperatorBase {
  public:
   using OperatorBase::OperatorBase;
-  void Run(const Scope& scope, const platform::Place& place) const override {}
   std::unique_ptr<OperatorBase> Clone() const override {
     return std::unique_ptr<OperatorBase>(new NOP(*this));
   }
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {}
 };
 
 class ExecutionContext {
@@ -363,8 +369,6 @@ class OperatorWithKernel : public OperatorBase {
                      const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const Scope& scope, const platform::Place& place) const final;
-
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
   AllOpKernels() {
     static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
@@ -393,6 +397,7 @@ class OperatorWithKernel : public OperatorBase {
   // indicate kernel DataType by input data. Defaultly all input data must be
   // same.
   proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
+  void RunImpl(const Scope& scope, const platform::Place& place) const final;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index b90f5538bb620275521cdc11bf47b4014b2a66e2..0732ec5afe8738313e1d73c52c5303a2e8b1e96a 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -28,7 +28,10 @@ class OpWithoutKernelTest : public OperatorBase {
   OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
                       const VariableNameMap& outputs, const AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs), x(1) {}
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++op_run_num;
     ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
     ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
@@ -259,8 +262,10 @@ class OperatorClone : public paddle::framework::OperatorBase {
                 const paddle::framework::VariableNameMap& outputs,
                 const paddle::framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const paddle::framework::Scope& scope,
-           const paddle::platform::Place& place) const override {}
+
+ private:
+  void RunImpl(const paddle::framework::Scope& scope,
+               const paddle::platform::Place& place) const override {}
 };
 
 TEST(Operator, Clone) {
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 9fe76afb582a13b741ab086f0c62d77e86d4e8bb..cddd5a786c45c804014d82012ec3a7ef988491a5 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -29,6 +29,6 @@ inference_test(image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp)
 inference_test(recommender_system)
-inference_test(rnn_encoder_decoder)
+#inference_test(rnn_encoder_decoder)
 inference_test(understand_sentiment)
 inference_test(word2vec)
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index bf8e11bd8c047275fe341ead9424d02e98d5d8f4..69464c4cff52400d8a25a692c5df8d2fe06230e4 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -31,8 +31,10 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index f99f9af4276c0e8928f821ae166d55aed02e8e27..b72e72b12f8a6155b6eb3be1468b8dbc7bd48d4e 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -71,8 +71,10 @@ class AssignOp : public framework::OperatorBase {
            const framework::VariableNameMap &outputs,
            const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) {
       return;
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 7737d4e098ac9a0e56e1db2aee796550e8d71ba3..6d3efcfeb8497a78d56180898e5e3a66e52ff22d 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -55,8 +55,10 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                      const framework::VariableNameMap& outputs,
                      const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
 
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index 9e2a05a60c30e388093aceddd40e58273364c8f9..bfbe78097d2f20ae4c5efa594d17f931c7ea5920 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -204,8 +204,9 @@ class BeamSearchOp : public framework::OperatorBase {
     PADDLE_THROW("Not Implemented");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
     auto pre_ids_var = scope.FindVar(Input("pre_ids"));
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index f3414c33b5ab3cc8dffee640fd85b9625b3f237b..b1f09fb0029affe671d63874cf3d3db86476c367 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -102,3 +102,5 @@ REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
 REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
+REGISTER_LOGICAL_OP(not_equal, "Out = X != Y");
+REGISTER_LOGICAL_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/compare_op.cu b/paddle/fluid/operators/compare_op.cu
index 3507af2ae3add8cf02f5b9f3b3d89b40d73bcb0d..00263a2ade4502e732d53b871665185f8d0fa9f1 100644
--- a/paddle/fluid/operators/compare_op.cu
+++ b/paddle/fluid/operators/compare_op.cu
@@ -17,3 +17,4 @@ limitations under the License. */
 REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
 REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
+REGISTER_LOGICAL_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor);
diff --git a/paddle/fluid/operators/compare_op.h b/paddle/fluid/operators/compare_op.h
index 4b2ee5a9d68f5f1fd3d2d374669763855659f1db..c651335268fee08c08bcac6247f5a2ff92784330 100644
--- a/paddle/fluid/operators/compare_op.h
+++ b/paddle/fluid/operators/compare_op.h
@@ -48,6 +48,14 @@ struct EqualFunctor {
   }
 };
 
+template <typename T>
+struct NotEqualFunctor {  // Binary predicate: negation of EqualFunctor<T>.
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    return !EqualFunctor<T>()(a, b);  // defer to EqualFunctor's comparison
+  }
+};
+
 template <typename DeviceContext, typename Functor>
 class CompareOpKernel
     : public framework::OpKernel<typename Functor::ELEM_TYPE> {
diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
index dd93790d5b52a2ccc8358a94f7ead346d384f191..d63748a61cec0f10269e05bcef3bb0d10345000d 100644
--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
@@ -193,7 +193,7 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
   }
 }
 
-void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
   // get device context from pool
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& dev_ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
index 695af4490696b29d2d47f5825ebc0159b39663c0..0bb14bc8c2cfabeeb13e1e1afd51b034742b74f0 100644
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
@@ -77,8 +77,9 @@ class CondOp : public framework::OperatorBase {
     sub_net_op_[FALSE_BRANCH] = std::move(net);
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override;
 
  private:
   const int TRUE_BRANCH = 0;
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 30435c6cca0a4fb1d41dce47b8fefeafb6c48a51..228b0998360550348fdd30c842a394e8f8ce5935 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -65,8 +65,10 @@ class ConditionalBlockOp : public ConditionalOp {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = InputTensors(scope);
 
     bool need_run;
@@ -128,8 +130,10 @@ class ConditionalBlockGradOp : public ConditionalOp {
                          const framework::VariableNameMap &outputs,
                          const framework::AttributeMap &attrs)
       : ConditionalOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto xs = this->InputTensors(scope);
 
     bool need_run;
diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc
index d1ba51f2c0f13a1b6e4d7ccb93c912703a0b1d86..1393f1a66baaf3b53f797aa61fd42ac3cf54f8db 100644
--- a/paddle/fluid/operators/create_reader_op.cc
+++ b/paddle/fluid/operators/create_reader_op.cc
@@ -106,8 +106,10 @@ template <typename T>
 class CreateRandomDataGeneratorOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
@@ -155,8 +157,10 @@ class CreateRandomDataGeneratorOpMaker
 class CreateShuffleReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
@@ -187,8 +191,10 @@ class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
 class CreateBatchReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48308a11b4b313ec19b578110b9e369f4bfc52bf
--- /dev/null
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -0,0 +1,184 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection_map_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class DetectionMAPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks the required inputs/outputs and their shapes; MAP has shape [1].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DetectRes"),
+                   "Input(DetectRes) of DetectionMAPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of DetectionMAPOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AccumPosCount"),
+        "Output(AccumPosCount) of DetectionMAPOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AccumTruePos"),
+        "Output(AccumTruePos) of DetectionMAPOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AccumFalsePos"),
+        "Output(AccumFalsePos) of DetectionMAPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MAP"),
+                   "Output(MAP) of DetectionMAPOp should not be null.");
+    // DetectRes and Label must both be 2-D tensors with 6 columns.
+    auto det_dims = ctx->GetInputDim("DetectRes");
+    PADDLE_ENFORCE_EQ(det_dims.size(), 2UL,
+                      "The rank of Input(DetectRes) must be 2, "
+                      "the shape is [N, 6].");
+    PADDLE_ENFORCE_EQ(det_dims[1], 6UL,
+                      "The shape of Input(DetectRes) is [N, 6].");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+                      "The rank of Input(Label) must be 2, "
+                      "the shape is [N, 6].");
+    PADDLE_ENFORCE_EQ(label_dims[1], 6UL,
+                      "The shape of Input(Label) is [N, 6].");
+    // When PosCount is given, TruePos and FalsePos are required as well.
+    if (ctx->HasInput("PosCount")) {
+      PADDLE_ENFORCE(ctx->HasInput("TruePos"),
+                     "Input(TruePos) of DetectionMAPOp should not be null when "
+                     "Input(PosCount) is not null.");
+      PADDLE_ENFORCE(
+          ctx->HasInput("FalsePos"),
+          "Input(FalsePos) of DetectionMAPOp should not be null when "
+          "Input(PosCount) is not null.");
+    }
+    // Only MAP has a static shape; no other output dim is set here.
+    ctx->SetOutputDim("MAP", framework::make_ddim({1}));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::Tensor>("DetectRes")->type()),
+        ctx.device_context());  // kernel data type follows DetectRes
+  }
+};
+
+class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DetectionMAPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("DetectRes",
+             "(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the "
+             "detections. Each row has 6 values: "
+             "[label, confidence, xmin, ymin, xmax, ymax], M is the total "
+             "number of detect results in this mini-batch. For each instance, "
+             "the offsets in first dimension are called LoD, the number of "
+             "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+             "no detected data.");
+    AddInput("Label",
+             "(LoDTensor) A 2-D LoDTensor with shape [N, 6] represents the "
+             "Labeled ground-truth data. Each row has 6 values: "
+             "[label, is_difficult, xmin, ymin, xmax, ymax], N is the total "
+             "number of ground-truth data in this mini-batch. For each "
+             "instance, the offsets in first dimension are called LoD, "
+             "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
+             "means there is no ground-truth data.");
+    AddInput("PosCount",
+             "(Tensor) A tensor with shape [Ncls, 1], store the "
+             "input positive example count of each class, Ncls is the count of "
+             "input classification. "
+             "This input is used to pass the AccumPosCount generated by the "
+             "previous mini-batch when the multi mini-batches cumulative "
+             "calculation carried out. "
+             "When the input(PosCount) is empty, the cumulative "
+             "calculation is not carried out, and only the results of the "
+             "current mini-batch are calculated.")
+        .AsDispensable();
+    AddInput("TruePos",
+             "(LoDTensor) A 2-D LoDTensor with shape [Ntp, 2], store the "
+             "input true positive example of each class. "
+             "This input is used to pass the AccumTruePos generated by the "
+             "previous mini-batch when the multi mini-batches cumulative "
+             "calculation carried out. ")
+        .AsDispensable();
+    AddInput("FalsePos",
+             "(LoDTensor) A 2-D LoDTensor with shape [Nfp, 2], store the "
+             "input false positive example of each class. "
+             "This input is used to pass the AccumFalsePos generated by the "
+             "previous mini-batch when the multi mini-batches cumulative "
+             "calculation carried out. ")
+        .AsDispensable();
+    AddOutput("AccumPosCount",
+              "(Tensor) A tensor with shape [Ncls, 1], store the "
+              "positive example count of each class. It combines the "
+              "input(PosCount) and the positive example count computed from "
+              "input(DetectRes) and input(Label).");
+    AddOutput("AccumTruePos",
+              "(LoDTensor) A LoDTensor with shape [Ntp', 2], store the "
+              "true positive example of each class. It combines the "
+              "input(TruePos) and the true positive examples computed from "
+              "input(DetectRes) and input(Label).");
+    AddOutput("AccumFalsePos",
+              "(LoDTensor) A LoDTensor with shape [Nfp', 2], store the "
+              "false positive example of each class. It combines the "
+              "input(FalsePos) and the false positive examples computed from "
+              "input(DetectRes) and input(Label).");
+    AddOutput("MAP",
+              "(Tensor) A tensor with shape [1], store the mAP evaluate "
+              "result of the detection.");
+    // Attributes: matching threshold, difficult-box handling, AP algorithm.
+    AddAttr<float>(
+        "overlap_threshold",
+        "(float, default 0.3) "
+        "The lower bound jaccard overlap threshold of detection output and "
+        "ground-truth data.")
+        .SetDefault(.3f);
+    AddAttr<bool>("evaluate_difficult",
+                  "(bool, default true) "
+                  "Switch to control whether the difficult data is evaluated.")
+        .SetDefault(true);
+    AddAttr<std::string>("ap_type",
+                         "(string, default 'integral') "
+                         "The AP algorithm type, 'integral' or '11point'.")
+        .SetDefault("integral")
+        .InEnum({"integral", "11point"})
+        .AddCustomChecker([](const std::string& ap_type) {
+          PADDLE_ENFORCE_NE(GetAPType(ap_type), APType::kNone,
+                            "The ap_type should be 'integral' or '11point'.");
+        });
+    AddComment(R"DOC(
+Detection mAP evaluate operator.
+The general steps are as follows. First, calculate the true positive and
+ false positive according to the input of detection and labels, then
+ calculate the mAP evaluate value.
+ Supporting '11 point' and 'integral' mAP algorithm. Please get more information
+ from the following articles:
+ https://sanchom.wordpress.com/tag/average-precision/
+ https://arxiv.org/abs/1512.02325
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp,
+                             ops::DetectionMAPOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
+    ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f5f588e9c448a6d84d388848aa5701f2b4882dd
--- /dev/null
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -0,0 +1,451 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+enum APType { kNone = 0, kIntegral, k11point };
+
+// Maps the "ap_type" attribute string to its APType value; any string other
+// than "integral" or "11point" yields APType::kNone.
+// Marked inline: the function is defined in a header, so a non-inline
+// definition would clash at link time once two translation units include it.
+inline APType GetAPType(const std::string& str) {
+  if (str == "integral") return APType::kIntegral;
+  if (str == "11point") return APType::k11point;
+  return APType::kNone;
+}
+
+template <typename T>  // T is the payload type; the score is always float
+inline bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                                 const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;  // true when pair1 has the higher score
+}
+
+// Stable-sorts (score, flag) pairs by descending score and stores the running
+// sum of the flags in *accu_vec. Scores are compared as T, without narrowing.
+template <typename T>
+inline void GetAccumulation(std::vector<std::pair<T, int>> in_pairs,
+                            std::vector<int>* accu_vec) {
+  using P = std::pair<T, int>;
+  std::stable_sort(in_pairs.begin(), in_pairs.end(),
+                   [](const P& a, const P& b) { return a.first > b.first; });
+  accu_vec->clear();
+  int sum = 0;
+  for (const P& p : in_pairs) accu_vec->push_back(sum += p.second);
+}
+
+template <typename Place, typename T>
+class DetectionMAPOpKernel : public framework::OpKernel<T> {
+ public:
+  // Computes the mAP of this mini-batch. If Input(PosCount) is given, the
+  // PosCount/TruePos/FalsePos statistics are merged in first; the merged
+  // statistics go to the Accum* outputs and the mAP to Output(MAP).
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_detect = ctx.Input<framework::LoDTensor>("DetectRes");
+    auto* in_label = ctx.Input<framework::LoDTensor>("Label");
+    auto* out_map = ctx.Output<framework::Tensor>("MAP");
+
+    auto* in_pos_count = ctx.Input<framework::Tensor>("PosCount");
+    auto* in_true_pos = ctx.Input<framework::LoDTensor>("TruePos");
+    auto* in_false_pos = ctx.Input<framework::LoDTensor>("FalsePos");
+
+    auto* out_pos_count = ctx.Output<framework::Tensor>("AccumPosCount");
+    auto* out_true_pos = ctx.Output<framework::LoDTensor>("AccumTruePos");
+    auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");
+
+    float overlap_threshold = ctx.Attr<float>("overlap_threshold");
+    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
+    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
+
+    auto label_lod = in_label->lod();
+    auto detect_lod = in_detect->lod();
+    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
+                      "Only support one level sequence now.");
+    PADDLE_ENFORCE(!detect_lod.empty(), "Input(DetectRes) must carry LoD.");
+    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
+                      "The batch_size of input(Label) and input(DetectRes) "
+                      "must be the same.");
+
+    std::vector<std::map<int, std::vector<Box>>> gt_boxes;
+    std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes;
+    GetBoxes(*in_label, *in_detect, gt_boxes, detect_boxes);
+
+    std::map<int, int> label_pos_count;
+    std::map<int, std::vector<std::pair<T, int>>> true_pos;
+    std::map<int, std::vector<std::pair<T, int>>> false_pos;
+    if (in_pos_count != nullptr) {
+      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
+                  true_pos, false_pos);
+    }
+
+    CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
+                             overlap_threshold, label_pos_count, true_pos,
+                             false_pos);
+    T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos);
+
+    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
+                 *out_true_pos, *out_false_pos);
+    T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
+    map_data[0] = map;
+  }
+
+ protected:
+  struct Box {  // Box in corner form (xmin, ymin, xmax, ymax).
+    Box(T xmin, T ymin, T xmax, T ymax)
+        : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax), is_difficult(false) {}
+
+    T xmin, ymin, xmax, ymax;
+    bool is_difficult;  // false unless assigned after construction
+  };
+
+  // Returns the Jaccard overlap (intersection area / union area) of two boxes.
+  // Yields 0 when the boxes are disjoint or when the union area is not
+  // positive (degenerate boxes), which would otherwise divide 0 by 0.
+  inline T JaccardOverlap(const Box& box1, const Box& box2) const {
+    if (box2.xmin > box1.xmax || box2.xmax < box1.xmin ||
+        box2.ymin > box1.ymax || box2.ymax < box1.ymin) {
+      return 0.0;
+    }
+    T inter_width =
+        std::min(box1.xmax, box2.xmax) - std::max(box1.xmin, box2.xmin);
+    T inter_height =
+        std::min(box1.ymax, box2.ymax) - std::max(box1.ymin, box2.ymin);
+    T inter_area = inter_width * inter_height;
+
+    T bbox_area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin);
+    T bbox_area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin);
+    T union_area = bbox_area1 + bbox_area2 - inter_area;
+
+    return union_area > 0 ? inter_area / union_area : static_cast<T>(0);
+  }
+
+  // Splits the Label and DetectRes tensors into per-image maps keyed by class
+  // label, appending one map per image to gt_boxes and detect_boxes.
+  void GetBoxes(const framework::LoDTensor& input_label,
+                const framework::LoDTensor& input_detect,
+                std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
+                std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
+                    detect_boxes) const {
+    auto labels = framework::EigenTensor<T, 2>::From(input_label);
+    auto detect = framework::EigenTensor<T, 2>::From(input_detect);
+    auto label_lod = input_label.lod();
+    auto detect_lod = input_detect.lod();
+    int batch_size = label_lod[0].size() - 1;
+    auto label_index = label_lod[0];
+    // Label rows are read as [label, is_difficult, xmin, ymin, xmax, ymax].
+    for (int n = 0; n < batch_size; ++n) {
+      std::map<int, std::vector<Box>> boxes;
+      for (int i = label_index[n]; i < label_index[n + 1]; ++i) {
+        Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
+        int label = labels(i, 0);
+        auto is_difficult = labels(i, 1);
+        if (std::abs(is_difficult - 0.0) < 1e-6)
+          box.is_difficult = false;
+        else
+          box.is_difficult = true;
+        boxes[label].push_back(box);
+      }
+      gt_boxes.push_back(boxes);
+    }
+
+    auto detect_index = detect_lod[0];
+    for (int n = 0; n < batch_size; ++n) {
+      std::map<int, std::vector<std::pair<T, Box>>> boxes;
+      for (int i = detect_index[n]; i < detect_index[n + 1]; ++i) {
+        Box box(detect(i, 2), detect(i, 3), detect(i, 4), detect(i, 5));
+        int label = detect(i, 0);
+        auto score = detect(i, 1);
+        boxes[label].push_back(std::make_pair(score, box));
+      }
+      detect_boxes.push_back(boxes);
+    }
+  }
+
+  // Packs the accumulated statistics into the output tensors: row i of
+  // output_pos_count is the positive count of class i, and LoD segment i of
+  // output_true_pos / output_false_pos holds the (score, flag) rows of class i.
+  void GetOutputPos(
+      const framework::ExecutionContext& ctx,
+      const std::map<int, int>& label_pos_count,
+      const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+      const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
+      framework::Tensor& output_pos_count,
+      framework::LoDTensor& output_true_pos,
+      framework::LoDTensor& output_false_pos) const {
+    int max_class_id = 0;
+    for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
+      if (it->first > max_class_id) max_class_id = it->first;
+    }
+    // Count exactly the rows the fill loop below writes (classes 0 through
+    // max_class_id), so that loop cannot overrun the buffers allocated here.
+    int true_pos_count = 0;
+    int false_pos_count = 0;
+    for (int i = 0; i <= max_class_id; ++i) {
+      auto it_true_pos = true_pos.find(i);
+      if (it_true_pos != true_pos.end())
+        true_pos_count += it_true_pos->second.size();
+      auto it_false_pos = false_pos.find(i);
+      if (it_false_pos != false_pos.end())
+        false_pos_count += it_false_pos->second.size();
+    }
+
+    int* pos_count_data = output_pos_count.mutable_data<int>(
+        framework::make_ddim({max_class_id + 1, 1}), ctx.GetPlace());
+    T* true_pos_data = output_true_pos.mutable_data<T>(
+        framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
+    T* false_pos_data = output_false_pos.mutable_data<T>(
+        framework::make_ddim({false_pos_count, 2}), ctx.GetPlace());
+    true_pos_count = 0;
+    false_pos_count = 0;
+    std::vector<size_t> true_pos_starts = {0};
+    std::vector<size_t> false_pos_starts = {0};
+    for (int i = 0; i <= max_class_id; ++i) {
+      auto it_count = label_pos_count.find(i);
+      pos_count_data[i] =
+          it_count != label_pos_count.end() ? it_count->second : 0;
+      auto it_true_pos = true_pos.find(i);
+      if (it_true_pos != true_pos.end()) {
+        const std::vector<std::pair<T, int>>& true_pos_vec =
+            it_true_pos->second;
+        for (const std::pair<T, int>& tp : true_pos_vec) {
+          true_pos_data[true_pos_count * 2] = tp.first;
+          true_pos_data[true_pos_count * 2 + 1] = static_cast<T>(tp.second);
+          true_pos_count++;
+        }
+      }
+      true_pos_starts.push_back(true_pos_count);
+
+      auto it_false_pos = false_pos.find(i);
+      if (it_false_pos != false_pos.end()) {
+        const std::vector<std::pair<T, int>>& false_pos_vec =
+            it_false_pos->second;
+        for (const std::pair<T, int>& fp : false_pos_vec) {
+          false_pos_data[false_pos_count * 2] = fp.first;
+          false_pos_data[false_pos_count * 2 + 1] = static_cast<T>(fp.second);
+          false_pos_count++;
+        }
+      }
+      false_pos_starts.push_back(false_pos_count);
+    }
+    framework::LoD true_pos_lod;
+    true_pos_lod.emplace_back(true_pos_starts);
+    framework::LoD false_pos_lod;
+    false_pos_lod.emplace_back(false_pos_starts);
+    output_true_pos.set_lod(true_pos_lod);
+    output_false_pos.set_lod(false_pos_lod);
+  }
+
+  // Loads the statistics accumulated over earlier mini-batches: the per-class
+  // positive counts and the (score, flag) pairs of LoD segment i as class i.
+  void GetInputPos(
+      const framework::Tensor& input_pos_count,
+      const framework::LoDTensor& input_true_pos,
+      const framework::LoDTensor& input_false_pos,
+      std::map<int, int>& label_pos_count,
+      std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+      std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+    constexpr T kEPS = static_cast<T>(1e-6);
+    int class_number = input_pos_count.dims()[0];
+    const int* pos_count_data = input_pos_count.data<int>();
+    for (int i = 0; i < class_number; ++i) {
+      label_pos_count[i] = pos_count_data[i];
+    }
+    auto SetData = [](const framework::LoDTensor& pos_tensor,
+                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
+      const T* pos_data = pos_tensor.data<T>();
+      auto pos_data_lod = pos_tensor.lod();
+      if (pos_data_lod.empty()) return;
+      // Walk the level-0 offsets (one segment per class), not the LoD levels.
+      for (size_t i = 0; i + 1 < pos_data_lod[0].size(); ++i) {
+        for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) {
+          T score = pos_data[j * 2];
+          int flag = pos_data[j * 2 + 1] < kEPS ? 0 : 1;
+          pos[i].push_back(std::make_pair(score, flag));
+        }
+      }
+    };
+    SetData(input_true_pos, true_pos);
+    SetData(input_false_pos, false_pos);
+  }
+
+  // Accumulates, over all images of the batch, per-class positive counts and
+  // true/false-positive records.
+  //   gt_boxes      per image: label -> ground-truth boxes.
+  //   detect_boxes  per image: label -> (score, box) detections.
+  //   evaluate_difficult  whether boxes flagged is_difficult take part.
+  //   overlap_threshold   Jaccard overlap a match must exceed.
+  // Outputs (appended to, never cleared):
+  //   label_pos_count[label] grows by the number of counted ground-truth
+  //   boxes; true_pos[label] and false_pos[label] each receive one
+  //   (score, flag) pair per recorded detection, with complementary flags.
+  // A detection is a hit when its best-overlapping ground-truth box of the
+  // same label exceeds the threshold and was not claimed by a higher-scoring
+  // detection. A best match on a difficult box is dropped entirely when
+  // evaluate_difficult is false.
+  // NOTE(review): gt_boxes[n] is read for every n < detect_boxes.size()
+  // without a bounds check; presumably both hold one entry per image.
+  void CalcTrueAndFalsePositive(
+      const std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
+      const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
+          detect_boxes,
+      bool evaluate_difficult, float overlap_threshold,
+      std::map<int, int>& label_pos_count,
+      std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+      std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+    // Records one detection: flags (1, 0) in (true_pos, false_pos) for a hit,
+    // (0, 1) for a miss.
+    auto record = [&true_pos, &false_pos](int label, T score, bool hit) {
+      true_pos[label].push_back(std::make_pair(score, hit ? 1 : 0));
+      false_pos[label].push_back(std::make_pair(score, hit ? 0 : 1));
+    };
+
+    // Pass 1: count the ground-truth positives of every label.
+    for (const auto& image_gt_boxes : gt_boxes) {
+      for (const auto& label_and_boxes : image_gt_boxes) {
+        const auto& labeled_bboxes = label_and_boxes.second;
+        size_t count = 0;
+        if (evaluate_difficult) {
+          count = labeled_bboxes.size();
+        } else {
+          for (const auto& box : labeled_bboxes) {
+            if (!box.is_difficult) ++count;
+          }
+        }
+        // A label with nothing to count must not create a map entry.
+        if (count == 0) continue;
+        // operator[] value-initializes a missing entry to 0 before the add.
+        label_pos_count[label_and_boxes.first] += count;
+      }
+    }
+
+    // Pass 2: classify every detection as a hit or a miss.
+    for (size_t n = 0; n < detect_boxes.size(); ++n) {
+      const auto& image_gt_boxes = gt_boxes[n];
+      for (const auto& label_and_preds : detect_boxes[n]) {
+        const int label = label_and_preds.first;
+        const auto gt_it = image_gt_boxes.find(label);
+        if (gt_it == image_gt_boxes.end()) {
+          // No ground truth of this label in the image (which includes an
+          // image without any ground truth): every detection is a miss.
+          for (const auto& pred : label_and_preds.second) {
+            record(label, pred.first, false);
+          }
+          continue;
+        }
+
+        const auto& matched_bboxes = gt_it->second;
+        std::vector<bool> visited(matched_bboxes.size(), false);
+        // Sort a copy in descending score order (the input is const) so that
+        // higher-scoring detections claim ground-truth boxes first.
+        auto pred_boxes = label_and_preds.second;
+        std::sort(pred_boxes.begin(), pred_boxes.end(),
+                  SortScorePairDescend<Box>);
+        for (size_t i = 0; i < pred_boxes.size(); ++i) {
+          const auto score = pred_boxes[i].first;
+          // Find the ground-truth box with the largest overlap.
+          T max_overlap = -1.0;
+          size_t max_idx = 0;
+          for (size_t j = 0; j < matched_bboxes.size(); ++j) {
+            T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]);
+            if (overlap > max_overlap) {
+              max_overlap = overlap;
+              max_idx = j;
+            }
+          }
+          if (!(max_overlap > overlap_threshold)) {
+            record(label, score, false);
+            continue;
+          }
+          // A best match on a difficult box is ignored unless requested.
+          if (!evaluate_difficult && matched_bboxes[max_idx].is_difficult) {
+            continue;
+          }
+          // Only the first detection to claim a box counts as a hit.
+          record(label, score, !visited[max_idx]);
+          visited[max_idx] = true;
+        }
+      }
+    }
+  }
+
+  // Returns the mean average precision in percent (0 if no label has both
+  // ground-truth positives and detections); ap_type selects the AP formula.
+  T CalcMAP(
+      APType ap_type, const std::map<int, int>& label_pos_count,
+      const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
+      const std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+    T mAP = 0.0;
+    int count = 0;
+    for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
+      int label_num_pos = it->second;
+      if (label_num_pos == 0) continue;
+      // A label absent from either map is skipped, never read through end().
+      auto tp_it = true_pos.find(it->first);
+      auto fp_it = false_pos.find(it->first);
+      if (tp_it == true_pos.end() || fp_it == false_pos.end()) continue;
+      auto label_true_pos = tp_it->second;
+      auto label_false_pos = fp_it->second;
+      std::vector<int> tp_sum, fp_sum;
+      GetAccumulation<T>(label_true_pos, &tp_sum);
+      GetAccumulation<T>(label_false_pos, &fp_sum);
+      std::vector<T> precision, recall;
+      size_t num = tp_sum.size();
+      for (size_t i = 0; i < num; ++i) {
+        precision.push_back(static_cast<T>(tp_sum[i]) /
+                            static_cast<T>(tp_sum[i] + fp_sum[i]));
+        recall.push_back(static_cast<T>(tp_sum[i]) / label_num_pos);
+      }
+      if (ap_type == APType::k11point) {
+        // VOC2007 style: mean of max_precisions over the recall levels j / 10.
+        std::vector<T> max_precisions(11, 0.0);
+        int start_idx = num - 1;
+        for (int j = 10; j >= 0; --j)
+          for (int i = start_idx; i >= 0; --i) {
+            if (recall[i] < j / 10.) {
+              start_idx = i;
+              if (j > 0) max_precisions[j - 1] = max_precisions[j];
+              break;
+            } else if (max_precisions[j] < precision[i]) {
+              max_precisions[j] = precision[i];
+            }
+          }
+        for (int j = 10; j >= 0; --j) mAP += max_precisions[j] / 11;
+        ++count;
+      } else if (ap_type == APType::kIntegral) {
+        // Natural integral: precision weighted by each change in recall.
+        float average_precisions = 0.;
+        float prev_recall = 0.;
+        for (size_t i = 0; i < num; ++i) {
+          if (fabs(recall[i] - prev_recall) > 1e-6)
+            average_precisions += precision[i] * fabs(recall[i] - prev_recall);
+          prev_recall = recall[i];
+        }
+        mAP += average_precisions;
+        ++count;
+      } else {
+        LOG(FATAL) << "Unkown ap version: " << ap_type;
+      }
+    }
+    if (count != 0) mAP /= count;
+    return mAP * 100;
+  }
+};  // end of class
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index 0b3f5f0d1d09a932e15936285f5cb226daa86e95..41fa69a0972ef8ad528f2a04b0260c40155ffd3e 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -24,8 +24,10 @@ class FeedOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index 54e5892016cdb01f50189147a7453b868c5a48c0..6cb5565013dcacac33e828386f1ea8909e831c1a 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -26,8 +26,9 @@ class FetchOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index d4bf6406e5716a6b65a234d1cd642b64dcc5726f..6dd58d28db23ff3de8a27e898a9b539787d08718 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -33,8 +33,10 @@ class FillConstantInferShape : public framework::InferShapeBase {
 class FillConstantOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto data_type =
         static_cast<framework::proto::DataType>(Attr<int>("dtype"));
     auto value = Attr<float>("value");
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 8e318f37cf0bc945597b5aa7b384e53038c97786..0b97c9c2827ac1be4e99c647dbedc2d9b8730e41 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -42,8 +42,10 @@ class FillOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &out =
         detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
                                 "Cannot find variable %s", Output("Out"))
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index ba908e472bbc165a244d8543713f1dbf293abb48..ef635048bd4faa2dc0067152f5f7472acbfe47af 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -37,8 +37,10 @@ class GetPlacesOp : public framework::OperatorBase {
               const framework::VariableNameMap &outputs,
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     bool is_gpu;
     if (Attr<std::string>("device_type") == "AUTO") {
       is_gpu = platform::is_gpu_place(place);
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index 3d488067b254c37515c6bdb9a4589aad311f344f..de4949584b7b20bec7b31f2ad1b69053ee9ffc0f 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -51,8 +51,9 @@ class IncrementOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index ea424018d66dac85d5a4ad75cbf5199064d52848..dac8505e3f2cb33b35b6184184e4762078a19c49 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -28,8 +28,9 @@ class IsEmptyOp : public framework::OperatorBase {
             const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get input
     auto *var = scope.FindVar(Input(kInput));
     PADDLE_ENFORCE_NOT_NULL(var);
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 1948063d886b79964b1a52d9d82a8e7d2fb0d493..d043702ebae627951927f2dbec893d40f77f0c73 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -26,8 +26,10 @@ class LoadCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
 
     std::ifstream fin(filename);
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index c9bf5d72b234f96d9eb5a4c275737ac8c18cd63d..9393cccfc66ec930db6ef68bd6f3c5065ceea80e 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -25,8 +25,10 @@ class LoadOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
index f11f5a89f5ad5b2f3deed905625aefa1e9d9935b..daa57c20450f1f92cb0bb500e37d0d8c49c05758 100644
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -25,8 +25,10 @@ class LoDArrayLengthOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
     auto &out =
         *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 0b9426a9f8f0b0b3082667dc7a1414aceb824aca..3264766d6b693244f8dbfa6462b9c7aa13d5b5ec 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -23,8 +23,10 @@ class LoDRankTableOp : public framework::OperatorBase {
                  const framework::VariableNameMap &outputs,
                  const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto *out =
         scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index edc32bcec1441e50e24612789727db9a044cde54..d6e24dc976a1ebe2afa182618d09839b105381c1 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -32,8 +32,10 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
                      const framework::VariableNameMap &outputs,
                      const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
                           Input("X"))
                   .Get<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index eff8b927e52c94a4e19bb10c644cbaa34a7a0581..cef0dc307dbe97473e9041f51c25eca7cc9a0f1a 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -27,8 +27,9 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &rank_table =
         scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
     auto *out =
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 255f55334093213df867852e4d222f0e227e8c5d..88e67b6b86a3731cc2caf5529aa4892c6d605a86 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -27,8 +27,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 52420ceba0de0323dae000aa301ce7838b3311b6..703e8dd00fc8e613344db11065d6a45afa2a0cc8 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -26,8 +26,9 @@ class NCCLInitOp : public framework::OperatorBase {
              const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     const auto &name = Output("Communicator");
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
                             "Can not find variable '%s' in the scope.", name);
diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h
index 14e5909851c4ac08b5f59c5c193c801827b91234..479ba386a70adaff09ae31e24c449fc18a9853b1 100644
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
@@ -57,20 +57,6 @@ class NetOp : public framework::OperatorBase {
     this->CompleteAddOp();
   }
 
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
   bool SupportGPU() const override {
     for (auto& op : ops_) {
       if (!op->SupportGPU()) {
@@ -117,6 +103,20 @@ class NetOp : public framework::OperatorBase {
   std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
 
  private:
+  /**
+   * @brief Run the network.
+   *
+   * Calls Run(scope, place) on every operator stored in `ops_`, one after
+   * another in container order. The given scope and place are forwarded
+   * unchanged to each operator.
+   */
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    for (auto it = ops_.begin(); it != ops_.end(); ++it) {
+      (*it)->Run(scope, place);
+    }
+  }
+
   bool add_op_done_{false};
   std::set<std::string> intermediate_outputs_;
 
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
index cc20be0c81763abe2adcf09de858ce51e16d77a6..265f15e82ed29824ed65917dbe45e5edf9dc8993 100644
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
@@ -26,7 +26,10 @@ class TestOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
   DEFINE_OP_CLONE_METHOD(TestOp);
-  void Run(const Scope& scope, const platform::Place& place) const override {
+
+ private:
+  void RunImpl(const Scope& scope,
+               const platform::Place& place) const override {
     ++run_cnt;
   }
 };
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index e25df92479943d210d98f02374f377f778f43d2c..d791d11172869d42b08c059b900e729bcc9b5d96 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -118,8 +118,9 @@ class ParallelDoOp : public framework::OperatorBase {
                const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
@@ -207,8 +208,9 @@ class ParallelDoGradOp : public framework::OperatorBase {
                    const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
     auto *program = block->Program();
 
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 3616545309e8c279f61a22e571a5e71335c47f93..4d12fdbb6b62d1d7095d10aa6f33d12598a8e99e 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -130,8 +130,9 @@ class TensorPrintOp : public framework::OperatorBase {
     PADDLE_THROW("Not implemented.");
   }
 
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
     const framework::Variable* in_var_ptr = nullptr;
     std::string phase = kForward;
     std::string printed_var_name = "";
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index ed48603e17f38f89705186fb9fb992f69d26d2ff..1385a6cdce838b7f376cd784a8eaa63f591c7ef2 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -38,8 +38,8 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
                       "The width of input must smaller than image.");
 
-    auto min_sizes = ctx->Attrs().Get<std::vector<int>>("min_sizes");
-    auto max_sizes = ctx->Attrs().Get<std::vector<int>>("max_sizes");
+    auto min_sizes = ctx->Attrs().Get<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx->Attrs().Get<std::vector<float>>("max_sizes");
     auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
     auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
     bool flip = ctx->Attrs().Get<bool>("flip");
@@ -47,15 +47,15 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     std::vector<float> aspect_ratios_vec;
     ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
 
-    int num_priors = aspect_ratios_vec.size() * min_sizes.size();
+    size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
     if (max_sizes.size() > 0) {
       PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
                         "The number of min_size and max_size must be equal.");
-      for (size_t i = 0; i < min_sizes.size(); ++i) {
+      num_priors += max_sizes.size();
+      for (size_t i = 0; i < max_sizes.size(); ++i) {
         PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
                           "max_size[%d] must be greater than min_size[%d].", i,
                           i);
-        num_priors += 1;
       }
     }
 
@@ -90,20 +90,20 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "H is the height of input, W is the width of input, num_priors "
               "is the box count of each position.");
 
-    AddAttr<std::vector<int>>("min_sizes",
-                              "(vector<int>) List of min sizes "
-                              "of generated prior boxes.")
-        .AddCustomChecker([](const std::vector<int>& min_sizes) {
+    AddAttr<std::vector<float>>("min_sizes",
+                                "(vector<float>) List of min sizes "
+                                "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& min_sizes) {
           PADDLE_ENFORCE_GT(min_sizes.size(), 0,
                             "Size of min_sizes must be at least 1.");
           for (size_t i = 0; i < min_sizes.size(); ++i) {
-            PADDLE_ENFORCE_GT(min_sizes[i], 0,
+            PADDLE_ENFORCE_GT(min_sizes[i], 0.0,
                               "min_sizes[%d] must be positive.", i);
           }
         });
-    AddAttr<std::vector<int>>(
+    AddAttr<std::vector<float>>(
         "max_sizes",
-        "(vector<int>) List of max sizes of generated prior boxes.");
+        "(vector<float>) List of max sizes of generated prior boxes.");
     AddAttr<std::vector<float>>(
         "aspect_ratios",
         "(vector<float>) List of aspect ratios of generated prior boxes.");
@@ -125,16 +125,16 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(true);
 
     AddAttr<float>("step_w",
-                   "Prior boxes step across width, 0 for auto calculation.")
+                   "Prior boxes step across width, 0.0 for auto calculation.")
         .SetDefault(0.0)
         .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
         });
     AddAttr<float>("step_h",
-                   "Prior boxes step across height, 0 for auto calculation.")
+                   "Prior boxes step across height, 0.0 for auto calculation.")
         .SetDefault(0.0)
         .AddCustomChecker([](const float& step_h) {
-          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
         });
 
     AddAttr<float>("offset",
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
index fd07041233495660605e9cf9acb33d57eb57bc30..e2c9514ed0814f21fec6c4184b7e971c4528d489 100644
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -60,8 +60,8 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
     auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
 
-    auto min_sizes = ctx.Attr<std::vector<int>>("min_sizes");
-    auto max_sizes = ctx.Attr<std::vector<int>>("max_sizes");
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
     auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
     auto variances = ctx.Attr<std::vector<float>>("variances");
     auto flip = ctx.Attr<bool>("flip");
@@ -108,7 +108,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
         T box_width, box_height;
         int idx = 0;
         for (size_t s = 0; s < min_sizes.size(); ++s) {
-          int min_size = min_sizes[s];
+          auto min_size = min_sizes[s];
           // first prior: aspect_ratio = 1, size = min_size
           box_width = box_height = min_size;
           // xmin
@@ -124,7 +124,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
 
           idx++;
           if (max_sizes.size() > 0) {
-            int max_size = max_sizes[s];
+            auto max_size = max_sizes[s];
             // second prior: aspect_ratio = 1,
             // size = sqrt(min_size * max_size)
             box_width = box_height = sqrt(min_size * max_size);
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 4d562c291911f54c9d1e8fed2e84035808bffbb7..127df82ff13b89de42e45113a21d6f5e7c2f20ed 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -54,8 +54,10 @@ class ReadInferVarType : public framework::VarTypeInference {
 class ReadOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
-  void Run(const framework::Scope& scope,
-           const platform::Place& dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
     framework::ReaderHolder* reader =
         scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
     if (!reader->HasNext()) {
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index e4b9b8dab9b0394752d538aa5f59be3c06d0188f..33a744a5b7fef5802569a305d18746f04ed88136 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -226,8 +226,9 @@ class RecurrentOp : public RecurrentBase {
               const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
@@ -315,8 +316,9 @@ class RecurrentGradOp : public RecurrentBase {
                   const framework::AttributeMap &attrs)
       : RecurrentBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index 148a65bb4b7fe599f2fdb833c179665e58fe1c41..79ba9e543b892d051995d4bafb0ceaaf09838cd2 100644
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -75,8 +75,10 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
                                   const framework::VariableNameMap &outputs,
                                   const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto &x =
         detail::Ref(scope.FindVar(Input("X")),
                     "Cannot find input lod tensor variable %s", Input("X"))
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 504456c4b069f81319893ae51f57503f5025761a..e9329a0e7e279e2bdd3c45986580c87aa5d0b1fe 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -24,8 +24,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                     const framework::VariableNameMap &outputs,
                     const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto mem_var_name = Input("X");
     auto *mem_var = scope.FindVar(mem_var_name);
     PADDLE_ENFORCE(mem_var != nullptr,
@@ -76,8 +78,10 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                         const framework::VariableNameMap &outputs,
                         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto out_grad_var_name = Input(framework::GradVarName("Out"));
     auto *out_grad_var = scope.FindVar(out_grad_var_name);
 
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index c23de9073ef965b989e98936b2dd07fc6bce7fdc..e3953e4b08082c08e1bbf77a834d4a895b327f83 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -63,8 +63,10 @@ class SaveCombineOp : public framework::OperatorBase {
                 const framework::VariableNameMap &outputs,
                 const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 483cdfa4c3b9e3b9abd3f32bc5e6e5e0b493bd23..85ba8e01182c2cd01aa599ddbce68b6b2d9aa5f4 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -62,8 +62,10 @@ class SaveOp : public framework::OperatorBase {
          const framework::VariableNameMap &outputs,
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index df50a324fde1637f1f9f64a0b0d4eff8ba3f26d2..7fe0526381d1fc18ad0552c321875af42df0f6dc 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -27,8 +27,9 @@ class ShrinkRNNMemoryOp : public ArrayOp {
                     const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x_var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
     auto &x_tensor = x_var->Get<framework::LoDTensor>();
@@ -108,8 +109,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
                         const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
     auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
     PADDLE_ENFORCE(dx_var != nullptr, "Input Gradient should not be nullptr");
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index be4c7a56a84e84c39a578b958fe7c9ad551f54f6..e6eede23ee367200f9a2b531d1cbd402ceea6b54 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -44,7 +44,6 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,10 +72,10 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(Tensor, default Tensor<float>) A tensor with rank be 2. "
               "The output smooth l1 loss with shape [batch_size, 1].");
-    AddAttr<AttrType>("sigma",
-                      "Hyper parameter of smooth l1 loss op."
-                      "A float scalar with default value 3.0.")
-        .SetDefault(3.0);
+    AddAttr<float>("sigma",
+                   "Hyper parameter of smooth l1 loss op."
+                   "A float scalar with default value 3.0.")
+        .SetDefault(1.0);
     AddComment(R"DOC(
 Smooth L1 Loss Operator.
 
@@ -133,9 +132,8 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp,
-            ops::SmoothL1LossOpMaker<float>, smooth_l1_loss_grad,
-            ops::SmoothL1LossGradOp);
+REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
+            smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
     ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index f821dc54d7bbe697d3642e64dc1628ec7d966592..f9600d99a36f59feddfbb5295b8b21ca6d5034cd 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -33,8 +33,10 @@ class SplitLoDTensorOp : public framework::OperatorBase {
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
     auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
     auto *out_true =
diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
index 50811fb22491598849216f41a584ae0b68f8f306..704ee964c908c44d84316985429a6551b770e33f 100644
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -24,8 +24,9 @@ class WriteToArrayOp : public ArrayOp {
                  const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     if (x == nullptr) return;
     auto &x_tensor = x->Get<framework::LoDTensor>();
@@ -122,8 +123,10 @@ class ReadFromArrayOp : public ArrayOp {
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
       : ArrayOp(type, inputs, outputs, attrs) {}
-  void Run(const framework::Scope &scope,
-           const platform::Place &place) const override {
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     auto *x = scope.FindVar(Input("X"));
     PADDLE_ENFORCE(x != nullptr, "X must be set");
     auto &x_array = x->Get<framework::LoDTensorArray>();
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index d254c572acff52d967e551c377b3b32b05c92973..a7a05cc5f79da6c1e6945a83f997e54041d2045d 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -39,8 +39,9 @@ class WhileOp : public framework::OperatorBase {
           const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
     auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
     PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
@@ -99,8 +100,9 @@ class WhileGradOp : public framework::OperatorBase {
               const framework::AttributeMap &attrs)
       : framework::OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 1486d5ed2579a49a4722a8b0abdfdba6bf196615..442a7ea883052e73a5d50d5558f57732be93fb3a 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -204,6 +204,17 @@ function gen_capi_package() {
   fi
 }
 
+function gen_fluid_inference_lib() {  # Build the fluid inference library via make.
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then  # WITH_C_API defaults to OFF when unset/empty
+    cat <<EOF
+    ========================================
+    Building fluid inference library ...
+    ========================================
+EOF
+        make inference_lib_dist  # only reached when WITH_C_API is OFF
+    fi
+}
+
 set -xe
 
 cmake_gen ${PYTHON_ABI:-""}
@@ -212,6 +223,7 @@ run_test
 gen_docs
 gen_dockerfile
 gen_capi_package
+gen_fluid_inference_lib
 
 if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
   printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" 
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
index a83dd3db74aed548a324a1c605723c957fca8604..cfbbf710b6ac63b9a0fe7d51b0d1940532e948fc 100644
--- a/python/paddle/v2/fluid/layers/__init__.py
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -16,6 +16,8 @@ import ops
 from ops import *
 import nn
 from nn import *
+import detection
+from detection import *
 import io
 from io import *
 import tensor
@@ -26,12 +28,16 @@ import device
 from device import *
 import math_op_patch
 from math_op_patch import *
+import detection
+from detection import *
 
 __all__ = []
+__all__ += math_op_patch.__all__
+__all__ += detection.__all__
 __all__ += nn.__all__
 __all__ += io.__all__
 __all__ += tensor.__all__
 __all__ += control_flow.__all__
 __all__ += ops.__all__
 __all__ += device.__all__
-__all__ += math_op_patch.__all__
+__all__ += detection.__all__
diff --git a/python/paddle/v2/fluid/layers/detection.py b/python/paddle/v2/fluid/layers/detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f3256d7652b25845fe577838030009a45ed16cd
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/detection.py
@@ -0,0 +1,328 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+All layers just related to the detection neural network.
+"""
+
+from ..layer_helper import LayerHelper
+from ..framework import Variable
+from tensor import concat
+from ops import reshape
+import math
+
+__all__ = [
+    'detection_output',
+    'prior_box',
+]
+
+
+def detection_output(scores,
+                     loc,
+                     prior_box,
+                     prior_box_var,
+                     background_label=0,
+                     nms_threshold=0.3,
+                     nms_top_k=400,
+                     keep_top_k=200,
+                     score_threshold=0.01,
+                     nms_eta=1.0):
+    """
+    **Detection Output Layer**
+
+    This layer applies the NMS to the output of network and computes the
+    predict bounding box location. The output's shape of this layer could
+    be zero if there is no valid bounding box.
+
+    Args:
+        scores(Variable): A 3-D Tensor with shape [N, C, M] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which corresponding M bounding boxes.
+        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+            predicted locations of M bounding boxes. N is the batch size,
+            and each bounding box has four coordinate values and the layout
+            is [xmin, ymin, xmax, ymax].
+        prior_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
+            each box is represented as [xmin, ymin, xmax, ymax],
+            [xmin, ymin] is the left top coordinate of the anchor box,
+            if the input is image feature map, they are close to the origin
+            of the coordinate system. [xmax, ymax] is the right bottom
+            coordinate of the anchor box.
+        prior_box_var(Variable): A 2-D Tensor with shape [M, 4] holds M group
+            of variance.
+        background_label(int): The index of background label,
+            the background label will be ignored. If set to -1, then all
+            categories will be considered.
+        nms_threshold(float): The threshold to be used in NMS.
+        nms_top_k(int): Maximum number of detections to be kept according
+            to the confidences after filtering detections based on
+            score_threshold.
+        keep_top_k(int): Number of total bboxes to be kept per image after
+            NMS step. -1 means keeping all bboxes after NMS step.
+        score_threshold(float): Threshold to filter out bounding boxes with
+            low confidence score. If not provided, consider all boxes.
+        nms_eta(float): The parameter for adaptive NMS.
+
+    Returns:
+        The detected bounding boxes which are a Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            pb = layers.data(name='prior_box', shape=[10, 4],
+                             append_batch_size=False, dtype='float32')
+            pbv = layers.data(name='prior_box_var', shape=[10, 4],
+                              append_batch_size=False, dtype='float32')
+            loc = layers.data(name='target_box', shape=[21, 4],
+                              append_batch_size=False, dtype='float32')
+            scores = layers.data(name='scores', shape=[2, 21, 10],
+                              append_batch_size=False, dtype='float32')
+            nmsed_outs = fluid.layers.detection_output(scores=scores,
+                                           loc=loc,
+                                           prior_box=pb,
+                                           prior_box_var=pbv)
+    """
+
+    helper = LayerHelper("detection_output", **locals())
+    decoded_box = helper.create_tmp_variable(dtype=loc.dtype)
+    helper.append_op(
+        type="box_coder",
+        inputs={
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': loc
+        },
+        outputs={'OutputBox': decoded_box},
+        attrs={'code_type': 'decode_center_size'})
+    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+
+    helper.append_op(
+        type="multiclass_nms",
+        inputs={'Scores': scores,
+                'BBoxes': decoded_box},
+        outputs={'Out': nmsed_outs},
+        attrs={
+            'background_label': background_label,  # honor the caller's value
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': nms_eta  # honor the caller's value
+        })
+    return nmsed_outs
+
+
+def prior_box(inputs,
+              image,
+              min_ratio,
+              max_ratio,
+              aspect_ratios,
+              base_size,
+              steps=None,
+              step_w=None,
+              step_h=None,
+              offset=0.5,
+              variance=[0.1, 0.1, 0.1, 0.1],
+              flip=False,
+              clip=False,
+              min_sizes=None,
+              max_sizes=None,
+              name=None):
+    """
+    **Prior_boxes**
+
+    Generate prior boxes for the SSD (Single Shot MultiBox Detector)
+    algorithm. For details of this algorithm, please refer to
+    section 2.2 of the SSD paper (`SSD: Single Shot MultiBox Detector
+    <https://arxiv.org/abs/1512.02325>`_).
+
+    Args:
+       inputs(list): The list of input Variables, the format
+            of all Variables is NCHW.
+       image(Variable): The input image data of PriorBoxOp,
+            the layout is NCHW.
+       min_ratio(int): the min ratio of generated prior boxes.
+       max_ratio(int): the max ratio of generated prior boxes.
+       aspect_ratios(list): the aspect ratios of generated prior
+            boxes. The length of inputs and aspect_ratios must be equal.
+       base_size(int): the base_size is used to get min_size
+            and max_size according to min_ratio and max_ratio.
+       steps(list, optional, default=None): Prior boxes step for both
+            width and height. If set, it overrides step_w and step_h.
+       step_w(list, optional, default=None): Prior boxes step across width.
+            If step_w[i] == 0.0, it is calculated automatically for inputs[i].
+       step_h(list, optional, default=None): Prior boxes step across height.
+            If step_h[i] == 0.0, it is calculated automatically for inputs[i].
+       offset(float, optional, default=0.5): Prior boxes center offset.
+       variance(list, optional, default=[0.1, 0.1, 0.1, 0.1]): the variances
+            to be encoded in prior boxes.
+       flip(bool, optional, default=False): Whether to flip
+            aspect ratios.
+       clip(bool, optional, default=False): Whether to clip
+            out-of-boundary boxes.
+       min_sizes(list, optional, default=None): If `len(inputs) <=2`,
+            min_sizes must be set up, and the length of min_sizes
+            should equal to the length of inputs.
+       max_sizes(list, optional, default=None): If `len(inputs) <=2`,
+            max_sizes must be set up, and the length of max_sizes
+            should equal to the length of inputs.
+       name(str, optional, None): Name of the prior box layer.
+
+    Returns:
+        boxes(Variable): the output prior boxes of PriorBoxOp.
+             The layout is [num_priors, 4]. num_priors is the total
+             box count of each position of inputs.
+        Variances(Variable): the expanded variances of PriorBoxOp.
+             The layout is [num_priors, 4]. num_priors is the total
+             box count of each position of inputs
+
+    Examples:
+        .. code-block:: python
+
+          prior_box(
+             inputs = [conv1, conv2, conv3, conv4, conv5, conv6],
+             image = data,
+             min_ratio = 20, # 0.20
+             max_ratio = 90, # 0.90
+             offset = 0.5,
+             base_size = 300,
+             variance = [0.1,0.1,0.1,0.1],
+             aspect_ratios = [[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
+             flip=True,
+             clip=True)
+    """
+
+    def _prior_box_(input,
+                    image,
+                    min_sizes,
+                    max_sizes,
+                    aspect_ratios,
+                    variance,
+                    flip=False,
+                    clip=False,
+                    step_w=0.0,
+                    step_h=0.0,
+                    offset=0.5,
+                    name=None):
+        helper = LayerHelper("prior_box", **locals())
+        dtype = helper.input_dtype()
+
+        box = helper.create_tmp_variable(dtype)
+        var = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="prior_box",
+            inputs={"Input": input,
+                    "Image": image},
+            outputs={"Boxes": box,
+                     "Variances": var},
+            attrs={
+                'min_sizes': min_sizes,
+                'max_sizes': max_sizes,
+                'aspect_ratios': aspect_ratios,
+                'variances': variance,
+                'flip': flip,
+                'clip': clip,
+                'step_w': step_w,
+                'step_h': step_h,
+                'offset': offset
+            })
+        return box, var
+
+    def _reshape_with_axis_(input, axis=1):
+        if not (axis > 0 and axis < len(input.shape)):
+            raise ValueError("The axis should be smaller than "
+                             "the arity of input and bigger than 0.")
+        new_shape = [
+            -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
+        ]
+        out = reshape(x=input, shape=new_shape)
+        return out
+
+    assert isinstance(inputs, list), 'inputs should be a list.'
+    num_layer = len(inputs)
+
+    if num_layer <= 2:
+        assert min_sizes is not None and max_sizes is not None
+        assert len(min_sizes) == num_layer and len(max_sizes) == num_layer
+    else:
+        min_sizes = []
+        max_sizes = []
+        step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
+        for ratio in xrange(min_ratio, max_ratio + 1, step):
+            min_sizes.append(base_size * ratio / 100.)
+            max_sizes.append(base_size * (ratio + step) / 100.)
+        min_sizes = [base_size * .10] + min_sizes
+        max_sizes = [base_size * .20] + max_sizes
+
+    if aspect_ratios:
+        if not (isinstance(aspect_ratios, list) and
+                len(aspect_ratios) == num_layer):
+            raise ValueError(
+                'aspect_ratios should be list and the length of inputs '
+                'and aspect_ratios should be the same.')
+    if step_h:
+        if not (isinstance(step_h, list) and len(step_h) == num_layer):
+            raise ValueError(
+                'step_h should be list and the length of inputs and '
+                'step_h should be the same.')
+    if step_w:
+        if not (isinstance(step_w, list) and len(step_w) == num_layer):
+            raise ValueError(
+                'step_w should be list and the length of inputs and '
+                'step_w should be the same.')
+    if steps:
+        if not (isinstance(steps, list) and len(steps) == num_layer):
+            raise ValueError(
+                'steps should be list and the length of inputs and '
+                'steps should be the same.')
+        step_w = steps
+        step_h = steps
+
+    box_results = []
+    var_results = []
+    for i, input in enumerate(inputs):
+        min_size = min_sizes[i]
+        max_size = max_sizes[i]
+        aspect_ratio = []
+        if not isinstance(min_size, list):
+            min_size = [min_size]
+        if not isinstance(max_size, list):
+            max_size = [max_size]
+        if aspect_ratios:
+            aspect_ratio = aspect_ratios[i]
+            if not isinstance(aspect_ratio, list):
+                aspect_ratio = [aspect_ratio]
+
+        # Each step falls back to 0.0 when its own list is not given.
+        box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio,
+                               variance, flip, clip, step_w[i] if step_w else
+                               0.0, step_h[i] if step_h else 0.0, offset)
+
+        box_results.append(box)
+        var_results.append(var)
+
+    if len(box_results) == 1:
+        box = box_results[0]
+        var = var_results[0]
+    else:
+        reshaped_boxes = []
+        reshaped_vars = []
+        for i in range(len(box_results)):
+            reshaped_boxes.append(_reshape_with_axis_(box_results[i], axis=3))
+            reshaped_vars.append(_reshape_with_axis_(var_results[i], axis=3))
+
+        box = concat(reshaped_boxes)
+        var = concat(reshaped_vars)
+
+    return box, var
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
index 00e4e6907804c7a460e60d960b4aa94ca23b4886..d829bba1b101cc802ea29f32e0b7ecdb1ac448f5 100644
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -152,7 +152,12 @@ def monkey_patch_variable():
         ("__div__", "elementwise_div", False),
         ("__rdiv__", "elementwise_div", True),
         ("__pow__", "elementwise_pow", False),
-        ("__rpow__", "elementwise_pow", True)):
+        ("__rpow__", "elementwise_pow", True),
+            # for logical compare
+        ("__eq__", "equal", False),
+        ("__ne__", "not_equal", False),
+        ("__lt__", "less_than", False),
+        ("__le__", "less_equal", False)):
         setattr(Variable, method_name,
                 _elemwise_method_creator_(method_name, op_type, reverse))
 
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 5ebd329fc0285a39111a23b3c58c80944cfe23f6..051b5368180d3f7951b100c26fb7367372d9a343 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -66,6 +66,8 @@ __all__ = [
     'row_conv',
     'multiplex',
     'layer_norm',
+    'softmax_with_cross_entropy',
+    'smooth_l1',
 ]
 
 
@@ -3091,3 +3093,122 @@ def multiplex(inputs, index):
                 'Ids': index},
         outputs={'Out': [out]})
     return out
+
+
+def softmax_with_cross_entropy(logits, label, soft_label=False):
+    """
+    **Softmax With Cross Entropy Operator.**
+
+    Applies softmax to every row of the input tensor and then computes the
+    cross-entropy loss on the normalized values, all inside one operator.
+    Computing both steps together provides a more numerically stable
+    gradient.
+
+    Softmax is performed on the logits internally, so unscaled logits are
+    expected. Do not feed this operator the output of a softmax operator;
+    that would produce incorrect results.
+
+    When soft_label is False, the operator expects mutually exclusive hard
+    labels: each sample in the batch is in exactly one class with a
+    probability of 1.0 and therefore has a single label.
+
+    The equation is as follows:
+
+    1) Hard label (one-hot label, so every sample has exactly one class)
+
+    .. math::
+
+        loss_j =  -\\text{logit}_{label_j} +
+        \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logit}_i)\\right), j = 1,..., K
+
+    2) Soft label (each sample can have a distribution over all classes)
+
+    .. math::
+
+        loss_j =  -\\sum_{i=0}^{K}\\text{label}_i
+        \\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
+        \\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
+
+    Args:
+        logits (Variable): The unscaled log probabilities, a 2-D tensor with
+            shape [N x K]. N is the batch size and K is the class number.
+        label (Variable): The ground truth, a 2-D tensor. If soft_label is
+            False, it is a Tensor<int64> with shape [N x 1]. If soft_label is
+            True, it is a Tensor<float/double> with shape [N x K].
+        soft_label (bool): A flag indicating whether to interpret the given
+            labels as soft labels. By default, `soft_label` is set to False.
+    Returns:
+        Variable: The cross entropy loss, a 2-D tensor with shape [N x 1].
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            fc = fluid.layers.fc(input=data, size=100)
+            out = fluid.layers.softmax_with_cross_entropy(logits=fc, label=label)
+    """
+    helper = LayerHelper('softmax_with_cross_entropy', **locals())
+    # The op produces the normalized probabilities as well as the loss.
+    softmax_out = helper.create_tmp_variable(dtype=logits.dtype)
+    loss_out = helper.create_tmp_variable(dtype=logits.dtype)
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs=dict(Logits=logits, Label=label),
+        outputs=dict(Softmax=softmax_out, Loss=loss_out),
+        attrs=dict(soft_label=soft_label))
+    # Only the loss variable is handed back to the caller.
+    return loss_out
+
+
+def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
+    """
+    **Smooth L1 Loss Operator.**
+
+    This operator computes the smooth l1 loss for X and Y.
+    The operator takes the first dimension of X and Y as batch size.
+    For each instance, it computes the smooth l1 loss element by element first
+    and then sums all the losses. So the shape of Out is [batch_size, 1].
+
+    Args:
+        x (Variable): A tensor with rank at least 2. The input value of smooth
+            l1 loss op with shape [batch_size, dim1, ..., dimN].
+        y (Variable): A tensor with rank at least 2. The target value of smooth
+            l1 loss op with same shape as x.
+        inside_weight (Variable|None):  A tensor with rank at least 2. This
+            input is optional and should have same shape with x. If provided,
+            the result of (x - y) will be multiplied by this tensor element by
+            element.
+        outside_weight (Variable|None): A tensor with rank at least 2. This
+            input is optional and should have same shape with x. If provided,
+            the out smooth l1 loss will be multiplied by this tensor element
+            by element.
+        sigma (float|None): Hyper parameter of smooth l1 loss op. A float
+            scalar; when None, the default value 1.0 is used.
+    Returns:
+        Variable: A tensor of rank 2. The output smooth l1 loss with
+            shape [batch_size, 1].
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name='data', shape=[128], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[100], dtype='float32')
+            fc = fluid.layers.fc(input=data, size=100)
+            out = fluid.layers.smooth_l1(x=fc, y=label)
+    """
+    helper = LayerHelper('smooth_l1_loss', **locals())
+    diff = helper.create_tmp_variable(dtype=x.dtype)
+    loss = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='smooth_l1_loss',
+        inputs={
+            'X': x,
+            'Y': y,
+            'InsideWeight': inside_weight,
+            'OutsideWeight': outside_weight
+        },
+        outputs={'Diff': diff, 'Out': loss},
+        # Fall back to the documented default instead of forwarding None.
+        attrs={'sigma': sigma if sigma is not None else 1.0})
+    return loss
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
index 2a2a29fd9cbedc138dc82ca75ccd78208fd33195..0826d3da79a96590f00159a2d2e6f069792909c4 100644
--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -179,7 +179,7 @@ def polynomial_decay(learning_rate,
                 shape=[1], dtype='float32', value=1.0)
 
             with layers.Switch() as switch:
-                with switch.case(layers.equal(x=global_step, y=zero_var)):
+                with switch.case(global_step == zero_var):
                     layers.assign(input=one_var, output=div_res)
             decay_steps = decay_steps * div_res
         else:
@@ -229,7 +229,7 @@ def piecewise_decay(global_step, boundaries, values):
                     shape=[1], dtype='float32', value=float(boundaries[i]))
                 value_var = layers.fill_constant(
                     shape=[1], dtype='float32', value=float(values[i]))
-                with switch.case(layers.less_than(global_step, boundary_val)):
+                with switch.case(global_step < boundary_val):
                     layers.assign(value_var, lr)
             last_value_var = layers.fill_constant(
                 shape=[1],
diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/notest_rnn_encoder_decoer.py
similarity index 100%
rename from python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
rename to python/paddle/v2/fluid/tests/book/notest_rnn_encoder_decoer.py
diff --git a/python/paddle/v2/fluid/tests/test_detection.py b/python/paddle/v2/fluid/tests/test_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..fecc2a6226f5655adcb60ac3efef8fc26eec8ba2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_detection.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.layers.detection as detection
+from paddle.v2.fluid.framework import Program, program_guard
+import unittest
+import numpy as np
+
+
+class TestBook(unittest.TestCase):
+    """Smoke test for wiring the detection_output layer into a program."""
+
+    def test_detection_output(self):
+        """detection_output accepts the four inputs and returns a value."""
+
+        def fixed_input(var_name, dims):
+            # float32 data layer created with append_batch_size=False.
+            return layers.data(
+                name=var_name,
+                shape=dims,
+                append_batch_size=False,
+                dtype='float32')
+
+        prog = Program()
+        with program_guard(prog):
+            # Prior boxes, their variances, box locations and class scores.
+            prior = fixed_input('prior_box', [10, 4])
+            prior_var = fixed_input('prior_box_var', [10, 4])
+            target = fixed_input('target_box', [20, 4])
+            cls_scores = fixed_input('scores', [2, 20, 10])
+            detected = layers.detection_output(
+                scores=cls_scores,
+                loc=target,
+                prior_box=prior,
+                prior_box_var=prior_var)
+            self.assertIsNotNone(detected)
+        print(str(prog))
+
+
+class TestPriorBox(unittest.TestCase):
+    """Shape checks for detection.prior_box on chained conv feature maps."""
+
+    def test_prior_box(self):
+        """Check the shapes of the generated boxes and variances.
+
+        Both outputs must be 2-D and identically shaped, with 4 columns.
+        """
+        data_shape = [3, 224, 224]
+        boxes, variances = self.prior_box_output(data_shape)
+
+        assert len(boxes.shape) == 2
+        assert boxes.shape == variances.shape
+        assert boxes.shape[1] == 4
+
+    def prior_box_output(self, data_shape):
+        """Build five chained convs and return prior_box's outputs for them.
+
+        Args:
+            data_shape (list): Shape passed to the 'pixel' data layer.
+
+        Returns:
+            tuple: The (box, var) pair returned by detection.prior_box.
+        """
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+
+        # Each conv consumes the previous output (the image for the first
+        # one); all five share the same hyper-parameters.
+        feature_maps = []
+        previous = images
+        for _ in range(5):
+            previous = fluid.layers.conv2d(
+                input=previous,
+                num_filters=3,
+                filter_size=3,
+                stride=2,
+                use_cudnn=False)
+            feature_maps.append(previous)
+
+        # The last feature map is passed twice, so six inputs line up with
+        # the six aspect_ratios entries.
+        boxes, variances = detection.prior_box(
+            inputs=feature_maps + [feature_maps[-1]],
+            image=images,
+            min_ratio=20,
+            max_ratio=90,
+            # Explicit steps are left disabled here:
+            # steps=[8, 16, 32, 64, 100, 300],
+            aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]],
+            base_size=300,
+            offset=0.5,
+            flip=True,
+            clip=True)
+        return boxes, variances
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_detection_map_op.py b/python/paddle/v2/fluid/tests/test_detection_map_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ccd885d89f245df492bad0fbcecc093dc1928c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_detection_map_op.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import collections
+import math
+from op_test import OpTest
+
+
+class TestDetectionMAPOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        # calc_map also stores the self.out_* values that are used below.
+        self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)]
+        self.label = np.array(self.label).astype('float32')
+        self.detect = np.array(self.detect).astype('float32')
+        self.mAP = np.array(self.mAP).astype('float32')
+        # Feed PosCount/TruePos/FalsePos only if class_pos_count is non-empty.
+        if (len(self.class_pos_count) > 0):
+            self.class_pos_count = np.array(self.class_pos_count).astype(
+                'int32')
+            self.true_pos = np.array(self.true_pos).astype('float32')
+            self.false_pos = np.array(self.false_pos).astype('float32')
+
+            self.inputs = {
+                'Label': (self.label, self.label_lod),
+                'DetectRes': (self.detect, self.detect_lod),
+                'PosCount': self.class_pos_count,
+                'TruePos': (self.true_pos, self.true_pos_lod),
+                'FalsePos': (self.false_pos, self.false_pos_lod)
+            }
+        else:
+            self.inputs = {
+                'Label': (self.label, self.label_lod),
+                'DetectRes': (self.detect, self.detect_lod),
+            }
+        # Operator attributes are the values chosen in init_test_case.
+        self.attrs = {
+            'overlap_threshold': self.overlap_threshold,
+            'evaluate_difficult': self.evaluate_difficult,
+            'ap_type': self.ap_type
+        }
+        # NOTE(review): 'int' here vs. 'int32' for the PosCount input; confirm.
+        self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
+            'int')
+        self.out_true_pos = np.array(self.out_true_pos).astype('float32')
+        self.out_false_pos = np.array(self.out_false_pos).astype('float32')
+
+        self.outputs = {
+            'MAP': self.mAP,
+            'AccumPosCount': self.out_class_pos_count,
+            'AccumTruePos': (self.out_true_pos, self.out_true_pos_lod),
+            'AccumFalsePos': (self.out_false_pos, self.out_false_pos_lod)
+        }
+
+    def init_test_case(self):
+        self.overlap_threshold = 0.3
+        self.evaluate_difficult = True
+        self.ap_type = "integral"
+        # Ground truth: 4 rows; label_lod holds the offsets 0, 2, 4.
+        self.label_lod = [[0, 2, 4]]
+        # Each row: label, difficult, xmin, ymin, xmax, ymax.
+        self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8],
+                      [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]]
+
+        # Detections, each row: label, score, xmin, ymin, xmax, ymax.
+        self.detect_lod = [[0, 3, 7]]
+        self.detect = [
+            [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
+            [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
+            [2, 0.1, 0.4, 0.3, 0.7, 0.5], [1, 0.2, 0.8, 0.1, 1.0, 0.3],
+            [3, 0.2, 0.8, 0.1, 1.0, 0.3]
+        ]
+
+        # Rows passed to calc_map: label, score, true_pos, false_pos.
+        self.tf_pos_lod = [[0, 3, 7]]
+        self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1],
+                       [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0],
+                       [3, 0.2, 0, 1]]
+        # Empty by default: set_data then feeds only Label and DetectRes.
+        self.class_pos_count = []
+        self.true_pos_lod = [[]]
+        self.true_pos = [[]]
+        self.false_pos_lod = [[]]
+        self.false_pos = [[]]
+
+    def calc_map(self, tf_pos, tf_pos_lod):
+        mAP = 0.0
+        count = 0
+        # Rebuild per-class counts and pos lists from the state inputs.
+        def get_input_pos(class_pos_count, true_pos, true_pos_lod, false_pos,
+                          false_pos_lod):
+            class_pos_count_dict = collections.Counter()
+            true_pos_dict = collections.defaultdict(list)
+            false_pos_dict = collections.defaultdict(list)
+            for i, count in enumerate(class_pos_count):
+                class_pos_count_dict[i] = count
+
+            for i in range(len(true_pos_lod[0]) - 1):
+                start = true_pos_lod[0][i]
+                end = true_pos_lod[0][i + 1]
+                for j in range(start, end):
+                    true_pos_dict[i].append(true_pos[j])
+
+            for i in range(len(false_pos_lod[0]) - 1):
+                start = false_pos_lod[0][i]
+                end = false_pos_lod[0][i + 1]
+                for j in range(start, end):
+                    false_pos_dict[i].append(false_pos[j])
+
+            return class_pos_count_dict, true_pos_dict, false_pos_dict
+        # Flatten per-class lists into values plus one LoD offset per label id.
+        def get_output_pos(label_count, true_pos, false_pos):
+            max_label = 0
+            for (label, label_pos_num) in label_count.items():
+                if max_label < label:
+                    max_label = label
+
+            label_number = max_label + 1
+
+            out_class_pos_count = []
+            out_true_pos_lod = [0]
+            out_true_pos = []
+            out_false_pos_lod = [0]
+            out_false_pos = []
+
+            for i in range(label_number):
+                out_class_pos_count.append([label_count[i]])
+                true_pos_list = true_pos[i]
+                out_true_pos += true_pos_list
+                out_true_pos_lod.append(len(out_true_pos))
+                false_pos_list = false_pos[i]
+                out_false_pos += false_pos_list
+                out_false_pos_lod.append(len(out_false_pos))
+
+            return out_class_pos_count, out_true_pos, [
+                out_true_pos_lod
+            ], out_false_pos, [out_false_pos_lod]
+        # Running sum of the 2nd column, rows in descending 1st-column order.
+        def get_accumulation(pos_list):
+            sorted_list = sorted(pos_list, key=lambda pos: pos[0], reverse=True)
+            sum = 0
+            accu_list = []
+            for (score, count) in sorted_list:
+                sum += count
+                accu_list.append(sum)
+            return accu_list
+        # NOTE: the tf_pos_lod argument is not used anywhere in this method.
+        label_count, true_pos, false_pos = get_input_pos(
+            self.class_pos_count, self.true_pos, self.true_pos_lod,
+            self.false_pos, self.false_pos_lod)
+        for (label, difficult, xmin, ymin, xmax, ymax) in self.label:
+            if self.evaluate_difficult:
+                label_count[label] += 1
+            elif not difficult:
+                label_count[label] += 1
+        # NOTE(review): this discards get_input_pos's true_pos/false_pos; confirm.
+        true_pos = collections.defaultdict(list)
+        false_pos = collections.defaultdict(list)
+        for (label, score, tp, fp) in tf_pos:
+            true_pos[label].append([score, tp])
+            false_pos[label].append([score, fp])
+        # Labels with no positives or no recorded detections are skipped.
+        for (label, label_pos_num) in label_count.items():
+            if label_pos_num == 0 or label not in true_pos: continue
+            label_true_pos = true_pos[label]
+            label_false_pos = false_pos[label]
+
+            accu_tp_sum = get_accumulation(label_true_pos)
+            accu_fp_sum = get_accumulation(label_false_pos)
+
+            precision = []
+            recall = []
+
+            for i in range(len(accu_tp_sum)):
+                precision.append(
+                    float(accu_tp_sum[i]) /
+                    float(accu_tp_sum[i] + accu_fp_sum[i]))
+                recall.append(float(accu_tp_sum[i]) / label_pos_num)
+
+            if self.ap_type == "11point":
+                max_precisions = [0.0] * 11
+                start_idx = len(accu_tp_sum) - 1
+                for j in range(10, -1, -1):
+                    for i in range(start_idx, -1, -1):
+                        if recall[i] < float(j) / 10.0:
+                            start_idx = i
+                            if j > 0:
+                                max_precisions[j - 1] = max_precisions[j]
+                                break
+                        else:
+                            if max_precisions[j] < precision[i]:
+                                max_precisions[j] = precision[i]
+                for j in range(10, -1, -1):
+                    mAP += max_precisions[j] / 11
+                count += 1
+            elif self.ap_type == "integral":
+                average_precisions = 0.0
+                prev_recall = 0.0
+                for i in range(len(accu_tp_sum)):
+                    if math.fabs(recall[i] - prev_recall) > 1e-6:
+                        average_precisions += precision[i] * \
+                            math.fabs(recall[i] - prev_recall)
+                        prev_recall = recall[i]
+
+                mAP += average_precisions
+                count += 1
+        self.out_class_pos_count, self.out_true_pos, self.out_true_pos_lod, self.out_false_pos, self.out_false_pos_lod = get_output_pos(
+            label_count, true_pos, false_pos)
+        if count != 0:
+            mAP /= count
+        return mAP * 100.0
+
+    def setUp(self):
+        self.op_type = "detection_map"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpSkipDiff, self).init_test_case()
+        # Same data as the base case, but difficult labels are not counted.
+        self.evaluate_difficult = False
+        # The rows shrink from 7 to 6: the 0.9-score row is gone.
+        self.tf_pos_lod = [[0, 2, 6]]
+        # Each row: label, score, true_pos, false_pos.
+        self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0],
+                       [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
+
+
+class TestDetectionMAPOp11Point(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOp11Point, self).init_test_case()
+        # Select calc_map's "11point" branch instead of the base "integral".
+        self.ap_type = "11point"
+
+
+class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpMultiBatch, self).init_test_case()
+        self.class_pos_count = [0, 2, 1]
+        self.true_pos_lod = [[0, 0, 3, 5]]
+        self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
+        self.false_pos_lod = [[0, 0, 3, 5]]
+        self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index aea43c2517a02c72c1ee3307afdd3b21910f0064..50ef8204249250b5ca1555a5192bc3ed0ca108b9 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,8 +161,8 @@ class TestBook(unittest.TestCase):
                 label=label,
                 chunk_scheme="IOB",
                 num_chunk_types=(label_dict_len - 1) / 2)
-            self.assertNotEqual(crf, None)
-            self.assertNotEqual(crf_decode, None)
+            self.assertFalse(crf is None)
+            self.assertFalse(crf_decode is None)
 
         print(str(program))
 
@@ -309,6 +309,24 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_softmax_with_cross_entropy(self):
+        """softmax_with_cross_entropy can be appended to a fresh program."""
+        prog = Program()
+        with program_guard(prog):
+            feat = layers.data(name='x', shape=[16], dtype='float32')
+            tag = layers.data(name='label', shape=[1], dtype='int64')
+            self.assertIsNotNone(layers.softmax_with_cross_entropy(feat, tag))
+        print(str(prog))
+
+    def test_smooth_l1(self):
+        """smooth_l1 can be appended to a fresh program."""
+        prog = Program()
+        with program_guard(prog):
+            pred = layers.data(name='x', shape=[4], dtype='float32')
+            target = layers.data(name='label', shape=[4], dtype='float32')
+            self.assertIsNotNone(layers.smooth_l1(pred, target))
+        print(str(prog))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_prior_box_op.py b/python/paddle/v2/fluid/tests/test_prior_box_op.py
index ca8d2bca74ce2d4be8160c8851e393489691ae56..a6c21af49f63269720156ec833c94688d0e3230e 100644
--- a/python/paddle/v2/fluid/tests/test_prior_box_op.py
+++ b/python/paddle/v2/fluid/tests/test_prior_box_op.py
@@ -65,9 +65,9 @@ class TestPriorBoxOp(OpTest):
         self.batch_size = 10
 
         self.min_sizes = [2, 4]
-        self.min_sizes = np.array(self.min_sizes).astype('int64')
+        self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
         self.max_sizes = [5, 10]
-        self.max_sizes = np.array(self.max_sizes).astype('int64')
+        self.max_sizes = np.array(self.max_sizes).astype('float32').tolist()
         self.aspect_ratios = [2.0, 3.0]
         self.flip = True
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
diff --git a/python/paddle/v2/fluid/tests/test_python_operator_overriding.py b/python/paddle/v2/fluid/tests/test_python_operator_overriding.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5198ec17d027f007b4a831ef2e427481f8ff8c4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_python_operator_overriding.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid as fluid
+
+
+class TestPythonOperatorOverride(unittest.TestCase):
+    """Compare Variable's overloaded comparison operators against numpy."""
+
+    def check_result(self, fn, place, dtype):
+        """Apply `fn` to numpy arrays and to fluid variables; compare both."""
+        dims = [9, 10]
+        lhs = np.random.random(size=dims).astype(dtype)
+        rhs = np.random.random(size=dims).astype(dtype)
+        expected = fn(lhs, rhs)
+
+        def zero_global(var_name):
+            # Persistable global variable created with value 0.0; fed below.
+            return layers.create_global_var(
+                name=var_name, shape=dims, value=0.0, dtype=dtype,
+                persistable=True)
+
+        result_var = fn(zero_global('x'), zero_global('y'))
+        executor = fluid.Executor(place)
+        executor.run(fluid.default_startup_program())
+        fetched = executor.run(
+            fluid.default_main_program(),
+            feed={'x': lhs, 'y': rhs},
+            fetch_list=[result_var])
+
+        np.testing.assert_array_equal(expected, fetched[0])
+
+    def test_override(self):
+        """Run every comparison on each available place and dtype."""
+        comparisons = [
+            lambda lhs, rhs: lhs == rhs,
+            lambda lhs, rhs: lhs != rhs,
+            lambda lhs, rhs: lhs < rhs,
+            lambda lhs, rhs: lhs <= rhs,
+            lambda lhs, rhs: lhs > rhs,
+            lambda lhs, rhs: lhs >= rhs,
+        ]
+        devices = [fluid.CPUPlace()]
+        if fluid.core.is_compiled_with_cuda():
+            devices.append(fluid.CUDAPlace(0))
+
+        cases = [(dev, dt, cmp_fn)
+                 for dev in devices
+                 for dt in ('int32', 'float32')
+                 for cmp_fn in comparisons]
+        for dev, dt, cmp_fn in cases:
+            # Every case builds into its own pair of fresh programs.
+            with framework.program_guard(framework.Program(),
+                                         framework.Program()):
+                self.check_result(cmp_fn, dev, dt)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index 0f1b8331309248aaaf0ed32cf14c583a4cdb7437..93cab692e363cde43bdd4dd9ad20f4a2c06be121 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -52,3 +52,5 @@ RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/
 
 RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
     make -j `nproc` install <NCCL_MAKE_OPTS>  && cd .. && rm -rf nccl
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]