diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4e7d5c20db5fb95dfd5de05f8209608707b772c..7c7eb260aea8478f4833cb79253f4481e10b8685 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-if(NOT ANDROID AND NOT IOS)
-    find_package(Boost QUIET)
-endif()
 
 include(simd)
 
@@ -140,6 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/boost)     # download, build, install boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
 include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
-include_directories(${Boost_INCLUDE_DIRS})
 
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
diff --git a/Dockerfile b/Dockerfile
index 857d3f3e5f64791146741ffb29feabfcb2ecbb84..6ac9901ac6cea12e97047efdfb6272c957f166ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,7 @@ RUN apt-get update && \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
     automake locales clang-format swig doxygen cmake  \
-    liblapack-dev liblapacke-dev libboost-dev \
+    liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools libtool && \
     apt-get clean -y
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..137f11da7f2f1c46eebf6590d93402786ef543c9
--- /dev/null
+++ b/cmake/external/boost.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT       "extern_boost")
+set(BOOST_VER           "1.66.0")
+set(BOOST_TAR           "boost_1_66_0")
+set(BOOST_URL           "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+    ${BOOST_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
+                          && tar zxf ${BOOST_TAR}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    PREFIX                ${BOOST_SOURCES_DIR}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    INSTALL_COMMAND       ""
+    UPDATE_COMMAND        ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(boost STATIC ${dummyfile})
+else()
+    add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+list(APPEND external_project_dependencies boost)
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index d3a9f9423f59e9f88a6bb99943e77308c2e87b2e..25d28de0aad3b1788083c92c4adff8b9a86da9b1 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -18,6 +18,11 @@ dynamic_lstm
 ..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
     :noindex:
 
+dynamic_gru
+-----------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+
 data
 ----
 ..  autofunction:: paddle.v2.fluid.layers.data
@@ -500,6 +505,11 @@ swish
 ..  autofunction:: paddle.v2.fluid.layers.swish
     :noindex:
 
+im2sequence
+------
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
+    :noindex:
+
 edit_distance
 ---------------
 ..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
diff --git a/doc/design/csp.md b/doc/design/csp.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba9cacfdea7dcf7c6499b562dfc58400d082f2c8
--- /dev/null
+++ b/doc/design/csp.md
@@ -0,0 +1,96 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning.  Few example applications are:
+
+1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There were many concurrent programming models, implemented in various forms:
+
+| concurrent programming model | implementation |
+|-----|-----|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language.  In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs.  We can find the three ingredients, process with ID, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually the blocking queue.  In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor
+1. LoD Tensor,
+1. Tensor array, etc
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch  := make(chan int)       // a channel without buffer
+ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch  = fluid.make_chan(dtype=INT)
+ch1 = fluid.make_chan(dtype=INT, 100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_chan(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
+
+### Send and Recv
+
+### Select
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md
index 1094f06d461275a9ad4034d5e48b39856d967b71..805dd13048d41b995d2a01cda52b2ea33e4bbe1d 100644
--- a/doc/design/dist_refactor/parameter_server.md
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -9,16 +9,16 @@ different purposes.
 
 ## Background
 
-The previous implementations of the parameter server does not run a
+The previous implementations of the parameter server do not run a
 fluid sub-program. Parameter initialization, optimizer computation, network
 communication and checkpointing are implemented twice on both the
-trainer and the parameter server.
+trainer as well as the parameter server.
 
-It would be great if we can write code once and use them on both the
-trainer and the parameter server: reduces code duplication and
-improves extensibility. Given that after the current refactor, we are
-representing everything as a computing graph on the
-trainer. Representing everything as a computing graph on the parameter
+It would be great if we can write code once and use them on both: the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that after the current refactoring, we are
+representing everything as a computation graph on the
+trainer. Representing everything as a computation graph on the parameter
 server becomes a natural extension.
 
 ## Design
@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
 steps:
 
 1. OP placement: the OPs will be placed on different nodes according
-   to heuristic that minimizes estimated total computation
+   to a heuristic that minimizes the estimated total computation
    time. Currently we will use a simple heuristic that puts parameter
-   varable on parameter server workers and everything else on trainer
+   variable on parameter server workers and everything else on trainer
    workers.
 1. Add communication OPs to enable the communication between nodes.
 
@@ -47,22 +47,22 @@ After converting:
 
 <img src="src/dist-graph.png" width="700"/>
 
-1. The parameter variable W and it's optimizer program are placed on the parameter server.
+1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
    - *Send* sends data to the connected *Recv* operator.  The
 	 scheduler on the receive node will only schedule *Recv* operator
 	 to run when the *Send* operator has ran (the *Send* OP will mark
 	 the *Recv* OP runnable automatically).
-   - *Enueue* enqueues the input variable, it can block until space
+   - *Enqueue* enqueues the input variable, it can block until space
      become available in the queue.
    - *Dequeue* outputs configurable numbers of tensors from the
-     queue. It will block until the queue have the required number of
+     queue. It will block until the queue has the required number of
      tensors.
 
 
 ### Benefits
 
-- Model parallelism become easier to implement: it's an extension to
+- Model parallelism becomes easier to implement: it is an extension to
   the trainer - parameter server approach. We can have several "Transpilers"
   to achieve different goals.
 - User-defined optimizer is easier to add - user can now express it as
@@ -72,22 +72,22 @@ After converting:
 
 ### Challenges
 
-- It's important to balance the parameter shards of on multiple
-  parameter server. If a single parameter is very big (some
+- It is important to balance the parameter shards on multiple
+  parameter servers. If a single parameter is very big (for example: some
   word-embedding, fully connected, softmax layer), we need to
   automatically partition the single parameter onto different
   parameter servers when possible (only element-wise optimizer depends
   on the parameter variable).
-- In the "Aync SGD" figure, the "W" variable on the parameter server
-  could be read and wrote concurrently. See
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
   [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
-  details about concurrent program in fluid.
+  details about concurrent program in Fluid.
 
 ### Discussion
 
 - Can the Enqueue OP be implemented under our current tensor design
-  (puts the input tensor into the queue tensor)?
-- *Dequeue* OP will have variable numbers of output (depends on the
+  (put the input tensor into the queue tensor)?
+- *Dequeue* OP will have variable numbers of output (depending on the
   `min_count` attribute), does our current design support it? (similar
   question for the *Add* OP)
 
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
index 9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee..c4a9bbeeefca0e05c335dd60233691e8bac33015 100644
--- a/doc/design/ops/sequence_decoder.md
+++ b/doc/design/ops/sequence_decoder.md
@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
 The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
 let's call this format the **absolute-offset LoD** for clarity.
 
-The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
 ```python
 [[0, 3, 9]
  [0, 2, 3, 3, 3, 9]]
@@ -119,7 +119,7 @@ def generate():
         encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
         decoder_input = pd.fc(
             act=pd.activation.Linear(),
-            input=[target_word, encoder_ctx],
+            input=[target_word, encoder_ctx_expanded],
             size=3 * decoder_dim)
         gru_out, cur_mem = pd.gru_step(
             decoder_input, mem=decoder_mem, size=decoder_dim)
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index bae42593ddc6f7a7eb47d603752ad6efa9820b45..98fada7bdb46f4dd2927d6f93bcbcebbe7d18604 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -25,14 +25,14 @@
 
   .. code-block:: bash
 
-     docker pull docker.paddlepaddle.org/paddle
+     docker pull docker.paddlepaddlehub.com/paddle
 
 下载GPU版本（cuda8.0_cudnn5_avx_mkl）的Docker镜像：
 
   .. code-block:: bash
 
      docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddle.org/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 
 选择下载使用不同的BLAS库的Docker镜像：
 
@@ -49,7 +49,7 @@
 
      docker pull paddlepaddle/paddle:[tag]
      # 比如：
-     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 
 .. _docker_run:
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 56a7c68e4d39c45249fa55a964dc48b7081596a6..b1d0890b4cdddb77114a80276130afd07c22d270 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -26,14 +26,14 @@ For users in China, we provide a faster mirror:
 
   .. code-block:: bash
 
-     docker pull docker.paddlepaddle.org/paddle
+     docker pull docker.paddlepaddlehub.com/paddle
 
 Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
 
   .. code-block:: bash
 
      docker pull paddlepaddle/paddle:latest-gpu
-     docker pull docker.paddlepaddle.org/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 
 Choose between different BLAS version:
 
@@ -53,7 +53,7 @@ and run:
 
      docker pull paddlepaddle/paddle:[tag]
      # i.e.
-     docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 
 .. _docker_run:
 
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
index 1775374cf6e518586c28bbd8e04946c74df7e4c5..368af40cc7308cf6f4c609361078fe3ba02213ed 100644
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -60,8 +60,7 @@ each column is as follows:
 | column | meaning |
 | --- | --- |
 | ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the
- execution time of other functions called by the function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
 | percall | tottime divided by ncalls |
 | cumtime | the total execution time of the function, including the execution time of other functions being called |
 | percall | cumtime divided by ncalls |
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 4a98ede278fad85ff2beef3c8e7dd158912f693a..3f9c132ef6ae03c7614e10484715676c8019821e 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -18,7 +18,7 @@ else()
     add_subdirectory(capi)
   endif()
 
-  if(Boost_FOUND)
+  if(NOT ANDROID AND NOT IOS)
     add_subdirectory(memory)
     add_subdirectory(platform)
     add_subdirectory(framework)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index a912d8492fc6c23f88dd675694b805d0eda88335..b83007ac3bc6ed8713ca65fddabccfd292a2732f 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,7 +1,7 @@
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
-cc_library(ddim SRCS ddim.cc DEPS eigen3)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
         framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
 
-cc_library(attribute SRCS attribute.cc DEPS framework_proto)
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index b0fd4d2750eb2529706d871947332d39494505cd..5074e8f5a05ed4e824b3db7e506b30eb1b70c3fd 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
       }
       return val;
     }
+    case proto::AttrType::LONG: {
+      return attr_desc.l();
+    }
     default:
       PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
   }
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index c1c63d9cb13acb195b3bc3b30088f5fa7daf2a3d..bcff9bc4c48f8f233b7f811640c2789f9618a972 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
   const std::string& attr_name_;
 };
 
+template <>
+struct ExtractAttribute<int64_t> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  int64_t* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<int64_t>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      int val = boost::get<float>(attr);
+      attr = static_cast<int64_t>(val);
+    }
+    int64_t* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<int64_t>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 54498e175dacfa0a220e3d839f4feb02502b2c03..dd2ed87252102aee6d384f37365d19305f19b281 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -75,7 +75,7 @@ std::vector<VarDesc *> BlockDesc::AllVars() const {
 
 OpDesc *BlockDesc::AppendOp() {
   need_update_ = true;
-  ops_.emplace_back(new OpDesc());
+  ops_.emplace_back(new OpDesc(this));
   return ops_.back().get();
 }
 
@@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
 
 OpDesc *BlockDesc::PrependOp() {
   need_update_ = true;
-  ops_.emplace_front(new OpDesc());
+  ops_.emplace_front(new OpDesc(this));
   return ops_.front().get();
 }
 
@@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
     vars_[var_desc.name()].reset(new VarDesc(var_desc));
   }
   for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog));
+    ops_.emplace_back(new OpDesc(op_desc, prog, this));
   }
 }
 
@@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op));
+    ops_.emplace_back(new OpDesc(*op, this));
   }
 
   for (auto &it : other.vars_) {
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index 1382bfca19a674a404916a5c709276ce41219d2f..bd58c0a7f8161f6b45f2b500f3685e4028d97e96 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -116,8 +116,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
-    VLOG(3) << op->DebugStringEx(local_scope);
+    VLOG(4) << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
+    VLOG(3) << op->DebugStringEx(local_scope);
     if (FLAGS_do_memory_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index ea69b87e2ac7dc587333b623c310182bb39eb452..5b6ef03f610926578d2c02dcf06f399f106a30a1 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -26,6 +26,7 @@ enum AttrType {
   BOOLEAN = 6;
   BOOLEANS = 7;
   BLOCK = 8;
+  LONG = 9;
 }
 
 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -44,6 +45,7 @@ message OpDesc {
     optional bool b = 10;
     repeated bool bools = 11;
     optional int32 block_idx = 12;
+    optional int64 l = 13;
   };
 
   message Var {
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index b29f528f3f749efa3463125c774c2f4d4ebcbc7c..53b0d0fe083579da4f0bb600f292765aa2aa0d8a 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) {
   // the lowest level stores relative offsets
   if (in.empty() || in.size() == 1) return in;
   LoD result = in;
-  for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto &ele : result[level]) {
-      ele = result[level + 1][ele];
+  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
+    for (size_t i = 0; i < in[level].size(); ++i) {
+      size_t index = in[level][i];
+      result[level][i] = result[level + 1][index];
     }
   }
   return result;
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 1c0372bb16c04e155a68a0411939e4887322107a..f8df2cf97ad532f06cb1393b1a24cd789f8bde29 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
   need_update_ = true;
 }
 
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
     : desc_(desc), need_update_(false) {
   // restore inputs_
   int input_size = desc_.inputs_size();
@@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
       attrs_[attr_name] = prog->MutableBlock(bid);
     }
   }
+  this->block_ = block;
 }
 
 proto::OpDesc *OpDesc::Proto() {
@@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_bools());
   }
   void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
+  void operator()(int64_t v) const { attr_->set_l(v); }
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };
 
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index a5ffb162928bfd355d35d3f9b63aab59a88dd061..13695cff59f0bfd79c48eb28670ecc67a0309332 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -25,7 +25,6 @@ namespace framework {
 
 class BlockDesc;
 class ProgramDesc;
-
 class OpDesc {
  public:
   OpDesc() {}
@@ -33,7 +32,14 @@ class OpDesc {
   OpDesc(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const AttributeMap &attrs);
 
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+
+  explicit OpDesc(BlockDesc *block) : block_(block) {}
+
+  OpDesc(const OpDesc &other, BlockDesc *block) {
+    *this = other;
+    block_ = block;
+  }
 
   void CopyFrom(const OpDesc &op_desc);
 
@@ -117,6 +123,10 @@ class OpDesc {
 
   void Flush();
 
+  BlockDesc *Block() { return this->block_; }
+
+  void SetBlock(BlockDesc *block) { this->block_ = block; }
+
  private:
   template <typename MapType>
   static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
@@ -129,6 +139,7 @@ class OpDesc {
   }
 
   proto::OpDesc desc_;
+  BlockDesc *block_;  // not_own
   // input arg name => input variable names
   VariableNameMap inputs_;
   // output arg name => output variable names
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
index d834d343759fa279a1444c6337956ffce1b9061a..1eedbbc419ab660f5ce00aa891ef80ca245bc0a8 100644
--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
     boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                    std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*>;
+                   std::vector<bool>, BlockDesc*, int64_t>;
 
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index fc482c467404a6b9dfed64c43871d91d3d10c766..9316b14bb695c185efd6db4296d422ef0c476d57 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -66,6 +66,8 @@ class VarDesc {
 
   std::string Name() const { return desc_.name(); }
 
+  void SetName(std::string name) { desc_.set_name(name); }
+
   void SetShape(const std::vector<int64_t> &dims);
 
   void SetDataType(proto::DataType data_type);
diff --git a/paddle/framework/variable_test.cc b/paddle/framework/variable_test.cc
index e4732d9718e2b46a068963d44c4c1e04024f2330..e5585c8724d712e273d086001b6cbc3d59c46ebe 100644
--- a/paddle/framework/variable_test.cc
+++ b/paddle/framework/variable_test.cc
@@ -12,19 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-  http://www.apache.org/licenses/LICENSE-2.0
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
-
 #include <memory>
 #include <string>
 
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 337b9ba7bc0fc4e4bb80ee7b248d934f111379d5..8faf032f550836579522016b4fff3db7e94746e3 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
   if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
 
   // flip aspect ratios
-  for (int index = 0; index < tmp.size(); index++) {
+  for (unsigned index = 0; index < tmp.size(); index++) {
     real ar = tmp[index];
     if (fabs(ar - 1.) < 1e-6) continue;
     aspectRatio_.push_back(ar);
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 061ee1a4d4c97842efe6e64b89f09cfe5c65cd47..496098f80423854be62dc99b8601209ff6a6b182 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_subdirectory(detail)
 
 cc_library(memory SRCS memory.cc DEPS place enforce)
-cc_library(memcpy SRCS memcpy.cc)
+cc_library(memcpy SRCS memcpy.cc DEPS place)
 
 cc_library(paddle_memory
     DEPS
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
index 4c71d66d22899d2cf6418935bf9358a0f73cec27..844ade40eb2a7ae239b079daa609f03b9e7a06df 100644
--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -24,8 +24,18 @@ namespace operators {
 void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
                             framework::LoDTensor *selected_ids,
                             framework::LoDTensor *selected_scores) {
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+
   auto items = SelectTopBeamSizeItems();
-  auto selected_items = ToMap(items);
+  auto selected_items = ToMap(items, high_level.back());
+  VLOG(3) << "selected_items:";
+  for (size_t i = 0; i < selected_items.size(); ++i) {
+    VLOG(3) << "offset:" << i;
+    for (auto &item : selected_items[i]) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
   PruneEndidCandidates(pre_ids, &selected_items);
   // calculate the output tensor's height
   size_t num_instances = std::accumulate(
@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
   low_level.push_back(low_offset);
 
   // fill lod
-  auto abs_lod = framework::ToAbsOffset(ids_->lod());
-  auto &high_level = abs_lod[lod_level_];
   framework::LoD lod(2);
   lod[0].assign(high_level.begin(), high_level.end());
   lod[1].assign(low_level.begin(), low_level.end());
+  if (!framework::CheckLoD(lod)) {
+    PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+  }
   selected_ids->set_lod(lod);
   selected_scores->set_lod(lod);
 }
@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
 }
 
 std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
-    const std::vector<std::vector<Item>> &items) {
+    const std::vector<std::vector<Item>> &items, size_t element_num) {
   std::vector<std::vector<Item>> result;
+  result.resize(element_num);
   for (auto &entries : items) {
     for (const auto &item : entries) {
-      if (item.offset >= result.size()) {
-        result.resize(item.offset + 1);
-      }
       result[item.offset].push_back(item);
     }
   }
@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
     }
     result.emplace_back(items);
   }
+  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  for (auto &items : result) {
+    VLOG(3) << "item set:";
+    for (auto &item : items) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
+
   return result;
 }
 
@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
   return true;
 }
 
+std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
+  os << "{";
+  os << "offset: " << item.offset << ", ";
+  os << "id: " << item.id << ", ";
+  os << "score: " << item.score << "";
+  os << "}";
+
+  return os;
+}
+
+std::string ItemToString(const BeamSearch::Item &item) {
+  std::ostringstream stream;
+  stream << item;
+  return stream.str();
+}
+
 class BeamSearchProtoAndCheckerMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
   }
 };
 
+class BeamSearchInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    for (const std::string &arg :
+         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+      PADDLE_ENFORCE(context->HasInput(arg),
+                     "BeamSearch need input argument '%s'", arg);
+    }
+    for (const std::string &arg :
+         std::vector<std::string>({"selected_ids", "selected_scores"})) {
+      PADDLE_ENFORCE(context->HasOutput(arg),
+                     "BeamSearch need output argument '%s'", arg);
+    }
+  }
+};
+
+class BeamSearchInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o : op_desc.Output("selected_ids")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+    for (auto &o : op_desc.Output("selected_scores")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp,
-                             paddle::operators::BeamSearchProtoAndCheckerMaker);
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
+                  paddle::operators::BeamSearchProtoAndCheckerMaker,
+                  paddle::operators::BeamSearchInferShape,
+                  paddle::operators::BeamSearchInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
index 45d14d68fe8d1c4a84aa826e68e76692444765a8..7ad85874fcbd6ea48d688b32f2cc982d6b76d3c4 100644
--- a/paddle/operators/beam_search_op.h
+++ b/paddle/operators/beam_search_op.h
@@ -136,8 +136,6 @@ class BeamSearch {
   void operator()(const framework::LoDTensor& pre_ids,
                   framework::LoDTensor* selected_ids,
                   framework::LoDTensor* selected_scores);
-
- protected:
   /*
    * The basic items help to sort.
    */
@@ -155,6 +153,7 @@ class BeamSearch {
     score_t score;
   };
 
+ protected:
   /*
    * Delete all the records that follows the end token.
    */
@@ -166,7 +165,7 @@ class BeamSearch {
    * NOTE low performance
    */
   std::vector<std::vector<Item>> ToMap(
-      const std::vector<std::vector<Item>>& inputs);
+      const std::vector<std::vector<Item>>& inputs, size_t element_num);
 
   /*
    * For each source, select top beam_size records.
@@ -187,6 +186,10 @@ class BeamSearch {
   int end_id_{0};
 };
 
+std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
+
+std::string ItemToString(const BeamSearch::Item& item);
+
 class BeamSearchOp : public framework::OperatorBase {
  public:
   BeamSearchOp(const std::string& type,
@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {
 
   void Run(const framework::Scope& scope,
            const platform::Place& dev_place) const override {
-    LOG(INFO) << "run beam search op";
     auto ids_var = scope.FindVar(Input("ids"));
     auto scores_var = scope.FindVar(Input("scores"));
     auto pre_ids_var = scope.FindVar(Input("pre_ids"));
@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
     size_t level = Attr<int>("level");
     size_t beam_size = Attr<int>("beam_size");
     int end_id = Attr<int>("end_id");
-    LOG(INFO) << "init beam search";
     BeamSearch alg(ids, scores, level, beam_size, end_id);
 
-    LOG(INFO) << "after beam search";
     auto selected_ids_var = scope.FindVar(Output("selected_ids"));
     auto selected_scores_var = scope.FindVar(Output("selected_scores"));
     PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
         *selected_ids_var->GetMutable<framework::LoDTensor>();
     auto& selected_scores_tensor =
         *selected_scores_var->GetMutable<framework::LoDTensor>();
-    LOG(INFO) << "run beam search";
     alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
-    LOG(INFO) << "finish beam search";
   }
 };
 
diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc
index b0f7376d272a66e0b01d6b3f7e546372397772f7..83c8778fe4cec4d9d80de691e117a39fdd92f494 100644
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@@ -21,8 +21,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-constexpr char kEPS = 1e-6;
-
 class BipartiteMatchOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -46,6 +44,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
   // The match_dist must be initialized to 0 at first.
   void BipartiteMatch(const Tensor& dist, int* match_indices,
                       T* match_dist) const {
+    constexpr T kEPS = static_cast<T>(1e-6);
     PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
     int64_t row = dist.dims()[0];
     int64_t col = dist.dims()[1];
diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h
index 589413feb3dcbb7fea1f0a878b35d4bf714b5318..fed89aa1e899a2450b315f352b9695056ed13aec 100644
--- a/paddle/operators/ctc_align_op.h
+++ b/paddle/operators/ctc_align_op.h
@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> {
       T prev_token = -1;
       for (size_t i = input_lod[level][seq_idx];
            i < input_lod[level][seq_idx + 1]; ++i) {
-        if (input_data[i] != blank &&
+        if ((unsigned)input_data[i] != blank &&
             !(merge_repeated && input_data[i] == prev_token)) {
           output_data[output_idx] = input_data[i];
           ++output_idx;
diff --git a/paddle/operators/iou_similarity_op.cc b/paddle/operators/iou_similarity_op.cc
new file mode 100755
index 0000000000000000000000000000000000000000..c520b28b83e66dbf53d2e19985370be4a2f69e23
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IOUSimilarityOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IOUSimilarityOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of IOUSimilarityOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
+    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
+
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
+  }
+};
+
+class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
+             "each box is represented as [xmin, ymin, xmax, ymax], "
+             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
+             "coordinate of the box if the input is image feature map, they "
+             "are close to the origin of the coordinate system. "
+             "[xmax, ymax] is the right bottom coordinate of the box. "
+             "This tensor can contain LoD information to represent a batch "
+             "of inputs. One instance of this batch can contain different "
+             "numbers of entities.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) "
+             "Box list Y holds M boxes, each box is represented as "
+             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is image feature map, and [xmax, ymax] is the right "
+             "bottom coordinate of the box.");
+
+    AddOutput("Out",
+              "(LoDTensor, the lod is same as input X) The output of "
+              "iou_similarity op, a tensor with shape [N, M] "
+              "representing pairwise iou scores.");
+
+    AddComment(R"DOC(
+IOU Similarity Operator.
+Computes intersection-over-union (IOU) between two box lists.
+ Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+ boxes in 'Y' are shared by all instance of the batched inputs of X.
+ Given two boxes A and B, the calculation of IOU is as follows:
+
+$$
+IOU(A, B) = 
+\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+$$
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
+                             ops::IOUSimilarityOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/iou_similarity_op.cu b/paddle/operators/iou_similarity_op.cu
new file mode 100755
index 0000000000000000000000000000000000000000..fa5052624618c35875b241419946f69b776c81d4
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/iou_similarity_op.h b/paddle/operators/iou_similarity_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e36177069d7b18ea23759f99c4679218fbfd32b8
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/for_range.h"
+
+template <typename T>
+inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
+                                  T ymin2, T xmax2, T ymax2) {
+  constexpr T zero = static_cast<T>(0);
+  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
+  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
+  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
+  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
+  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
+  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
+  T inter_height = inter_ymax - inter_ymin;
+  T inter_width = inter_xmax - inter_xmin;
+  inter_height = inter_height > zero ? inter_height : zero;
+  inter_width = inter_width > zero ? inter_width : zero;
+  T inter_area = inter_width * inter_height;
+  T union_area = area1 + area2 - inter_area;
+  T sim_score = inter_area / union_area;
+  return sim_score;
+}
+
+template <typename T>
+struct IOUSimilarityFunctor {
+  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
+      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    T x_min1 = x_[row_id * 4];
+    T y_min1 = x_[row_id * 4 + 1];
+    T x_max1 = x_[row_id * 4 + 2];
+    T y_max1 = x_[row_id * 4 + 3];
+    for (size_t i = 0; i < cols_; ++i) {
+      T x_min2 = y_[i * 4];
+      T y_min2 = y_[i * 4 + 1];
+      T x_max2 = y_[i * 4 + 2];
+      T y_max2 = y_[i * 4 + 3];
+
+      T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
+                            x_max2, y_max2);
+
+      z_[row_id * cols_ + i] = sim;
+    }
+  }
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IOUSimilarityKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
+    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+
+    int x_n = in_x->dims()[0];
+    int y_n = in_y->dims()[0];
+    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
+                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
+
+    platform::ForRange<DeviceContext> for_range(
+        static_cast<const DeviceContext&>(ctx.device_context()), x_n);
+    for_range(functor);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index bb03def4391da80c6219f7863d300fd3c8d8c7ac..2405852f53d46356a474897d3a111d1c94eed081 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update")
         .SetDefault(false);
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(-1);
     AddComment(R"DOC(
 Lookup Table Operator.
 
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 261a28da694bf551d8d9e630139680aebc4be51a..d97390fa1c53fa0bdf16ab34cb209b994621f83c 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -21,9 +21,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
+          bool PaddingFlag>
 __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
-                            const int64_t N, const int64_t K, const int64_t D) {
+                            const int64_t N, const int64_t K, const int64_t D,
+                            const int64_t padding_idx) {
   int idx = threadIdx.x;
   int idy = blockIdx.x + threadIdx.y * GridDimX;
 
@@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
     T* out = output + idy * D;
     const T* tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
-      out[i] = tab[i];
+      if (PaddingFlag) {
+        if (id == padding_idx)
+          out[i] = static_cast<T>(0);
+        else
+          out[i] = tab[i];
+      } else {
+        out[i] = tab[i];
+      }
     }
     idy += BlockDimY * GridDimX;
   }
@@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     auto* table_t = context.Input<LoDTensor>("W");
     auto* ids_t = context.Input<LoDTensor>("Ids");
     auto* output_t = context.Output<LoDTensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
 
     size_t N = table_t->dims()[0];
     size_t D = table_t->dims()[1];
@@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
 
     dim3 threads(128, 8);
     dim3 grids(8, 1);
-    LookupTable<
-        T, 128, 8,
-        8><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-        output, table, ids, N, K, D);
+
+    if (padding_idx == -1)
+      LookupTable<
+          T, 128, 8, 8,
+          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+    else
+      LookupTable<
+          T, 128, 8, 8,
+          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
   }
 };
 
@@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto& dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
       auto* ids = context.Input<LoDTensor>("Ids");
       auto* table = context.Input<LoDTensor>("W");
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index 2fd3335868406455ec01f9ded6bacc7bda5e2a67..0842c422f7bfd3cad9b36dfdbab930f3cc4a8728 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto* table_t = context.Input<LoDTensor>("W");      // float tensor
     auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
     auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
 
     int N = table_t->dims()[0];
     int D = table_t->dims()[1];
     auto* ids = ids_t->data<int64_t>();
     auto* table = table_t->data<T>();
     auto* output = output_t->mutable_data<T>(context.GetPlace());
-    for (int64_t i = 0; i < ids_t->numel(); ++i) {
-      PADDLE_ENFORCE_LT(ids[i], N);
-      PADDLE_ENFORCE_GE(ids[i], 0);
-      memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+
+    if (padding_idx == -1) {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        PADDLE_ENFORCE_LT(ids[i], N);
+        PADDLE_ENFORCE_GE(ids[i], 0);
+        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+      }
+    } else {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        if (ids[i] == padding_idx) {
+          memset(output + i * D, 0, D * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], N);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        }
+      }
     }
   }
 };
@@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
       auto* ids = context.Input<LoDTensor>("Ids");
       auto* table = context.Input<LoDTensor>("W");
diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc
index 84ba3ead2b52547b989a4541f31ea31ffcce6c63..994ddf717e7a5b883d8071c6a47da0b4b4074f2e 100644
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -124,7 +124,8 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                               "This attribute only be used in unitest. Classes "
                               "in this list wiil be used as negative classes "
                               "for every samples. Under normal conditions, "
-                              "user should avoid setting this attribute.");
+                              "user should avoid setting this attribute.")
+        .SetDefault({});
     AddComment(R"DOC(
 Compute and return the noise-contrastive estimation training loss.
 See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
index e6b496f7896dcb412be8ff096fdccb2f0b682369..86fa13a649ce7fdcaad64e2609ceea2fb4d7e072 100644
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
@@ -197,7 +197,8 @@ class NCEGradKernel : public framework::OpKernel<T> {
     // get d_x
     auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
     if (d_x != nullptr) {
-      d_x->mutable_data<T>(context.GetPlace());
+      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
+      std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
       auto d_x_matrix = EigenMatrix<T>::From(*d_x);
       auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
index a00458ea068dd703d2c7f362511ed08bc212d2a8..09e808902f8fe3a7a07153d3432866c18e81dc7c 100644
--- a/paddle/operators/parallel_do_op.cc
+++ b/paddle/operators/parallel_do_op.cc
@@ -31,6 +31,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
 static constexpr char kParallelBlock[] = "sub_block";
 
 using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
 
 static void SplitTensorAndMoveTensorToScopes(
     const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
@@ -64,6 +65,30 @@ static void SplitTensorAndMoveTensorToScopes(
   }
 }
 
+inline void CopyOrShare(const framework::Variable &src,
+                        const platform::Place &dst_place,
+                        framework::Variable *dst) {
+  if (src.IsType<LoDTensor>()) {
+    if (src.Get<LoDTensor>().place() == dst_place) {
+      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+    } else {
+      Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+    }
+  } else if (src.IsType<SelectedRows>()) {
+    auto &src_sr = src.Get<SelectedRows>();
+    auto *dst_sr = dst->GetMutable<SelectedRows>();
+    dst_sr->set_rows(src_sr.rows());
+    dst_sr->set_height(src_sr.height());
+    if (src_sr.value().place() == dst_place) {
+      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+    } else {
+      Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+    }
+  } else {
+    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
+  }
+}
+
 void WaitOnPlace(const platform::Place place) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &dev_ctx = *pool.Get(place);
@@ -210,30 +235,30 @@ class ParallelDoGradOp : public framework::OperatorBase {
     }
     WaitOnPlaces(places);
 
-    // merge grad
+    AccumulateGrad(scope, place, sub_scopes, places);
+  }
+
+  void AccumulateGrad(const framework::Scope &scope,
+                      const platform::Place &place,
+                      const std::vector<framework::Scope *> &sub_scopes,
+                      const platform::PlaceList &places) const {
     for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      auto &result = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
       std::string tmp_name;
-      auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable<LoDTensor>();
+      auto *tmp = sub_scopes[0]->Var(&tmp_name);
 
       for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>();
-        if (!(places[i] == places[0])) {
-          framework::Copy(tensor_to_merge, places[0], tmp);
-          WaitOnPlace(places[0]);
-        } else {
-          tmp->ShareDataWith(tensor_to_merge);
-        }
+        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
+        WaitOnPlace(places[0]);
 
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
             framework::AttributeMap{});
+        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
       }
 
-      VLOG(3) << result;
-      framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
+      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
     }
     WaitOnPlaces(places);
   }
@@ -289,7 +314,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
 
     PADDLE_ENFORCE(ctx->HasInputs(kParameters));
     PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
-    PADDLE_ENFORCE(ctx->HasInput(kInputs));
+    PADDLE_ENFORCE(ctx->HasInputs(kInputs));
 
     for (auto &s : output) {
       PADDLE_ENFORCE(ctx->HasInputs(s));
diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..105ff4ac3e3ba889aad880f4204af15829c6da47
--- /dev/null
+++ b/paddle/operators/prior_box_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PriorBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of PriorBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Image"),
+                   "Input(Image) of PriorBoxOp should not be null.");
+
+    auto image_dims = ctx->GetInputDim("Image");
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
+                      "The height of input must smaller than image.");
+
+    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
+                      "The width of input must smaller than image.");
+
+    auto min_sizes = ctx->Attrs().Get<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx->Attrs().Get<std::vector<int>>("max_sizes");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    bool flip = ctx->Attrs().Get<bool>("flip");
+
+    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                      "Size of min_sizes must be at least 1.");
+    for (size_t i = 0; i < min_sizes.size(); ++i) {
+      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
+    }
+
+    std::vector<float> aspect_ratios_vec;
+    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+
+    int num_priors = aspect_ratios_vec.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
+                        "The number of min_size and max_size must be equal.");
+      for (size_t i = 0; i < min_sizes.size(); ++i) {
+        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
+                          "max_size[%d] must be greater than min_size[%d].", i,
+                          i);
+        num_priors += 1;
+      }
+    }
+
+    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
+    for (size_t i = 0; i < variances.size(); ++i) {
+      PADDLE_ENFORCE_GT(variances[i], 0.0,
+                        "variance[%d] must be greater than 0.", i);
+    }
+
+    const float step_h = ctx->Attrs().Get<float>("step_h");
+    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+    const float step_w = ctx->Attrs().Get<float>("step_w");
+    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_priors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+};
+
+class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PriorBoxOp, The layout is NCHW.");
+    AddInput("Image",
+             "(Tensor, default Tensor<float>), "
+             "the input image data of PriorBoxOp, The layout is NCHW.");
+    AddOutput("Boxes",
+              "(Tensor, default Tensor<float>), the output prior boxes of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
+                              "List of min sizes of generated prior boxes.");
+    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
+                              "List of max sizes of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "aspect_ratios", "(vector<float>) ",
+        "List of aspect ratios of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "variances", "(vector<float>) ",
+        "List of variances to be encoded in prior boxes.");
+    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        .SetDefault(true);
+    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+        .SetDefault(true);
+    AddAttr<float>("step_w",
+                   "Prior boxes step across width, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("step_h",
+                   "Prior boxes step across height, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Prior boxes center offset.")
+        .SetDefault(0.5);
+    AddComment(R"DOC(
+Prior box operator
+Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+Each position of the input produce N prior boxes, N is determined by
+ the count of min_sizes, max_sizes and aspect_ratios, The size of the
+ box is in range(min_size, max_size) interval, which is generated in
+ sequence according to the aspect_ratios.
+
+Please get more information from the following papers:
+https://arxiv.org/abs/1512.02325.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
+    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0a663ace8f38c2d08fd4714c1247d3313ffae3e
--- /dev/null
+++ b/paddle/operators/prior_box_op.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+                               bool flip,
+                               std::vector<float>& output_aspect_ratior) {
+  constexpr float epsilon = 1e-6;
+  output_aspect_ratior.clear();
+  output_aspect_ratior.push_back(1.);
+  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
+    float ar = input_aspect_ratior[i];
+    bool already_exist = false;
+    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
+      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+        already_exist = true;
+        break;
+      }
+    }
+    if (!already_exist) {
+      output_aspect_ratior.push_back(ar);
+      if (flip) {
+        output_aspect_ratior.push_back(1. / ar);
+      }
+    }
+  }
+}
+
+template <typename T>
+struct ClipFunctor {
+  HOSTDEVICE T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename Place, typename T>
+class PriorBoxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<int>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    for (int h = 0; h < feature_height; ++h) {
+      for (int w = 0; w < feature_width; ++w) {
+        T center_x = (w + offset) * step_width;
+        T center_y = (h + offset) * step_height;
+        T box_width, box_height;
+        int idx = 0;
+        for (size_t s = 0; s < min_sizes.size(); ++s) {
+          int min_size = min_sizes[s];
+          // first prior: aspect_ratio = 1, size = min_size
+          box_width = box_height = min_size;
+          // xmin
+          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          // ymin
+          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          // xmax
+          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          // ymax
+          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+
+          idx++;
+          if (max_sizes.size() > 0) {
+            int max_size = max_sizes[s];
+            // second prior: aspect_ratio = 1,
+            // size = sqrt(min_size * max_size)
+            box_width = box_height = sqrt(min_size * max_size);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+
+          // rest of priors
+          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+            float ar = aspect_ratios[r];
+            if (fabs(ar - 1.) < 1e-6) {
+              continue;
+            }
+            box_width = min_size * sqrt(ar);
+            box_height = min_size / sqrt(ar);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+        }
+      }
+    }
+
+    if (clip) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      ClipFunctor<T> clip_func;
+      trans(ctx.template device_context<platform::CPUDeviceContext>(),
+            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
+            boxes->data<T>(), clip_func);
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int box_num = feature_height * feature_width * num_priors;
+    auto var_dim = vars->dims();
+    vars->Resize({box_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_expand_op.h b/paddle/operators/sequence_expand_op.h
index 2ba628e9c37278025e31779ab0468db46f2ff40a..6021526eee8e0a1f58885f6de38b14048787a828 100644
--- a/paddle/operators/sequence_expand_op.h
+++ b/paddle/operators/sequence_expand_op.h
@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
+    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
     PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
                       y->lod().back().size() - 1,
                       "The size of last lod level in Input(Y)"
diff --git a/paddle/operators/sequence_reshape_op.h b/paddle/operators/sequence_reshape_op.h
index c6f528ab8a73294bb8ee91425f34e44c66f1932c..aaae7ab29281b72848515b80cc60931c13a294c9 100644
--- a/paddle/operators/sequence_reshape_op.h
+++ b/paddle/operators/sequence_reshape_op.h
@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
                       "Only support one level sequence now.");
     PADDLE_ENFORCE_EQ(
-        in_dims[0], in_lod[0].back(),
+        (uint64_t)in_dims[0], in_lod[0].back(),
         "Inconsistent size between X.shape[0] and X.lod()[0].back().");
 
     auto in_lod_l0 = in_lod[0];
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
index e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff..bf42e15e6b234125d9ec24e8500367b9915213ab 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -22,6 +22,7 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
     // FIXME: only deal with matrix(2d tensor).
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
+    auto* input = ctx.Input<LoDTensor>("X");
+    auto* output = ctx.Output<LoDTensor>("Out");
+    auto* indices = ctx.Output<LoDTensor>("Indices");
     // k is determined by Attr
     const size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 3742594a504ed728019ac9665c022503748bea01..d68caea99719b37816391f9bddcc5cac051025b2 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
-cc_library(place SRCS place.cc DEPS enforce)
+cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 add_subdirectory(dynload)
diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc
index 99694fa592059d979297b72748125d02b2dd70a3..b55ddee17616ced4de659be8e55acd5e072c66b7 100644
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) {
       return "bool array";
     case paddle::framework::proto::BLOCK:
       return "block id";
+    case paddle::framework::proto::LONG:
+      return "long";
   }
   return "UNKNOWN";  // not possible
 }
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 4f959481537d29c089be24f9ae306f860c196c0f..371d6119d4ab73e683821d0dc5db5194f44a64ce 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -212,6 +212,7 @@ void BindVarDsec(py::module &m) {
              return name;
            },
            py::return_value_policy::reference)
+      .def("set_name", &VarDesc::SetName)
       .def("set_shape", &VarDesc::SetShape)
       .def("set_dtype", &VarDesc::SetDataType)
       .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
@@ -280,7 +281,8 @@ void BindOpDesc(py::module &m) {
       .def("check_attrs", &OpDesc::CheckAttrs)
       .def("infer_shape", &OpDesc::InferShape)
       .def("infer_var_type", &OpDesc::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDesc>);
+      .def("serialize_to_string", SerializeMessage<OpDesc>)
+      .def("block", &OpDesc::Block, py::return_value_policy::reference);
 }
 
 }  // namespace pybind
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
index e2f463be2f7bcd667855f64206d78f387e92ef33..c8818f715beadd9499ae588f2c19a57fbf26f372 100644
--- a/python/paddle/v2/dataset/wmt16.py
+++ b/python/paddle/v2/dataset/wmt16.py
@@ -305,9 +305,9 @@ def get_dict(lang, dict_size, reverse=False):
 
     dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
                              "wmt16/%s_%d.dict" % (lang, dict_size))
-    assert (os.path.exists(dict_path), "Word dictionary does not exist. "
-            "Please invoke paddle.dataset.wmt16.train/test/validation "
-            "first to build the dictionary.")
+    assert os.path.exists(dict_path), "Word dictionary does not exist. "
+    "Please invoke paddle.dataset.wmt16.train/test/validation first "
+    "to build the dictionary."
     tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
     return __load_dict(tar_file, dict_size, lang, reverse)
 
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 0b0064ade90d2b70dd1458cb4d20d741fbf1efcd..8c481444e9bf895d29ed4e4952e825c2eaafc915 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -100,7 +100,8 @@ class LayerHelper(object):
             if dtype is None:
                 dtype = each.dtype
             elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch")
+                raise ValueError("Data Type mismatch: %d to %d" %
+                                 (dtype, each.dtype))
         return dtype
 
     def create_parameter(self,
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 5f01fdb076d3bf7d060a805d1431f4973993a843..0fcbfe0e2f2f9686366139e84b7fdcc158bf0aa7 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -289,6 +289,7 @@ class ParallelDo(object):
                 for in_var_name in op.input(iname):
                     if in_var_name not in local_inputs:
                         params.append(in_var_name)
+        params = list(set(params))
 
         return [parent_block.var(name) for name in params]
 
@@ -769,7 +770,7 @@ def topk(input, k):
           array = fluid.layers.topk(x, k)
     """
     helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.data_type)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
     topk_indices = helper.create_tmp_variable(dtype='int64')
     helper.append_op(
         type='top_k',
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 8a29a7ba7bd8afde7040b5c6e8b6d45bbb5ec77a..10a126933472d2f09d39f34ac9389ea8cfe315b4 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -19,12 +19,14 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
+from layer_function_generator import autodoc
 from tensor import concat
 
 __all__ = [
     'fc',
     'embedding',
     'dynamic_lstm',
+    'dynamic_gru',
     'gru_unit',
     'linear_chain_crf',
     'crf_decoding',
@@ -57,6 +59,9 @@ __all__ = [
     'warpctc',
     'sequence_reshape',
     'transpose',
+    'im2sequence',
+    'nce',
+    'beam_search',
     'row_conv',
 ]
 
@@ -160,10 +165,8 @@ def fc(input,
         tmp = helper.create_tmp_variable(dtype)
         helper.append_op(
             type="mul",
-            inputs={
-                "X": input_var,
-                "Y": w,
-            },
+            inputs={"X": input_var,
+                    "Y": w},
             outputs={"Out": tmp},
             attrs={"x_num_col_dims": num_flatten_dims,
                    "y_num_col_dims": 1})
@@ -182,22 +185,35 @@ def fc(input,
     return helper.append_activation(pre_activation)
 
 
-def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
+def embedding(input,
+              size,
+              is_sparse=False,
+              padding_idx=None,
+              param_attr=None,
+              dtype='float32'):
     """
     **Embedding Layer**
 
-    This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table.
-    The result of this lookup is the embedding of each ID in the *input*.
+    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
+    a lookup table. The result of this lookup is the embedding of each ID in the
+    :attr:`input`.
 
     All the input variables are passed in as local variables to the LayerHelper
     constructor.
 
     Args:
-       input(Variable): Input to the function
-       size(tuple|list|None): Shape of the look up table parameter
-       is_sparse(bool): Boolean flag that specifying whether the input is sparse
-       param_attr(ParamAttr): Parameters for this layer
-       dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
+        input(Variable): The tensor variable containing the IDs.
+        size(tuple|list): The shape of the look up table parameter. It should
+            have two elements which indicate the size of the dictionary of
+            embeddings and the size of each embedding vector respectively.
+        is_sparse(bool): The flag indicating whether to use sparse update.
+        padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
+            Otherwise the given :attr:`padding_idx` indicates padding the output
+            with zeros whenever lookup encounters it in :attr:`input`. If
+            :math:`padding_idx < 0`, the padding_idx to use in lookup is
+            :math:`size[0] + dim`.
+        param_attr(ParamAttr): Parameters for this layer
+        dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
 
     Returns:
         Variable: The tensor variable storing the embeddings of the \
@@ -215,12 +231,15 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
     tmp = helper.create_tmp_variable(dtype)
+    padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
+        size[0] + padding_idx)
     helper.append_op(
         type='lookup_table',
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse})
+        attrs={'is_sparse': is_sparse,
+               'padding_idx': padding_idx})
     return tmp
 
 
@@ -367,6 +386,113 @@ def dynamic_lstm(input,
     return hidden, cell
 
 
+def dynamic_gru(input,
+                size,
+                param_attr=None,
+                bias_attr=None,
+                is_reverse=False,
+                gate_activation='sigmoid',
+                candidate_activation='tanh',
+                h_0=None):
+    """
+    **Dynamic GRU Layer**
+
+    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
+    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
+
+    The formula is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
+
+    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
+    is the update gate and reset gate activation function and :math:`sigmoid`
+    is usually used for it. :math:`act_c` is the activation function for
+    candidate hidden state and :math:`tanh` is usually used for it.
+
+    Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
+    the input :math:`x_{t}` are NOT included in this operator. Users can choose
+    to use fully-connect layer before GRU layer.
+
+    Args:
+        input(Variable): The input of dynamic_gru layer, which supports
+            variable-time length input sequence. The underlying tensor in this
+            Variable is a matrix with shape :math:`(T \\times 3D)`, where
+            :math:`T` is the total time steps in this mini-batch, :math:`D`
+            is the hidden size.
+        size(int): The dimension of the gru cell.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(T \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+        bias_attr(ParamAttr): The parameter attribute for learnable the
+            hidden-hidden bias.
+        is_reverse(bool): Whether to compute reversed GRU, default
+            :attr:`False`.
+        gate_activation(str): The activation for update gate and reset gate.
+            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
+        activation(str): The activation for candidate hidden state.
+            Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+
+    Returns:
+        Variable: The hidden state of GRU. The shape is (T \\times D), and lod \
+            is the same with the input.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
+            hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim)
+    """
+
+    helper = LayerHelper('gru', **locals())
+    dtype = helper.input_dtype()
+
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    if h_0 != None:
+        assert h_0.shape == (
+            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
+        inputs['h0'] = h_0
+
+    hidden = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_reset_hidden_prev = helper.create_tmp_variable(dtype)
+    batch_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru',
+        inputs=inputs,
+        outputs={
+            'Hidden': hidden,
+            'BatchGate': batch_gate,
+            'BatchResetHiddenPrev': batch_reset_hidden_prev,
+            'BatchHidden': batch_hidden
+        },
+        attrs={
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'activation': candidate_activation
+        })
+    return hidden
+
+
 def gru_unit(input,
              hidden,
              size,
@@ -1425,6 +1551,38 @@ def sequence_expand(x, y, name=None):
     return tmp
 
 
+def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
+    '''
+    This function implements the beam search algorithm.
+    '''
+    helper = LayerHelper('beam_search', **locals())
+    score_type = scores.dtype
+    id_type = ids.dtype
+
+    selected_scores = helper.create_tmp_variable(dtype=score_type)
+    selected_ids = helper.create_tmp_variable(dtype=id_type)
+
+    helper.append_op(
+        type='beam_search',
+        inputs={
+            'pre_ids': pre_ids,
+            'ids': ids,
+            'scores': scores,
+        },
+        outputs={
+            'selected_ids': selected_ids,
+            'selected_scores': selected_scores,
+        },
+        attrs={
+            # TODO(ChunweiYan) to assure other value support
+            'level': level,
+            'beam_size': beam_size,
+            'end_id': end_id,
+        })
+
+    return selected_ids, selected_scores
+
+
 def lstm_unit(x_t,
               hidden_t_prev,
               cell_t_prev,
@@ -2191,6 +2349,61 @@ def sequence_reshape(input, new_dim):
     return out
 
 
+@autodoc()
+def nce(input,
+        label,
+        num_total_classes,
+        sample_weight=None,
+        param_attr=None,
+        bias_attr=None,
+        num_neg_samples=None):
+    helper = LayerHelper('nce', **locals())
+    assert isinstance(input, Variable)
+    dim = input.shape[1]
+    assert isinstance(label, Variable)
+    num_true_class = label.shape[1]
+    w = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[num_total_classes, dim],
+        is_bias=False,
+        dtype=input.dtype)
+    b = helper.create_parameter(
+        attr=helper.bias_attr,
+        shape=[num_total_classes, 1],
+        is_bias=True,
+        dtype=input.dtype)
+    cost = helper.create_tmp_variable(dtype=input.dtype)
+    sample_logits = helper.create_tmp_variable(dtype=input.dtype)
+    sample_labels = helper.create_tmp_variable(dtype=label.dtype)
+
+    if num_neg_samples is None:
+        num_neg_samples = 10
+    else:
+        num_neg_samples = int(num_neg_samples)
+
+    attrs = {
+        'num_total_classes': int(num_total_classes),
+        'num_neg_samples': num_neg_samples
+    }
+
+    helper.append_op(
+        type='nce',
+        inputs={
+            'Input': input,
+            'Label': label,
+            'Weight': w,
+            'Bias': b,
+            'SampleWeight': sample_weight if sample_weight is not None else []
+        },
+        outputs={
+            'Cost': cost,
+            'SampleLogits': sample_logits,
+            'SampleLabels': sample_labels
+        },
+        attrs=attrs)
+    return cost / (num_neg_samples + 1)
+
+
 def transpose(x, perm, name=None):
     """
     **transpose Layer**
@@ -2229,6 +2442,131 @@ def transpose(x, perm, name=None):
     return out
 
 
+def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+    """
+    Extracts image patches from the input tensor to form a tensor of shape
+    {input.batch_size * output_height * output_width, filter_size_H *
+    filter_size_W * input.channels} which is similar with im2col.
+    This op use filter / kernel to scan images and convert these images to
+    sequences. After expanding, the number of time step are
+    output_height * output_width for an image, in which output_height and
+    output_width are calculated by below equation:
+
+    .. math::
+
+        output\_size = 1 + \
+            (2 * padding + img\_size - block\_size + stride - 1) / stride
+
+    And the dimension of each time step is block_y * block_x * input.channels.
+
+    Args:
+        input (Variable): The input should be a tensor in NCHW format.
+
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+
+        padding(int|tuple): The padding size. If padding is a tuple, it can
+            contain two integers like (padding_H, padding_W) which means
+            padding_up = padding_down = padding_H and
+            padding_left = padding_right = padding_W. Or it can use
+            (padding_up, padding_left, padding_down, padding_right) to indicate
+            paddings of four direction. Otherwise, a scalar padding means
+            padding_up = padding_down = padding_left = padding_right = padding
+            Default: padding = 0.
+
+        name (int): The name of this layer. It is optional.
+
+    Returns:
+        output: The output is a LoDTensor with shape
+        {input.batch_size * output_height * output_width,
+        filter_size_H * filter_size_W * input.channels}.
+        If we regard output as a matrix, each row of this matrix is
+        a step of a sequence.
+
+    Examples:
+
+    As an example:
+
+        .. code-block:: text
+
+            Given:
+
+            x = [[[[ 6.  2.  1.]
+                   [ 8.  3.  5.]
+                   [ 0.  2.  6.]]
+
+                  [[ 2.  4.  4.]
+                   [ 6.  3.  0.]
+                   [ 6.  4.  7.]]]
+
+                 [[[ 6.  7.  1.]
+                   [ 5.  7.  9.]
+                   [ 2.  4.  8.]]
+
+                  [[ 1.  2.  1.]
+                   [ 1.  3.  5.]
+                   [ 9.  0.  8.]]]]
+
+            x.dims = {2, 2, 3, 3}
+
+            And:
+
+            filter = [2, 2]
+            stride = [1, 1]
+            padding = [0, 0]
+
+            Then:
+
+            output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
+                           [ 2.  1.  3.  5.  4.  4.  3.  0.]
+                           [ 8.  3.  0.  2.  6.  3.  6.  4.]
+                           [ 3.  5.  2.  6.  3.  0.  4.  7.]
+                           [ 6.  7.  5.  7.  1.  2.  1.  3.]
+                           [ 7.  1.  7.  9.  2.  1.  3.  5.]
+                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
+                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]
+
+            output.dims = {8, 9}
+
+            output.lod = [[0, 4, 8]]
+
+        The simple usage is:
+
+        .. code-block:: python
+
+            output = fluid.layers.im2sequence(input=layer, stride=[1, 1], filter_size=[2, 2])
+
+    """
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+    if len(padding) == 2:
+        padding.append(padding[0])
+        padding.append(padding[1])
+
+    helper = LayerHelper('im2sequence', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='im2sequence',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'kernels': filter_size,
+            'strides': stride,
+            'paddings': padding,
+        })
+    return out
+
+
 def row_conv(input, future_context_size, param_attr=None, act=None):
     """Row Conv Operator. This layer will apply lookahead convolution to
     **input**. The input variable should be a 2D LoDTensor with shape [T, D].
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 1b4b64755963b5edc3d07d861c2a9b6cc3f23587..956c5b66da28fd8e74d4fd12f249688daa72d8ac 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -31,10 +31,12 @@ dtype_to_size = {
 
 
 class ControlFlowGraph(object):
-    def __init__(self, Program):
+    def __init__(self, Program, ops, forward_num):
         self._program = Program
-        self._succesors = defaultdict(set)
-        self._presucessors = defaultdict(set)
+        self._ops = ops
+        self._forward_num = forward_num
+        self._successors = defaultdict(set)
+        self._presuccessors = defaultdict(set)
         self._uses = defaultdict(set)
         self._defs = defaultdict(set)
         self._live_in = defaultdict(set)
@@ -45,25 +47,16 @@ class ControlFlowGraph(object):
             self._add(node1, node2)
 
     def _add(self, node1, node2):
-        self._succesors[node1].add(node2)
-        self._presucessors[node2].add(node1)
+        self._successors[node1].add(node2)
+        self._presuccessors[node2].add(node1)
 
     def _build_graph(self):
-        program_desc = self._program.get_desc()
-        block_size = program_desc.num_blocks()
-
-        # TODO(qijun) handle Program with if/while operators
-        self.global_block_desc = program_desc.block(0)
-        self.op_size = self.global_block_desc.op_size()
-
+        self.op_size = len(self._ops)
         op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
         self._add_connections(op_node_connections)
-
-        self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)]
-
         for i in range(self.op_size):
-            self._uses[i].update(self.ops[i].input_arg_names())
-            self._defs[i].update(self.ops[i].output_arg_names())
+            self._uses[i].update(self._ops[i].input_arg_names())
+            self._defs[i].update(self._ops[i].output_arg_names())
 
     def _update_graph(self, old_name, new_name, begin_idx=0):
         for i in range(begin_idx, self.op_size):
@@ -103,7 +96,7 @@ class ControlFlowGraph(object):
                 live_out[i] = set(self._live_out[i])
                 self._live_in[i] = self._uses[i] | (
                     self._live_out[i] - self._defs[i])
-                for s in self._succesors[i]:
+                for s in self._successors[i]:
                     self._live_out[i] |= self._live_in[s]
 
             if self._reach_fixed_point(live_in, live_out):
@@ -113,39 +106,76 @@ class ControlFlowGraph(object):
         u = a & b
         return a - u, b - u
 
+    def _has_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.has_var(str(var_name))
+        else:
+            return block_desc.has_var_recursive(str(var_name))
+
+    def _find_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.find_var(str(var_name))
+        else:
+            return block_desc.find_var_recursive(str(var_name))
+
     def memory_optimize(self):
+        def check_var_validity(block_desc, x, is_forward):
+            if str(x) == "@EMPTY@":
+                return False
+            if not self._has_var(block_desc, x, is_forward):
+                return False
+            if self._find_var(block_desc, x, is_forward).persistable():
+                return False
+            if self._find_var(
+                    block_desc, x,
+                    is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
+                return False
+            return True
+
         self._build_graph()
         self._dataflow_analyze()
         self.pool = []
         for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() == "while" or op.type() == "while_grad":
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
             if self.pool:
-                out_pair = [(x, self.global_block_desc.var(str(x)).shape())
-                            for x in self._defs[i]]
+                defs_can_optimize = filter(
+                    lambda x: check_var_validity(block_desc, x, is_forward),
+                    self._defs[i])
+                out_pair = [
+                    (x, self._find_var(block_desc, x, is_forward).shape())
+                    for x in defs_can_optimize
+                ]
                 for x, x_shape in out_pair:
-                    if not self.global_block_desc.var(str(x)).persistable():
-                        for index, cache_pair in enumerate(self.pool):
-                            cache_var = cache_pair[0]
-                            cache_shape = cache_pair[1]
-                            if x_shape == cache_shape:
-                                x_dtype = self.global_block_desc.var(str(
-                                    x)).dtype()
-                                cache_dtype = self.global_block_desc.var(
-                                    str(cache_var)).dtype()
+                    for index, cache_pair in enumerate(self.pool):
+                        cache_var = cache_pair[0]
+                        cache_shape = cache_pair[1]
+                        if x_shape == cache_shape:
+                            if self._has_var(block_desc, cache_var, is_forward):
+                                x_dtype = self._find_var(block_desc, x,
+                                                         is_forward).dtype()
+                                cache_dtype = self._find_var(
+                                    block_desc, cache_var, is_forward).dtype()
                                 # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
                                 # and dtype_to_size[cache_dtype]
                                 if x_dtype == cache_dtype:
-                                    print(
-                                        ("Hit Cache !!!! cache pool index "
-                                         "is %d, var name is %s, "
-                                         "cached var name is %s, "
-                                         "var shape is %s ") %
-                                        (index, x, cache_var, str(cache_shape)))
+                                    print(("Hit Cache !!!! cache pool index "
+                                           "is %d, var name is %s, "
+                                           "cached var name is %s, "
+                                           "var shape is %s ") %
+                                          (index, x, cache_var,
+                                           str(cache_shape)))
                                     self.pool.pop(index)
+                                    if x == cache_var:
+                                        break
                                     _rename_arg_(
-                                        self.ops, x, cache_var, begin_idx=i)
-                                    self._program.current_block().var(str(
-                                        x)).desc = self.global_block_desc.var(
-                                            str(cache_var))
+                                        self._ops, x, cache_var, begin_idx=i)
+                                    self._program.block(block_desc.id).var(
+                                        str(x)).desc = self._find_var(
+                                            block_desc, cache_var, is_forward)
                                     self._update_graph(
                                         x, cache_var, begin_idx=i)
                                     break
@@ -153,20 +183,70 @@ class ControlFlowGraph(object):
             in_diff, out_diff = self._get_diff(self._live_in[i],
                                                self._live_out[i])
             can_optimize = filter(
-                lambda x: not self.global_block_desc.var(str(x)).persistable(),
+                lambda x: check_var_validity(block_desc, x, is_forward),
                 in_diff)
             if can_optimize:
                 for var_name in can_optimize:
-                    self.pool.append(
-                        (var_name,
-                         self.global_block_desc.var(str(var_name)).shape()))
-
-    def get_program(self):
-        return self._program
+                    self.pool.append((var_name, self._find_var(
+                        block_desc, var_name, is_forward).shape()))
+
+
+def get_cfgs(input_program):
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+
+    while_sub_block_ids = []
+    while_grad_sub_block_ids = []
+    while_pair = []
+
+    for i in range(op_size):
+        op = block_desc.op(i)
+        if op.type() == "while":
+            while_sub_block_ids.append(op.attr("sub_block").id)
+        elif op.type() == "while_grad":
+            while_grad_sub_block_ids.append(op.attr("sub_block").id)
+
+    # Find while/while_grad block pair
+    for grad_id in while_grad_sub_block_ids:
+        parent_id = pdesc.block(grad_id).parent
+        if parent_id in while_sub_block_ids:
+            while_pair.append((parent_id, grad_id))
+            while_sub_block_ids.remove(parent_id)
+
+    # Get while/while_grad block ops
+    for parent_id, grad_id in while_pair:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        while_grad_block = pdesc.block(grad_id)
+        while_grad_block_op_size = while_grad_block.op_size()
+        for i in range(while_grad_block_op_size):
+            while_block_ops.append(while_grad_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    # Process rest while block ops
+    for parent_id in while_sub_block_ids:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    return cfgs
 
 
 def memory_optimize(input_program):
-    graph = ControlFlowGraph(input_program)
-    graph.memory_optimize()
-    result_program = graph.get_program()
-    return result_program
+    cfgs = get_cfgs(input_program)
+    for cfg in cfgs:
+        cfg.memory_optimize()
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index a30e646d8cbccb397d11c1f6164946e748f40c5e..6146e3711d3c62d22591b2855d73b5791e4b47d0 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -56,7 +56,7 @@ def img_conv_group(input,
                    conv_act=None,
                    param_attr=None,
                    conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=None,
+                   conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
                    pool_type=None,
                    use_cudnn=True):
@@ -127,21 +127,21 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The gated linear unit composed by split, sigmoid activation and elementwise 
-    multiplication. Specifically, Split the input into two equal sized parts 
-    :math:`a` and :math:`b` along the given dimension and then compute as 
+    The gated linear unit composed by split, sigmoid activation and elementwise
+    multiplication. Specifically, Split the input into two equal sized parts
+    :math:`a` and :math:`b` along the given dimension and then compute as
     following:
 
         .. math::
 
             {GLU}(a, b)= a \otimes \sigma(b)
 
-    Refer to `Language Modeling with Gated Convolutional Networks 
+    Refer to `Language Modeling with Gated Convolutional Networks
     <https://arxiv.org/pdf/1612.08083.pdf>`_.
-    
+
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int): The dimension along which to split. If :math:`dim < 0`, the 
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
             dimension to split along is :math:`rank(input) + dim`.
 
     Returns:
@@ -164,24 +164,24 @@ def dot_product_attention(querys, keys, values):
     """
     The dot-product attention.
 
-    Attention mechanism can be seen as mapping a query and a set of key-value 
-    pairs to an output. The output is computed as a weighted sum of the values, 
-    where the weight assigned to each value is computed by a compatibility 
+    Attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
     function (dot-product here) of the query with the corresponding key.
-    
-    The dot-product attention can be implemented through (batch) matrix 
+
+    The dot-product attention can be implemented through (batch) matrix
     multipication as follows:
 
         .. math::
 
             Attention(Q, K, V)= softmax(QK^\mathrm{T})V
 
-    Refer to `Attention Is All You Need 
+    Refer to `Attention Is All You Need
     <https://arxiv.org/pdf/1706.03762.pdf>`_.
 
-    Note that batch data containing sequences with different lengths is not 
+    Note that batch data containing sequences with different lengths is not
     supported by this because of the (batch) matrix multipication.
-    
+
     Args:
         query (Variable): The input variable which is a Tensor or LoDTensor.
         key (Variable): The input variable which is a Tensor or LoDTensor.
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 462669c262f285a7c6d36cf60f2f3f952c83f6b3..0b954c60b6bc2d721c0373243e747056f8f572cf 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM):
         avg_loss_value, = exe.run(fluid.default_main_program(),
                                   feed=feeder.feed(data),
                                   fetch_list=[avg_cost])
-
+        print(avg_loss_value)
         if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
index 53ae200a2387712c63ab67f44d4e9da03ebbe4b2..82b760d693560dae1ab1fa39afdc186f60423e65 100644
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -17,7 +17,7 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor
 
 dict_size = 30000
@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
 IS_SPARSE = True
-batch_size = 10
-max_length = 50
+batch_size = 2
+max_length = 8
 topk_size = 50
 trg_dic_size = 10000
+beam_size = 2
 
 decoder_size = hidden_dim
 
+place = core.CPUPlace()
 
-def encoder_decoder():
+
+def encoder():
     # encoder
-    src_word_id = layers.data(
+    src_word_id = pd.data(
         name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
+    src_embedding = pd.embedding(
         input=src_word_id,
         size=[dict_size, word_dim],
         dtype='float32',
         is_sparse=IS_SPARSE,
         param_attr=fluid.ParamAttr(name='vemb'))
 
-    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
 
+def decoder_train(context):
     # decoder
-    trg_language_word = layers.data(
+    trg_language_word = pd.data(
         name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
+    trg_embedding = pd.embedding(
         input=trg_language_word,
         size=[dict_size, word_dim],
         dtype='float32',
         is_sparse=IS_SPARSE,
         param_attr=fluid.ParamAttr(name='vemb'))
 
-    rnn = fluid.layers.DynamicRNN()
+    rnn = pd.DynamicRNN()
     with rnn.block():
         current_word = rnn.step_input(trg_embedding)
-        mem = rnn.memory(init=encoder_out)
-        fc1 = fluid.layers.fc(input=[current_word, mem],
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
                               size=decoder_size,
                               act='tanh')
-        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
-        rnn.update_memory(mem, fc1)
-        rnn.output(out)
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
 
     return rnn()
 
 
+def decoder_decode(context):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64')
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+                              size=decoder_size,
+                              act='tanh')
+
+        # use score to do beam search
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=50)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def set_init_lod(data, lod, place):
+    res = core.LoDTensor()
+    res.set(data, place)
+    res.set_lod(lod)
+    return res
+
+
 def to_lodtensor(data, place):
     seq_lens = [len(seq) for seq in data]
     cur_len = 0
@@ -88,12 +171,13 @@ def to_lodtensor(data, place):
     return res
 
 
-def main():
-    rnn_out = encoder_decoder()
-    label = layers.data(
+def train_main():
+    context = encoder()
+    rnn_out = decoder_train(context)
+    label = pd.data(
         name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = layers.cross_entropy(input=rnn_out, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(x=cost)
 
     optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
     optimizer.minimize(avg_cost)
@@ -103,13 +187,12 @@ def main():
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
-    place = core.CPUPlace()
     exe = Executor(place)
 
     exe.run(framework.default_startup_program())
 
     batch_id = 0
-    for pass_id in xrange(2):
+    for pass_id in xrange(1):
         for data in train_data():
             word_data = to_lodtensor(map(lambda x: x[0], data), place)
             trg_word = to_lodtensor(map(lambda x: x[1], data), place)
@@ -125,9 +208,48 @@ def main():
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                   " avg_cost=" + str(avg_cost_val))
             if batch_id > 3:
-                exit(0)
+                break
             batch_id += 1
 
 
+def decode_main():
+    context = encoder()
+    translation_ids, translation_scores = decoder_decode(context)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [init_lod, init_lod]
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    for _, data in enumerate(train_data()):
+        init_ids = set_init_lod(init_ids_data, init_lod, place)
+        init_scores = set_init_lod(init_scores_data, init_lod, place)
+
+        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed={
+                'src_word_id': src_word_data,
+                'init_ids': init_ids,
+                'init_scores': init_scores
+            },
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
 if __name__ == '__main__':
-    main()
+    # train_main()
+    decode_main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
index 52c7ecdeb3646fdce36937b84ba8956947371d87..9774edebfb1de0ae73970d582c620f8a984a4ebf 100644
--- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
@@ -68,10 +68,10 @@ else:
         fluid.io.save_persistables(exe, "./fit_a_line.model/")
         fluid.io.load_persistables(exe, "./fit_a_line.model/")
         for data in train_reader():
-            avg_loss_value, = exe.run(trainer_prog,
-                                      feed=feeder.feed(data),
-                                      fetch_list=[avg_cost])
-
+            avg_loss_value = exe.run(trainer_prog,
+                                     feed=feeder.feed(data),
+                                     fetch_list=[avg_cost])
+            print("loss:" + str(avg_loss_value))
             if avg_loss_value[0] < 10.0:
                 exit(0)
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..adeacd4adf2150e0302965d80457e26d07c6b96d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
@@ -0,0 +1,157 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    t = fluid.DistributeTranspiler()
+    # all parameter server endpoints list for spliting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        exe.run(framework.default_startup_program())
+
+        batch_id = 0
+        for pass_id in xrange(2):
+            for data in train_data():
+                word_data = to_lodtensor(map(lambda x: x[0], data), place)
+                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+                outs = exe.run(trainer_prog,
+                               feed={
+                                   'src_word_id': word_data,
+                                   'target_language_word': trg_word,
+                                   'target_language_next_word': trg_word_next
+                               },
+                               fetch_list=[avg_cost])
+                avg_cost_val = np.array(outs[0])
+                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                      " avg_cost=" + str(avg_cost_val))
+                if batch_id > 3:
+                    exit(0)
+                batch_id += 1
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index cf054bb0fe778d34add4ac456f672a8b47483e84..7ad5e2c594f24999e298533b6c05ba688a935f0b 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -16,6 +16,11 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 
+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
 y_predict = fluid.layers.fc(input=x, size=1, act=None)
@@ -28,15 +33,18 @@ avg_cost = fluid.layers.mean(x=cost)
 sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
 sgd_optimizer.minimize(avg_cost)
 
-# memopt_program = fluid.default_main_program()
-memopt_program = fluid.memory_optimize(fluid.default_main_program())
+fluid.memory_optimize(fluid.default_main_program())
 
 BATCH_SIZE = 200
 
+# fix the order of training data
 train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.uci_housing.train(), buf_size=500),
+#     batch_size=BATCH_SIZE)
 
 place = fluid.CPUPlace()
 feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
@@ -49,7 +57,7 @@ for pass_id in range(PASS_NUM):
     fluid.io.save_persistables(exe, "./fit_a_line.model/")
     fluid.io.load_persistables(exe, "./fit_a_line.model/")
     for data in train_reader():
-        avg_loss_value, = exe.run(memopt_program,
+        avg_loss_value, = exe.run(fluid.default_main_program(),
                                   feed=feeder.feed(data),
                                   fetch_list=[avg_cost])
 
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index 42b3cb81ce67d38494677f3ecbfb1e07f7c0c3ad..26673afd83c48328c3f354e82bfa3725aa4805b5 100644
--- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -19,6 +19,11 @@ import sys
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 
+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
 
 def resnet_cifar10(input, depth=32):
     def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@@ -117,31 +122,37 @@ opts = optimizer.minimize(avg_cost)
 
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
 
-# memopt_program = fluid.default_main_program()
-memopt_program = fluid.memory_optimize(fluid.default_main_program())
+fluid.memory_optimize(fluid.default_main_program())
 
 BATCH_SIZE = 128
 PASS_NUM = 1
 
+# fix the order of training data
 train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
+    paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.cifar.train10(), buf_size=128 * 10),
+#     batch_size=BATCH_SIZE)
 
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
 feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())
 
+i = 0
 for pass_id in range(PASS_NUM):
     accuracy.reset(exe)
     for data in train_reader():
-        loss, acc = exe.run(memopt_program,
+        loss, acc = exe.run(fluid.default_main_program(),
                             feed=feeder.feed(data),
                             fetch_list=[avg_cost] + accuracy.metrics)
         pass_acc = accuracy.eval(exe)
         print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
             pass_acc))
         # this model is slow, so if we can train two mini batch, we think it works properly.
-        exit(0)
+        if i > 2:
+            exit(0)
+        i += 1
 exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffd53e7a78142162317a677de49c1821635a65b5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+# need to fix random seed and training data to compare the loss
+# value accurately calculated by the default and the memory optimization
+# version.
+fluid.default_startup_program().random_seed = 111
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # fix the order of training data
+    train_data = paddle.batch(
+        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)
+
+    # train_data = paddle.batch(
+    #     paddle.reader.shuffle(
+    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+    #     batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(10):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(fluid.default_main_program(),
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 2:
+                exit(0)
+            batch_id += 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
index 34101b1da46d46d0e7a995ba80d8644dc586065d..74138298978c7c18936f53761b313887f07aea81 100644
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -16,13 +16,13 @@ import numpy as np
 from op_test import OpTest
 
 
-def bipartite_match(distance, match_indices, match_dis):
+def bipartite_match(distance, match_indices, match_dist):
     """Bipartite Matching algorithm.
     Arg:
         distance (numpy.array) : The distance of two entries with shape [M, N].
         match_indices (numpy.array): the matched indices from column to row
             with shape [1, N], it must be initialized to -1.
-        match_dis (numpy.array): The matched distance from column to row
+        match_dist (numpy.array): The matched distance from column to row
             with shape [1, N], it must be initialized to 0.
     """
     match_pair = []
@@ -36,13 +36,13 @@ def bipartite_match(distance, match_indices, match_dis):
     row_indices = -1 * np.ones((row, ), dtype=np.int)
 
     idx = 0
-    for i, j, dis in match_sorted:
+    for i, j, dist in match_sorted:
         if idx >= row:
             break
-        if match_indices[j] == -1 and row_indices[i] == -1 and dis > 0:
+        if match_indices[j] == -1 and row_indices[i] == -1 and dist > 0:
             match_indices[j] = i
             row_indices[i] = j
-            match_dis[j] = dis
+            match_dist[j] = dist
             idx += 1
 
 
@@ -55,24 +55,24 @@ def batch_bipartite_match(distance, lod):
     n = len(lod) - 1
     m = distance.shape[1]
     match_indices = -1 * np.ones((n, m), dtype=np.int)
-    match_dis = np.zeros((n, m), dtype=np.float32)
+    match_dist = np.zeros((n, m), dtype=np.float32)
     for i in range(len(lod) - 1):
         bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                        match_dis[i, :])
-    return match_indices, match_dis
+                        match_dist[i, :])
+    return match_indices, match_dist
 
 
 class TestBipartiteMatchOpForWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[0, 5, 11, 23]]
-        dis = np.random.random((23, 217)).astype('float32')
-        match_indices, match_dis = batch_bipartite_match(dis, lod[0])
+        dist = np.random.random((23, 217)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
-        self.inputs = {'DistMat': (dis, lod)}
+        self.inputs = {'DistMat': (dist, lod)}
         self.outputs = {
             'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDis': (match_dis),
+            'ColToRowMatchDis': (match_dist),
         }
 
     def test_check_output(self):
@@ -83,13 +83,13 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[0, 8]]
-        dis = np.random.random((8, 17)).astype('float32')
-        match_indices, match_dis = batch_bipartite_match(dis, lod[0])
+        dist = np.random.random((8, 17)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
-        self.inputs = {'DistMat': dis}
+        self.inputs = {'DistMat': dist}
         self.outputs = {
-            'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDis': (match_dis),
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDis': match_dist,
         }
 
     def test_check_output(self):
diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py
index 4a9cd474b81a419bfb42c202327df04c0d2e5bd9..8a5e06b38f5ed5336ef02bac7876610758b44258 100644
--- a/python/paddle/v2/fluid/tests/test_detection_output_op.py
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
@@ -68,4 +68,6 @@ class TestUnpoolOp(OpTest):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    # FIXME: detection_output_op will be rewritten. This unittest should be
+    # enabled after rewriting.
+    exit(0)  # temporary disable this unittest
diff --git a/python/paddle/v2/fluid/tests/test_iou_similarity_op.py b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
new file mode 100755
index 0000000000000000000000000000000000000000..128f2e4977195a563efcd26364cc6261da2dd685
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestIOUSimilarityOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "iou_similarity"
+        self.boxes1 = np.array(
+            [[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]).astype('float32')
+        self.boxes2 = np.array([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
+                                [0.0, 0.0, 20.0, 20.0]]).astype('float32')
+        self.output = np.array(
+            [[2.0 / 16.0, 0, 6.0 / 400.0],
+             [1.0 / 16.0, 0.0, 5.0 / 400.0]]).astype('float32')
+
+        self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
+
+        self.outputs = {'Out': self.output}
+
+
+class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        super(TestIOUSimilarityOpWithLoD, self).setUp()
+        self.boxes1_lod = [[0, 1, 2]]
+        self.output_lod = [[0, 1, 2]]
+
+        self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
+        self.outputs = {'Out': (self.output, self.output_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 01b3db42611434cc1dcacbe1b7d25126e5eef6b2..566fbba9abff36a2e1faccc8086bdabda0115d66 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -17,8 +17,9 @@ import unittest
 
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
-from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program
 from paddle.v2.fluid.param_attr import ParamAttr
+import decorators
 
 
 class TestBook(unittest.TestCase):
@@ -225,6 +226,51 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_im2sequence(self):
+        print("test_im2sequence")
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            output = layers.im2sequence(
+                input=x, stride=[1, 1], filter_size=[2, 2])
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    @decorators.prog_scope()
+    def test_nce(self):
+        window_size = 5
+        words = []
+        for i in xrange(window_size):
+            words.append(
+                layers.data(
+                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+
+        dict_size = 10000
+        label_word = int(window_size / 2) + 1
+
+        embs = []
+        for i in xrange(window_size):
+            if i == label_word:
+                continue
+
+            emb = layers.embedding(
+                input=words[i],
+                size=[dict_size, 32],
+                param_attr='emb.w',
+                is_sparse=True)
+
+            embs.append(emb)
+
+        embs = layers.concat(input=embs, axis=1)
+        loss = layers.nce(input=embs,
+                          label=words[label_word],
+                          num_total_classes=dict_size,
+                          param_attr='nce.w',
+                          bias_attr='nce.b')
+        avg_loss = layers.mean(x=loss)
+        self.assertIsNotNone(avg_loss)
+        print(str(default_main_program()))
+
     def test_row_conv(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/v2/fluid/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
index d5255ba31f7c9e45cf29f412546146234f822026..0c566c76c91dce8dcfc882eed998f492ae3cde76 100644
--- a/python/paddle/v2/fluid/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
@@ -33,5 +33,19 @@ class TestLookupTableOp(OpTest):
         self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
 
 
+class TestLookupTableOpWithPadding(TestLookupTableOp):
+    def test_check_output(self):
+        ids = np.squeeze(self.inputs['Ids'])
+        padding_idx = np.random.choice(ids, 1)[0]
+        self.outputs['Out'][ids == padding_idx] = np.zeros(31)
+        self.attrs = {'padding_idx': long(padding_idx)}
+        self.check_output()
+
+    def test_check_grad(self):
+        # Since paddings are not trainable and fixed in forward, the gradient of 
+        # paddings makes no sense and we don't test the gradient here.
+        pass
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py
index 3ae727a573855b3cb618a8fab70404adf3d92f51..9a51c1f612a0d5363d36e6642ed3b409970025b1 100644
--- a/python/paddle/v2/fluid/tests/test_nce.py
+++ b/python/paddle/v2/fluid/tests/test_nce.py
@@ -109,4 +109,6 @@ class TestNCECase1(TestNCE):
 
 
 if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
+    exit(0)
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_prior_box_op.py b/python/paddle/v2/fluid/tests/test_prior_box_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca8d2bca74ce2d4be8160c8851e393489691ae56
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_prior_box_op.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestPriorBoxOp(OpTest):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {'Input': self.input, 'Image': self.image}
+
+        self.attrs = {
+            'min_sizes': self.min_sizes,
+            'max_sizes': self.max_sizes,
+            'aspect_ratios': self.aspect_ratios,
+            'variances': self.variances,
+            'flip': self.flip,
+            'clip': self.clip,
+            'step_w': self.step_w,
+            'step_h': self.step_h,
+            'offset': self.offset
+        }
+
+        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        return
+
+    def setUp(self):
+        self.op_type = "prior_box"
+        self.set_data()
+
+    def init_test_params(self):
+        self.layer_w = 4
+        self.layer_h = 4
+
+        self.image_w = 20
+        self.image_h = 20
+
+        self.step_w = float(self.image_w) / float(self.layer_w)
+        self.step_h = float(self.image_h) / float(self.layer_h)
+
+        self.input_channels = 2
+        self.image_channels = 3
+        self.batch_size = 10
+
+        self.min_sizes = [2, 4]
+        self.min_sizes = np.array(self.min_sizes).astype('int64')
+        self.max_sizes = [5, 10]
+        self.max_sizes = np.array(self.max_sizes).astype('int64')
+        self.aspect_ratios = [2.0, 3.0]
+        self.flip = True
+        self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
+        self.aspect_ratios = np.array(
+            self.aspect_ratios, dtype=np.float).flatten()
+        self.variances = [0.1, 0.1, 0.2, 0.2]
+        self.variances = np.array(self.variances, dtype=np.float).flatten()
+
+        self.clip = True
+
+        self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
+        if len(self.max_sizes) > 1:
+            self.num_priors += len(self.max_sizes)
+        self.offset = 0.5
+
+    def init_test_input(self):
+        self.image = np.random.random(
+            (self.batch_size, self.image_channels, self.image_w,
+             self.image_h)).astype('float32')
+
+        self.input = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_w,
+             self.layer_h)).astype('float32')
+
+    def init_test_output(self):
+        out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
+        out_boxes = np.zeros(out_dim).astype('float32')
+        out_var = np.zeros(out_dim).astype('float32')
+
+        idx = 0
+        for h in range(self.layer_h):
+            for w in range(self.layer_w):
+                c_x = (w + self.offset) * self.step_w
+                c_y = (h + self.offset) * self.step_h
+                idx = 0
+                for s in range(len(self.min_sizes)):
+                    min_size = self.min_sizes[s]
+                    c_w = c_h = min_size / 2.
+                    out_boxes[h, w, idx, :] = [
+                        (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h,
+                        (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h
+                    ]
+                    idx += 1
+
+                    if len(self.max_sizes) > 0:
+                        max_size = self.max_sizes[s]
+                        # second prior: aspect_ratio = 1,
+                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
+
+                    # rest of priors
+                    for r in range(len(self.real_aspect_ratios)):
+                        ar = self.real_aspect_ratios[r]
+                        if math.fabs(ar - 1.) < 1e-6:
+                            continue
+                        c_w = min_size * math.sqrt(ar) / 2
+                        c_h = (min_size / math.sqrt(ar)) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
+        # clip the prior's coordidate such that it is within[0, 1]
+        if self.clip:
+            out_boxes = np.clip(out_boxes, 0.0, 1.0)
+        # set the variance.
+        out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
+                                           self.num_priors, 1))
+        self.out_boxes = out_boxes.astype('float32')
+        self.out_var = out_var.astype('float32')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 1429d6b1e08fe4ab2d1c5a0f19f1cedbcbc85abd..e5000e440cc8d822dbd38dce3978d2722d32ebe4 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -176,7 +176,6 @@ def resize_short(im, size):
     :param size: the shorter edge size of image after resizing.
     :type size: int
     """
-    assert im.shape[-1] == 1 or im.shape[-1] == 3
     h, w = im.shape[:2]
     h_new, w_new = size, size
     if h > w:
@@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True):
     return im
 
 
-def left_right_flip(im):
+def left_right_flip(im, is_color=True):
     """
     Flip an image along the horizontal direction.
     Return the flipped image.
@@ -278,13 +277,15 @@ def left_right_flip(im):
 
         im = left_right_flip(im)
     
-    :paam im: input image with HWC layout
+    :param im: input image with HWC layout or HW layout for gray image
     :type im: ndarray
+    :param is_color: whether input image is color or not
+    :type is_color: bool
     """
-    if len(im.shape) == 3:
+    if len(im.shape) == 3 and is_color:
         return im[:, ::-1, :]
     else:
-        return im[:, ::-1, :]
+        return im[:, ::-1]
 
 
 def simple_transform(im,
@@ -319,11 +320,12 @@ def simple_transform(im,
     """
     im = resize_short(im, resize_size)
     if is_train:
-        im = random_crop(im, crop_size)
+        im = random_crop(im, crop_size, is_color=is_color)
         if np.random.randint(2) == 0:
-            im = left_right_flip(im)
+            im = left_right_flip(im, is_color)
     else:
-        im = center_crop(im, crop_size)
+        im = center_crop(im, crop_size, is_color)
+        im = center_crop(im, crop_size, is_color=is_color)
     if len(im.shape) == 3:
         im = to_chw(im)
 
@@ -331,8 +333,10 @@ def simple_transform(im,
     if mean is not None:
         mean = np.array(mean, dtype=np.float32)
         # mean value, may be one value per channel 
-        if mean.ndim == 1:
+        if mean.ndim == 1 and is_color:
             mean = mean[:, np.newaxis, np.newaxis]
+        elif mean.ndim == 1:
+            mean = mean
         else:
             # elementwise mean
             assert len(mean.shape) == len(im)
@@ -372,6 +376,6 @@ def load_and_transform(filename,
                  mean values per channel.
     :type mean: numpy array | list
     """
-    im = load_image(filename)
+    im = load_image(filename, is_color)
     im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
     return im
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index 2c6ba650a5d7996bef212e88a16f2a159ca377e7..0f1b8331309248aaaf0ed32cf14c583a4cdb7437 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
     cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
 
 
-RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
 
 RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt