Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into remove_evaluator

cf87f39b · fengjiayi · f8029403 · 457d012a · cf87f39b · cf87f39b
118 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,13 +4,17 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
+services:
+  - docker
 os:
  - linux
 env:
  - JOB=build_doc
  - JOB=check_style
+  - JOB=build_android
 addons:
  apt:
    packages:
@@ -41,8 +45,10 @@ before_install:
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
+    # 43min timeout
+    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;

--- a/AUTHORS.md
+++ b/AUTHORS.md
 | Github account | name |
 |---|---|
+| abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
 | beckett1124 | Bin Qi |
 | JiayiFeng | Jia-Yi Feng |

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,11 +166,11 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")

 set(EXTERNAL_LIBS
-    ${GFLAGS_LIBRARIES}
-    ${GLOG_LIBRARIES}
+    gflags
+    glog
    ${CBLAS_LIBRARIES}
-    ${PROTOBUF_LIBRARY}
-    ${ZLIB_LIBRARIES}
+    protobuf
+    zlib
    ${PYTHON_LIBRARIES}
 )


--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -106,10 +106,10 @@ def vgg16_bn_drop(input):
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
    bn = fluid.layers.batch_norm(input=fc1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
    return fc2



--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -59,7 +59,6 @@ endif(NOT WITH_GOLANG)

 if(NOT WITH_GPU)
    add_definitions(-DHPPL_STUB_FUNC)
-    add_definitions("-DCUPTI_LIB_PATH=\"\"")

    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@@ -77,9 +76,7 @@ else()
    if(CUPTI_FOUND)
        include_directories(${CUPTI_INCLUDE_DIR})
        add_definitions(-DPADDLE_WITH_CUPTI)
-        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
    else()
-        add_definitions("-DCUPTI_LIB_PATH=\"\"")
        message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -28,7 +28,7 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})

 ExternalProject_Add(
-    zlib
+    extern_zlib
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
    GIT_TAG         "v1.2.8"
@@ -49,9 +49,11 @@ ExternalProject_Add(
                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )

+ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+ADD_DEPENDENCIES(zlib extern_zlib)
+
 LIST(APPEND external_project_dependencies zlib)
-ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})

 IF(WITH_C_API)
  INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
  foreach(lib ${libs})
    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()
-  list(REMOVE_DUPLICATES libs_deps)
+  if(libs_deps)
+    list(REMOVE_DUPLICATES libs_deps)
+  endif()

  # To produce a library we need at least one source file.
  # It is created by add_custom_command below and will helps
@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -72,7 +72,7 @@ copy(inference_lib DEPENDS paddle_fluid_shared
 )

 set(module "platform")
-copy(platform_lib
+copy(platform_lib DEPS profiler_py_proto
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
 )

--- a/doc/api/overview.rst
+++ b/doc/api/overview.rst
@@ -7,10 +7,6 @@ it proposes some high-level concepts such as `Layers <http://www.paddlepaddle.or
 A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.

 We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phrase,
-it has several easy to use methods
+it has several easy to use methods to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.

- `paddle.train` 
- `paddle.test`
- `paddle.infer`
-
-to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
+All in all, the V2 API gives a higher abstraction and make PaddlePaddle programs require fiew lines of code.
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -9,7 +9,7 @@
 为了编译PaddlePaddle，我们需要

 1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
-1. Docker
+2. Docker

 不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。


--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -9,7 +9,7 @@ Requirements
 To build PaddlePaddle, you need

 1. A computer -- Linux, Windows, MacOS.
-1. Docker.
+2. Docker.

 Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
 We run all the tools by running this image.

--- a/doc/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
-Install Using pip
+Install using pip
 ================================

 You can use current widely used Python package management
@@ -8,7 +8,7 @@ most of current Linux systems or MacOS.

 .. _pip_install:

-Install Using pip
+Install using pip
 ------------------------------

 Run the following command to install PaddlePaddle on the current

--- a/doc/design/parallel_do.md
+++ b/doc/design/parallel_do.md
@@ -39,15 +39,16 @@ In the backward pass
 This implementation allows to write mixed device program like this

 ```python
-# get embedding feature on CPU
-feature = some_cpu_only_op(data)
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)

-gpu_places = get_place(use_gpu=True)
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
 # parallel processing on multiple GPUs
 pd = ParallelDo(gpu_places)
-with pd.do():
-    read_input(feature)
-    prediction = my_net(feature)
+with pd.do(input=data):
+    prediction = softmax(fc(fc(data, W1), W2))
    write_output(prediction)
 prediction = pd()
 loss = cross_entropy(prediction, label)
@@ -66,20 +67,20 @@ start_program
 main_program
 {
 block0 {
-  vars: data, places, w1, w2
+  vars: data, places, w1, w2, w1_grad, w2_grad,
  ops: data, get_place, parallel_do(block1),
       parallel_do_grad(block2),
       sgd(w2, w2_grad),
       sgd(w1, w1_grad)
 }
-block1 {
+block1 { # the forward pass
  parent_block: 0
  vars: data, h1, h2, loss
  ops: fc, fc, softmax
 }
-block2 {
+block2 { # the backward pass
  parent_block: 1
-  vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
+  vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad
  ops: softmax_grad,
       fc_grad
       fc_grad

--- a/doc/dev/contribute_to_paddle_cn.md
+++ b/doc/dev/contribute_to_paddle_cn.md
@@ -222,6 +222,7 @@ upstream
 ## 提交代码的一些约定

 为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+
 1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
 2. 提交PUll Request前：
   - 请注意commit的数量：
@@ -231,6 +232,7 @@ upstream
 3. 如果解决了某个Issue的问题，请在该PUll Request的**第一个**评论框中加上：`fix #issue_number`，这样当该PUll Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。

 此外，在回复评审人意见时，请您遵守以下约定：
+
 1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
   - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
   - 对评审意见不同意的，请给出您自己的反驳理由。

--- a/doc/dev/index_en.rst
+++ b/doc/dev/index_en.rst
@@ -4,6 +4,5 @@ Development
 ..  toctree::
  :maxdepth: 1

-  new_layer_en.rst
  contribute_to_paddle_en.md
  write_docs_en.rst
--- a/doc/faq/build_and_install/index_en.rst
+++ b/doc/faq/build_and_install/index_en.rst
+############################
+Install, Build and Unit test
+############################
+
+TBD
--- a/doc/faq/cluster/index_en.rst
+++ b/doc/faq/cluster/index_en.rst
+###############################
+Cluster Training and Prediction
+###############################
+
+TBD
--- a/doc/faq/index_en.rst
+++ b/doc/faq/index_en.rst
+FAQ
+====
+
+ 
+..  toctree::
+  :maxdepth: 1
+
+  build_and_install/index_en.rst
+  model/index_en.rst
+  parameter/index_en.rst
+  local/index_en.rst
+  cluster/index_en.rst
--- a/doc/faq/local/index_en.rst
+++ b/doc/faq/local/index_en.rst
+#############################
+Local Training and Prediction
+#############################
+
+TBD
--- a/doc/faq/model/index_en.rst
+++ b/doc/faq/model/index_en.rst
+###################
+Model Configuration
+###################
+
+TBD
--- a/doc/faq/parameter/index_en.rst
+++ b/doc/faq/parameter/index_en.rst
+#################
+Parameter Setting
+#################
+
+TBD
--- a/doc/getstarted/concepts/use_concepts_en.rst
+++ b/doc/getstarted/concepts/use_concepts_en.rst
+Basic Concept
+=============
+TBD
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -5,3 +5,4 @@ GET STARTED
  :maxdepth: 1

  quickstart_en.rst
+  concepts/use_concepts_en.rst
--- a/doc/howto/capi/compile_paddle_lib_en.md
+++ b/doc/howto/capi/compile_paddle_lib_en.md
+## Install and Build
+
+TBD
--- a/doc/howto/capi/index_en.rst
+++ b/doc/howto/capi/index_en.rst
+C-API Prediction Library
+========================
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_en.md
+  organization_of_the_inputs_en.md
+  workflow_of_capi_en.md
--- a/doc/howto/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/capi/organization_of_the_inputs_cn.md
@@ -4,12 +4,15 @@

 ### 输入/输出数据类型
 在C-API中，按照基本数据类型在PaddlePaddle内部的定义和实现，输入数据可分为：
+
 1. 一维整型数组
 1. 二维浮点型矩阵
+
    - 稠密矩阵
    - 稀疏矩阵

 说明：
+
 1. 一维数组**仅支持整型值**；
    - 常用于自然语言处理任务，例如：表示词语在词典中的序号；
    - 分类任务中类别标签；
@@ -274,6 +277,7 @@ PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织
 如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。

 于是，在组织神经网络输入时，需要思考完成以下工作：
+
 1. 为每一个输入/输出创建`argument`。
    - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
 1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。

--- a/doc/howto/capi/organization_of_the_inputs_en.md
+++ b/doc/howto/capi/organization_of_the_inputs_en.md
+## Input/Output Data Organization
+
+TBD
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -11,6 +11,7 @@
 </p>

 - 准备预测模型
+
    1. 只将神经网络结构进行序列化。
        - 只对神经网络结构进行序列化，加载模型需同时指定：网络结构的序列化结果和模型参数存储目录。
    1. 将网络结构定义和训练结束存储下来的模型参数文件（多个）合并入一个文件。
@@ -18,6 +19,7 @@
        - 预测时只需加载一个文件便于发布。
    - **注意**：以上两种方式只需选择其一即可。
 - 调用 C-API 开发预测序
+
    1. 初始化PaddlePaddle运行环境。
    1. 加载预测模型。
    1. 创建神经网络输入，组织输入数据。
@@ -90,6 +92,7 @@
 1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口，与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。

 - 注意事项
+
    1. 使用PaddlePaddle V2 API训练，模型中所有可学习参数会被存为一个压缩文件，需要手动进行解压，将它们放在同一目录中，C-API不会直接加载 V2 API 存储的压缩文件。
    1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件，请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
    1. 通过灵活使用以上两个接口，加载模型可其它多种方式，例如也可在程序运行过程中再加载另外一个模型。
@@ -106,6 +109,7 @@ C-API支持的所有输入数据类型和他们的组织方式，请参考“输
 这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出，使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。

 在组织神经网络输入，获取输出时，需要思考完成以下工作：
+
 1. 为每一个输入/输出创建`argument`；
 1. 为每一个`argument`创建`paddle_matrix`来存储数据；


--- a/doc/howto/capi/workflow_of_capi_en.md
+++ b/doc/howto/capi/workflow_of_capi_en.md
+## C-API Workflow
+
+TBD
--- a/doc/howto/cluster/cmd_argument_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
-## 启动参数说明
+# 启动参数说明

 下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。

-### 启动参数服务器
+## 启动参数服务器
+
 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+
 ```bash
 $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
 ```

 如果希望可以在后台运行pserver程序，并保存输出到一个日志文件，可以运行：
+
 ```bash
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
@@ -20,8 +23,10 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
 - ports_num_for_sparse：**必选，默认0**，用于稀疏类型参数通信的端口个数
 - num_gradient_servers：**必选，默认1**，当前训练任务pserver总数

-### 启动计算节点
+## 启动计算节点
+
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
+
 ```bash
 $ python train.py
 ```
@@ -67,7 +72,7 @@ paddle.init(
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开


-### 准备数据集
+## 准备数据集

 参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)，准备训练数据和验证数据集，我们使用paddle.dataset.imikolov数据集，并根据分布式训练并发数（trainer节点个数），在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。

@@ -84,7 +89,8 @@ for f in flist:
 ```

 示例程序`prepare.py`会把训练集和测试集分别分割成多个文件（例子中为3个，后缀为`-00000`、`-00001`和`-00002`）:
-```
+
+```bash
 train.txt
 train.txt-00000
 train.txt-00001
@@ -99,12 +105,13 @@ test.txt-00002

 对于不同的训练任务，训练数据格式和训练程序的`reader()`会大不相同，所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。

-### 准备训练程序
+## 准备训练程序

 我们会对每个训练任务都会在每个节点上创建一个工作空间（workspace），其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。

 最后，工作空间应如下所示：
-```
+
+```bash
 .
 |-- my_lib.py
 |-- word_dict.pickle
@@ -133,3 +140,21 @@ test.txt-00002

 - `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
 - `test_data_dir`：包含测试数据集的目录。
+
+## 异步 SGD 更新
+
+我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。
+例如，设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数：
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: 是否为异步SGD更新模式。
+- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制，接收到足够的gradient(
+  `async_lagged_grad_discard_ratio * num_gradient_servers`)之后，后面的gradient
+  将会被抛弃。
--- a/doc/howto/cluster/cmd_argument_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
-## Command-line arguments
+# Command-line arguments

 We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.

-### Starting parameter server
+## Starting parameter server

 Type the below command to start a parameter server which will wait for trainers to connect:

 ```bash
-$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
 ```

 If you wish to run parameter servers in background, and save a log file, you can type:
+
 ```bash
-$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
 ```

 Parameter Description
@@ -21,8 +22,10 @@ Parameter Description
 - ports_num: **required, default 1**, total number of ports will listen on.
 - ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
 - num_gradient_servers: **required, default 1**, total number of gradient servers.
+- nics: **optional, default xgbe0,xgbe1**, network device name which paramter server will listen on.
+
+## Starting trainer

-### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")

 ```bash
@@ -70,7 +73,7 @@ Parameter Description
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".

-### Prepare Training Dataset
+## Prepare Training Dataset

 Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.

@@ -88,7 +91,7 @@ for f in flist:

 Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`:

-```
+```bash
 train.txt
 train.txt-00000
 train.txt-00001
@@ -103,13 +106,13 @@ When job started, every trainer needs to get it's own part of data. In some dist

 Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.

-### Prepare Training program
+## Prepare Training program

 We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.

-
 Your workspace may looks like:
-```
+
+```bash
 .
 |-- my_lib.py
 |-- word_dict.pickle
@@ -138,3 +141,21 @@ Your workspace may looks like:

 - `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
 - `test_data_dir`: containing testing data.
+
+## Async SGD Update
+
+We can set some parameters of the optimizer to make it support async SGD update.
+For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: Is Async-SGD or not.
+- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control.
+  when `async_lagged_grad_discard_ratio * num_gradient_servers` commit passed,
+  current async gradient will be discard silently.
--- a/doc/howto/cluster/multi_cluster/fabric_en.md
+++ b/doc/howto/cluster/multi_cluster/fabric_en.md
-# Cluster Training Using Fabric
+# Fabric

 ## Prepare a Linux cluster


--- a/doc/howto/cluster/multi_cluster/k8s_aws_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
-
-# Distributed PaddlePaddle Training on AWS with Kubernetes
+# Kubernetes on AWS

 We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.


--- a/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
+# Kubernetes Distributed
+
+TBD
--- a/doc/howto/cluster/multi_cluster/k8s_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_en.md
-# PaddlePaddle On Kubernetes
+# Kubernetes

 In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.


--- a/doc/howto/cluster/multi_cluster/openmpi_cn.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_cn.md
-# 在OpenMPI集群中提交训练作业
+# 在OpenMPI集群中启动训练

 ## 准备OpenMPI集群


--- a/doc/howto/cluster/multi_cluster/openmpi_en.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_en.md
-# Cluster Training Using OpenMPI
+# OpenMPI

 ## Prepare an OpenMPI cluster


--- a/doc/howto/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -2,10 +2,25 @@

 命令行参数设置
 ===============
+深度学习算法的实现有着多样化的特点，运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。
+
+在这一部分，首先以几个实际场景为例，展示了部分命令行参数的使用:

 ..  toctree::
  :maxdepth: 1

  use_case_cn.md
+
+接着对所有参数的使用场合进行概述和分类:
+
+..  toctree::
+  :maxdepth: 1
+
  arguments_cn.md
+
+最后给出细节描述，详细解释这些参数的属性和意义:
+
+..  toctree::
+  :maxdepth: 1
+
  detail_introduction_cn.md
--- a/doc/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/howto/rnn/hierarchical_layer_en.rst
+Layers supporting hierarchical sequence as input
+================================================
+
+TBD
--- a/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
+API comparision between RNN and hierarchical RNN
+================================================
+
+TBD
--- a/doc/howto/rnn/index_en.rst
+++ b/doc/howto/rnn/index_en.rst
@@ -5,3 +5,6 @@ RNN Models
  :maxdepth: 1

  rnn_config_en.rst
+  recurrent_group_en.md
+  hierarchical_layer_en.rst
+  hrnn_rnn_api_compare_en.rst
--- a/doc/howto/rnn/recurrent_group_en.md
+++ b/doc/howto/rnn/recurrent_group_en.md
+# Recurrent Group Tutorial
+
+TBD
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
  build_and_install/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
+  faq/index_en.rst
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -96,3 +96,6 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
      
 cc_test(channel_test SRCS channel_test.cc)
+cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+        channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -91,6 +91,11 @@ class ChannelHolder {

  inline bool IsInitialized() const { return holder_ != nullptr; }

+  inline const std::type_index Type() {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    return holder_->Type();
+  }
+
 private:
  /**
   * @note    Placeholder hides type T, so it doesn't appear as a template

--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -542,3 +542,341 @@ TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
  ChannelHolderSendReceive(ch);
  delete ch;
 }
+
+TEST(ChannelHolder, ChannelUninitializedTest) {
+  ChannelHolder *ch = new ChannelHolder();
+  EXPECT_EQ(ch->IsInitialized(), false);
+  int i = 10;
+  EXPECT_EQ(ch->Send(&i), false);
+  EXPECT_EQ(ch->Receive(&i), false);
+  bool is_exception = false;
+  try {
+    ch->Type();
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+TEST(ChannelHolder, ChannelInitializedTest) {
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(2);
+  EXPECT_EQ(ch->IsInitialized(), true);
+  // Channel should remain intialized even after close
+  ch->close();
+  EXPECT_EQ(ch->IsInitialized(), true);
+  delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchSendTest) {
+  // Test with unbuffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  bool is_exception = false;
+  bool boolean_data = true;
+  try {
+    ch->Send(&boolean_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+
+  // Test with Buffered Channel
+  ch = new ChannelHolder();
+  ch->Reset<float>(10);
+  is_exception = false;
+  int int_data = 23;
+  try {
+    ch->Send(&int_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchReceiveTest) {
+  // Test with unbuffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  bool is_exception = false;
+  bool float_data;
+  try {
+    ch->Receive(&float_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+
+  // Test with Buffered Channel
+  ch = new ChannelHolder();
+  ch->Reset<float>(10);
+  is_exception = false;
+  int int_data = 23;
+  try {
+    ch->Receive(&int_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  ch->close();
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  if (isBuffered) {
+    // If ch is Buffered, atleast 4 threads must be blocked.
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (!thread_ended[i]) ct++;
+    }
+    EXPECT_GE(ct, 4);
+  } else {
+    // If ch is UnBuffered, all the threads should be blocked.
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly close the thread
+  // This should unblock all senders
+  ch->close();
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  if (isBuffered) {
+    // Verify that only 1 send was successful
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (send_success[i]) ct++;
+    }
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a channelholder unblocks
+//  any receivers waiting on the channel
+TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
+  // Check for buffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderCloseUnblocksReceiversTest(ch);
+  delete ch;
+
+  // Check for unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing a channelholder unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
+  // Check for buffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderCloseUnblocksSendersTest(ch, true);
+  delete ch;
+
+  // Check for unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderCloseUnblocksSendersTest(ch, false);
+  delete ch;
+}
+
+// This tests that destroying a channelholder unblocks
+//  any senders waiting for channel
+void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  if (isBuffered) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (isBuffered) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channelholder also unblocks
+//  any receivers waiting on the channel
+void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderDestroyUnblockReceivers(ch);
+  // ch is already deleted already deleted in
+  // ChannelHolderDestroyUnblockReceivers
+
+  // Check for Unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderDestroyUnblockReceivers(ch);
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderDestroyUnblockSenders(ch, true);
+  // ch is already deleted already deleted in
+  // ChannelHolderDestroyUnblockReceivers
+
+  // Check for Unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderDestroyUnblockSenders(ch, false);
+}
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(go);
+USE_NO_KERNEL_OP(channel_close);
+USE_NO_KERNEL_OP(channel_create);
+USE_NO_KERNEL_OP(channel_recv);
+USE_NO_KERNEL_OP(channel_send);
+USE_NO_KERNEL_OP(elementwise_add);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
+                       T value) {
+  // Create LoDTensor<int> of dim [1,1]
+  auto var = scope.Var(name);
+  auto tensor = var->GetMutable<LoDTensor>();
+  tensor->Resize({1, 1});
+  T *expect = tensor->mutable_data<T>(place);
+  expect[0] = value;
+}
+
+void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+
+  // Create channel variable
+  scope.Var("Channel");
+
+  // Create Variables, x0 will be put into channel,
+  // result will be pulled from channel
+  CreateIntVariable(scope, place, "Status", false);
+  CreateIntVariable(scope, place, "x0", 99);
+  CreateIntVariable(scope, place, "result", 0);
+}
+
+void AddOp(const std::string &type, const VariableNameMap &inputs,
+           const VariableNameMap &outputs, AttributeMap attrs,
+           BlockDesc *block) {
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+TEST(Concurrency, Go_Op) {
+  Scope scope;
+  p::CPUPlace place;
+
+  // Initialize scope variables
+  InitTensorsInScope(scope, place);
+
+  framework::Executor executor(place);
+  ProgramDesc program;
+  BlockDesc *block = program.MutableBlock(0);
+
+  // Create channel OP
+  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
+        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
+        block);
+
+  // Create Go Op routine
+  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
+  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
+        {{"Status", {"Status"}}}, {}, goOpBlock);
+
+  // Create Go Op
+  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
+        block);
+
+  // Create Channel Receive Op
+  AddOp("channel_recv", {{"Channel", {"Channel"}}},
+        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
+
+  // Create Channel Close Op
+  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
+
+  // Check the result tensor to make sure it is set to 0
+  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
+  auto *initialData = tensor.data<int>();
+  EXPECT_EQ(initialData[0], 0);
+
+  executor.Run(program, &scope, 0, true, true);
+
+  // After we call executor.run, the Go operator should do a channel_send to set
+  // the
+  // "result" variable to 99
+  auto *finalData = tensor.data<int>();
+  EXPECT_EQ(finalData[0], 99);
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -26,12 +26,15 @@ Dim<i> make_dim(const int64_t* d) {
 }

 template <>
-Dim<1> make_dim<1>(const int64_t* d) {
-  return Dim<1>(*d);
+Dim<0> make_dim<0>(const int64_t* d) {
+  return Dim<0>(*d);
 }

 void make_ddim(DDim& ddim, const int64_t* dims, int n) {
  switch (n) {
+    case 0:
+      ddim = make_dim<0>(dims);
+      break;
    case 1:
      ddim = make_dim<1>(dims);
      break;
@@ -190,7 +193,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
    this->operator()(t.tail);
  }

-  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
+  void operator()(const Dim<0>& t) {}
 };
 /// @endcond

@@ -247,9 +250,8 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
    }
  }

-  void operator()(const Dim<1>& dim) {
-    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
-    vector.push_back(dim.head);
+  void operator()(const Dim<0>& dim) {
+    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
  }
 };


--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -30,8 +30,8 @@ namespace framework {
 * The number of dimensions must be between [1, 9].
 */
 struct DDim {
-  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                         Dim<8>, Dim<9>>
+  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
+                         Dim<7>, Dim<8>, Dim<9>>
      DDimVar;
  DDimVar var;


--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -72,38 +72,36 @@ struct Dim {

 // Base case specialization
 template <>
-struct Dim<1> {
-  static constexpr int dimensions = 1;
+struct Dim<0> {
+  static constexpr int dimensions = 0;

  HOSTDEVICE
-  Dim(int64_t _head) : head(_head) {}
+  Dim(int64_t _head) {}

  HOSTDEVICE
-  Dim() : head(0) {}
+  Dim() {}

  HOSTDEVICE
-  Dim(int idx, const Dim<1>& size) : head(idx) {
+  Dim(int idx, const Dim<0>& size) {
 #ifndef __CUDA_ARCH__
-    if (idx >= size.head) {
+    if (idx > 0) {
      throw std::invalid_argument("Index out of range.");
    }
 #else
-    PADDLE_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx == 0);
 #endif
  }

  HOSTDEVICE
-  bool operator==(const Dim<1>& o) const { return (head == o.head); }
+  bool operator==(const Dim<0>& o) const { return true; }

  HOSTDEVICE
-  bool operator!=(const Dim<1>& o) const { return !(*this == o); }
+  bool operator!=(const Dim<0>& o) const { return false; }

  HOSTDEVICE
  int64_t& operator[](int idx);
  HOSTDEVICE
  int64_t operator[](int idx) const;
-
-  int64_t head;
 };

 namespace {
@@ -154,15 +152,20 @@ HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
 }

 template <>
-HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx != 0) {
-    throw std::invalid_argument("Invalid index");
-  }
+  throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(idx == 0);
+  PADDLE_ASSERT(false);
 #endif
-  return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
+  static int64_t head = 0;
+#endif
+  return head;
 }

 template <int D>
@@ -181,15 +184,20 @@ HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
 }

 template <>
-HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx != 0) {
-    throw std::invalid_argument("Invalid index");
-  }
+  throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(idx == 0);
+  PADDLE_ASSERT(false);
 #endif
-  return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
+  static int64_t head = 0;
+#endif
+  return head;
 }

 }  // namespace
@@ -218,12 +226,12 @@ HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
 }

 // Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
  return indexer(*this, i);
 }

 // Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
+inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
  return indexer(*this, i);
 }

@@ -251,8 +259,8 @@ HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
 // Base case dot product of two Dims
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
-  return a.head * b.head;
+HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
+  return 0;
 }

 // Product of a Dim
@@ -264,8 +272,8 @@ HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
 // Base case product of a Dim
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
-  return prod * a.head;
+HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
+  return prod;
 }

 // Is 0 <= idx_i < size_i for all i?
@@ -278,8 +286,8 @@ HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
 // Base case of is 0 <= idx_i < size_i ?
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
-  return ((0 <= idx.head) && (idx.head < size.head));
+HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
+  return true;
 }

 /**
@@ -294,8 +302,8 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
 // Base case of ex_prefix_mul
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
-  return Dim<1>(mul);
+HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
+  return Dim<0>();
 }
 ///\endcond

@@ -309,8 +317,8 @@ HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {

 // Base case
 template <>
-HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) {
-  return Dim<1>(a.head + b.head);
+HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
+  return Dim<0>();
 }

 template <int i>
@@ -328,8 +336,8 @@ HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {

 // Base case
 template <>
-HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) {
-  return Dim<1>(a.head * b.head);
+HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
+  return Dim<0>();
 }

 template <int i>
@@ -356,10 +364,9 @@ HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
 ///\cond HIDDEN

 template <>
-HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
-                                           const Dim<1>& stride) {
-  int norm_stride = size.head == 1 ? 0 : stride.head;
-  return Dim<1>(norm_stride);
+HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
+                                           const Dim<0>& stride) {
+  return Dim<0>();
 }

 ///\endcond
@@ -394,6 +401,10 @@ typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
  return os;
 }

+inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
+  return os;
+}
+
 template <int i>
 HOST std::string Dim<i>::to_string() const {
  std::stringstream stream;

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -117,6 +117,7 @@ message VarType {
    // raw variables should manage their own allocations
    // in operators like nccl_op
    RAW = 17;
+    TUPLE = 18;
  }

  required Type type = 1;
@@ -148,6 +149,9 @@ message VarType {
    required int64 capacity = 2;
  }
  optional ChannelDesc channel = 6;
+
+  message Tuple { repeated Type element_type = 1; }
+  optional Tuple tuple = 7;
 }

 message VarDesc {

--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+typedef boost::variant<int, int64_t, float, double, std::string, Tensor,
+                       LoDTensor /*, ChannelHolder*/>
+    ElementVar;
+
+class Tuple {
+ public:
+  using ElementVars = std::vector<ElementVar>;
+
+  Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
+      : var_(var), var_desc_(var_desc) {}
+  Tuple(std::vector<ElementVar>& var) : var_(var) {}
+
+  ElementVar get(int idx) const { return var_[idx]; };
+
+  ElementVar& get(int idx) { return var_[idx]; };
+
+  bool isSameType(Tuple& t) const;
+
+  size_t getSize() const { return var_.size(); };
+
+ private:
+  ElementVars var_;
+  std::vector<VarDesc> var_desc_;
+};
+
+bool Tuple::isSameType(Tuple& t) const {
+  size_t tuple_size = getSize();
+  if (tuple_size != t.getSize()) {
+    return false;
+  }
+  for (size_t j = 0; j < tuple_size; ++j) {
+    auto type1 = get(j).which();
+    auto type2 = t.get(j).which();
+    if (type1 != type2) return false;
+  }
+  return true;
+}
+
+Tuple* make_tuple(std::vector<ElementVar> tuple) { return new Tuple(tuple); }
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/tuple_test.cc
+++ b/paddle/fluid/framework/tuple_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <sstream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tuple.h"
+
+TEST(Tuple, Make) {
+  std::vector<paddle::framework::ElementVar> element_type;
+  element_type.push_back(12);
+  element_type.push_back(12.0f);
+  element_type.push_back("ElementVar");
+
+  paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
+
+  EXPECT_EQ(boost::get<int>(tuple->get(0)), 12);
+  EXPECT_EQ(boost::get<float>(tuple->get(1)), 12.0f);
+  EXPECT_EQ(boost::get<std::string>(tuple->get(2)), "ElementVar");
+
+  delete tuple;
+}
+
+TEST(Tuple, IsTheSameType) {
+  std::vector<paddle::framework::ElementVar> element_type1;
+  std::vector<paddle::framework::ElementVar> element_type2;
+  std::vector<paddle::framework::ElementVar> element_type3;
+
+  element_type1.push_back(12);
+  element_type1.push_back(12.0f);
+  element_type1.push_back("Tuple1");
+
+  element_type2.push_back(13);
+  element_type2.push_back(13.0f);
+  element_type2.push_back("Tuple2");
+
+  element_type3.push_back(14.0f);
+  element_type3.push_back(14);
+  element_type3.push_back("Tuple3");
+
+  paddle::framework::Tuple* tuple1 =
+      paddle::framework::make_tuple(element_type1);
+  paddle::framework::Tuple* tuple2 =
+      paddle::framework::make_tuple(element_type2);
+  paddle::framework::Tuple* tuple3 =
+      paddle::framework::make_tuple(element_type3);
+
+  EXPECT_TRUE(tuple1->isSameType(*tuple2));
+  EXPECT_FALSE(tuple1->isSameType(*tuple3));
+
+  delete tuple1;
+  delete tuple2;
+  delete tuple3;
+}
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -130,7 +130,7 @@ endif()

 if(WITH_DISTRIBUTE)
    add_subdirectory(detail)
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("ColToRowMatchIndices", dims);
    ctx->SetOutputDim("ColToRowMatchDist", dims);
  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
+        platform::CPUPlace());
+  }
 };

 template <typename T>

--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -37,12 +37,19 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                      "The rank of Input of PriorBoxVar must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
-    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                      "The rank of Input of TargetBox must be 2");
-    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                      "The shape of TargetBox is [M, 4]");

-    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                        "The rank of Input of TargetBox must be 2");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                        "The shape of TargetBox is [M, 4]");
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+                        "The rank of Input of TargetBox must be 3");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+    }

    ctx->SetOutputDim(
        "OutputBox",
@@ -70,25 +77,28 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
             "of variance.");
    AddInput(
        "TargetBox",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
-        "[xmin, ymin] is the left top coordinate of the box if the input "
-        "is image feature map, they are close to the origin of the coordinate "
-        "system. [xmax, ymax] is the right bottom coordinate of the box. "
-        "This tensor can contain LoD information to represent a batch "
-        "of inputs. One instance of this batch can contain different "
-        "numbers of entities.");
+        "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
+        "[N, 4] when code_type is 'encode_center_size'. This input also can "
+        "be a 3-D Tensor with shape [N, M, 4] when code_type is "
+        "'decode_center_size'. [N, 4], each box is represented as "
+        "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
+        "of the box if the input is image feature map, they are close to "
+        "the origin of the coordinate system. [xmax, ymax] is the right "
+        "bottom coordinate of the box. This tensor can contain LoD "
+        "information to represent a batch of inputs. One instance of this "
+        "batch can contain different numbers of entities.");
    AddAttr<std::string>("code_type",
                         "(string, default encode_center_size) "
                         "the code type used with the target box")
        .SetDefault("encode_center_size")
        .InEnum({"encode_center_size", "decode_center_size"});
-    AddOutput(
-        "OutputBox",
-        "(LoDTensor or Tensor) "
-        "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
-        "representing the result of N target boxes encoded/decoded with "
-        "M Prior boxes and variances.");
+    AddOutput("OutputBox",
+              "(LoDTensor or Tensor) "
+              "When code_type is 'encode_center_size', the output tensor of "
+              "box_coder_op with shape [N, M, 4] representing the result of N "
+              "target boxes encoded with M Prior boxes and variances. When "
+              "code_type is 'decode_center_size', N represents the batch size "
+              "and M represents the number of deocded boxes.");

    AddComment(R"DOC(
 Bounding Box Coder Operator.

--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
@@ -66,7 +66,6 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                       T* output) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < row * col) {
-    const int row_idx = idx / col;
    const int col_idx = idx % col;
    T prior_box_width =
        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
@@ -79,17 +78,16 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                           2;

    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
-                             target_box_data[row_idx * len + 2]) *
+                             target_box_data[idx * len + 2]) *
                         prior_box_width;
    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
-                              target_box_data[row_idx * len + 3]) *
+                              target_box_data[idx * len + 3]) *
                          prior_box_height;
    T target_box_center_x = prior_box_var_data[col_idx * len] *
-                                target_box_data[row_idx * len] *
-                                prior_box_width +
+                                target_box_data[idx * len] * prior_box_width +
                            prior_box_center_x;
    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
-                                target_box_data[row_idx * len + 1] *
+                                target_box_data[idx * len + 1] *
                                prior_box_height +
                            prior_box_center_y;


--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
@@ -89,6 +89,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {

    for (int64_t i = 0; i < row; ++i) {
      for (int64_t j = 0; j < col; ++j) {
+        size_t offset = i * col * len + j * len;
        T prior_box_width =
            prior_box_data[j * len + 2] - prior_box_data[j * len];
        T prior_box_height =
@@ -99,20 +100,19 @@ class BoxCoderKernel : public framework::OpKernel<T> {
            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;

        T target_box_center_x = prior_box_var_data[j * len] *
-                                    target_box_data[i * len] * prior_box_width +
+                                    target_box_data[offset] * prior_box_width +
                                prior_box_center_x;
        T target_box_center_y = prior_box_var_data[j * len + 1] *
-                                    target_box_data[i * len + 1] *
+                                    target_box_data[offset + 1] *
                                    prior_box_height +
                                prior_box_center_y;
        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                      target_box_data[i * len + 2]) *
+                                      target_box_data[offset + 2]) *
                             prior_box_width;
        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                       target_box_data[i * len + 3]) *
+                                       target_box_data[offset + 3]) *
                              prior_box_height;

-        size_t offset = i * col * len + j * len;
        output[offset] = target_box_center_x - target_box_width / 2;
        output[offset + 1] = target_box_center_y - target_box_height / 2;
        output[offset + 2] = target_box_center_x + target_box_width / 2;

--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace pf = paddle::framework;
+static constexpr char kChannel[] = "Channel";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCloseOp : public framework::OperatorBase {
+ public:
+  ChannelCloseOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto &inp = *scope.FindVar(Input(kChannel));
+
+    // Get the mutable version of the channel variable and closes it.
+    pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
+    ch->close();
+  }
+};
+
+class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("Channel"),
+                   "The input of ChannelClose op must be set");
+  }
+};
+
+class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kChannel,
+             "The Channel Variable that should be closed by"
+             " the ChannelClose Op.");
+    AddComment(R"DOC(
+Channel Close Operator.
+
+This operator closes an open channel.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelCloseOpMaker);
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace pf = paddle::framework;
+
+static constexpr char kOutput[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCreateOp : public framework::OperatorBase {
+ public:
+  ChannelCreateOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto &out = *scope.FindVar(Output(kOutput));
+
+    // Determine the datatype and capacity of the channel to be created
+    // from the attributes provided.
+    auto dtype =
+        static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
+    auto capacity = Attr<int>("capacity");
+
+    // Based on the datatype, create a new channel holder initialized with
+    // the given capacity. When capacity is 0, an unbuffered channel is
+    // created.
+    pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
+    if (dtype == framework::proto::VarType::LOD_TENSOR) {
+      ch->Reset<pf::LoDTensor>(capacity);
+    } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
+      ch->Reset<pf::SelectedRows>(capacity);
+    } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
+      ch->Reset<pf::LoDRankTable>(capacity);
+    } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+      ch->Reset<pf::LoDTensorArray>(capacity);
+    } else if (dtype == framework::proto::VarType::READER) {
+      ch->Reset<pf::ReaderHolder>(capacity);
+    } else if (dtype == framework::proto::VarType::CHANNEL) {
+      ch->Reset<pf::ChannelHolder>(capacity);
+    } else if (dtype == framework::proto::VarType::BOOL) {
+      ch->Reset<bool>(capacity);
+    } else if (dtype == framework::proto::VarType::INT32) {
+      ch->Reset<int>(capacity);
+    } else if (dtype == framework::proto::VarType::INT64) {
+      ch->Reset<int64_t>(capacity);
+    } else if (dtype == framework::proto::VarType::FP32) {
+      ch->Reset<float>(capacity);
+    } else if (dtype == framework::proto::VarType::FP64) {
+      ch->Reset<double>(capacity);
+    } else {
+      PADDLE_THROW(
+          "Data type %d is not in "
+          "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
+          "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
+          dtype);
+    }
+  }
+};
+
+class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasOutput(kOutput),
+                   "The output of ChannelCreate op must be set");
+    context->SetOutputDim(kOutput, {1});
+  }
+};
+
+class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput(kOutput,
+              "The object of a Channel type created by ChannelCreate Op.");
+    AddAttr<int>("capacity", "The size of the buffer of Channel.")
+        .SetDefault(0);
+    AddAttr<int>("data_type", "The data type of elements inside the Channel.");
+    AddComment(R"DOC(
+Channel Create Operator.
+
+This operator creates an object of the VarType Channel and returns it.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelCreateOpMaker);
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include <paddle/fluid/framework/lod_rank_table.h>
+#include <paddle/fluid/framework/lod_tensor_array.h>
+#include <paddle/fluid/framework/reader.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char Status[] = "Status";
+static constexpr char Out[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+void SetReceiveStatus(const platform::Place &dev_place,
+                      framework::Variable &status_var, bool status) {
+  auto cpu = platform::CPUPlace();
+  auto status_tensor =
+      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                        cpu);
+  status_tensor[0] = status;
+}
+
+bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
+  // Get type of channel and use that to call mutable data for Variable
+  auto type = framework::ToVarType(ch->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    return ch->Receive(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    return ch->Receive(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelReceive:Unsupported type");
+}
+
+class ChannelRecvOp : public framework::OperatorBase {
+ public:
+  ChannelRecvOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const {
+    PADDLE_ENFORCE(ctx->HasInput(Channel),
+                   "Input(Channel) of ChannelRecvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Out),
+                   "Input(Channel) of ChannelRecvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Status),
+                   "Output(Status) of ChannelRecvOp should not be null.");
+    ctx->SetOutputDim("Status", {1});
+  }
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    // Get the channel holder created by channel_create op, passed as input.
+    framework::ChannelHolder *ch =
+        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
+    auto output_var = scope.FindVar(Output(Out));
+    // Receive the data from the channel.
+    bool ok = ChannelReceive(ch, output_var);
+
+    // Set the status output of the `ChannelReceive` call.
+    SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+  }
+};
+
+class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(Channel,
+             "(Channel) A variable which \"receives\" the a value sent"
+             "to it by a channel_send op.")
+        .AsDuplicable();
+    AddOutput(Out,
+              "(Variable) Output Variable that will hold the data received"
+              " from the Channel")
+        .AsDuplicable();
+    AddOutput(Status,
+              "(Tensor) An LoD Tensor that returns a boolean status of the"
+              "result of the receive operation.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelRecvOpMaker);
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include <paddle/fluid/framework/lod_rank_table.h>
+#include <paddle/fluid/framework/lod_tensor_array.h>
+#include <paddle/fluid/framework/reader.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char X[] = "X";
+static constexpr char Status[] = "Status";
+static constexpr char copy[] = "copy";
+
+namespace paddle {
+namespace operators {
+
+void SetSendStatus(const platform::Place &dev_place,
+                   framework::Variable &status_var, bool status) {
+  auto cpu = platform::CPUPlace();
+  auto status_tensor =
+      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                        cpu);
+  status_tensor[0] = status;
+}
+
+bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
+  auto type = framework::ToVarType(var->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    return ch->Send(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    return ch->Send(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    return ch->Send(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    return ch->Send(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    return ch->Send(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    return ch->Send(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelSend:Unsupported type");
+}
+
+class ChannelSendOp : public framework::OperatorBase {
+ public:
+  ChannelSendOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const {
+    PADDLE_ENFORCE(ctx->HasInput(Channel),
+                   "Input(Channel) of ChannelSendOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(X),
+                   "Input(X) of ChannelSendOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Status),
+                   "Output(Status) of ChannelSendOp should not be null.");
+    ctx->SetOutputDim("Status", {1});
+  }
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    // Get the channel holder created by channel_create op, passed as input.
+    framework::ChannelHolder *ch =
+        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
+    auto input_var = scope.FindVar(Input(X));
+
+    // Send the input data through the channel.
+    bool ok = ChannelSend(ch, input_var);
+
+    // Set the status output of the `ChannelSend` call.
+    SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+  }
+};
+
+class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(Channel,
+             "(Channel) A variable which \"sends\" the passed in value to "
+             "a listening receiver.")
+        .AsDuplicable();
+    AddInput(X, "(Variable) The value which gets sent by the channel.")
+        .AsDuplicable();
+    AddOutput(Status,
+              "(Tensor) An LoD Tensor that returns a boolean status of the"
+              "result of the send operation.")
+        .AsDuplicable();
+    AddAttr<bool>(copy, "(bool, default false) Should copy before send")
+        .SetDefault(false);
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelSendOpMaker);
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -24,6 +24,29 @@ namespace detail {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor;

+template <typename T>
+struct StridedMemcpyFunctor<T, 0> {
+  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
+                  framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
+                  framework::Dim<0> dst_stride, T* dst) const {
+    auto place = dev_ctx.GetPlace();
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
+      auto& cuda_ctx =
+          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
+                   cuda_ctx.stream());
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+    }
+  }
+};
+
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
  void operator()(const platform::DeviceContext& dev_ctx, const T* src,

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -65,12 +65,17 @@ smaller than or equal to the dimensions of $X$.

 There are two cases for this operator:
 1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a subset of $X$.
+2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
+   of size 1 for $Y$ will be ignored for the consideration of subsequence.
+

 For case 2:
+
 $Y$ will be broadcasted to match the shape of $X$ and axis should be
 set to index of the start dimension to broadcast $Y$ onto $X$.

+If axis is -1, it is treated as axis=rank(X)-rank(Y).
+
 For example
  .. code-block:: python

@@ -79,6 +84,7 @@ For example
    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0

 Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
 information. However, the output only shares the LoD information with input $X$.

--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #ifdef __NVCC__
 #include <thrust/iterator/iterator_adaptor.h>
+#include "paddle/fluid/platform/cuda_helper.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif

@@ -61,6 +62,19 @@ inline void get_mid_dims(const framework::DDim& x_dims,
  }
 }

+inline void trim_trailing_singular_dims(framework::DDim& dims) {
+  // Remove trailing dimensions of size 1 for y
+  auto actual_dims_size = dims.size();
+  for (; actual_dims_size != 0; --actual_dims_size) {
+    if (dims[actual_dims_size - 1] != 1) break;
+  }
+  if (actual_dims_size != dims.size()) {
+    auto actual_dims = framework::vectorize(dims);
+    actual_dims.resize(actual_dims_size);
+    dims = framework::make_ddim(actual_dims);
+  }
+}
+
 template <typename T, typename DeviceContext>
 class RowwiseTransformIterator;
 template <typename T, typename DeviceContext>
@@ -263,44 +277,6 @@ class TransformFunctor {
    }                                                                          \
  }

-template <class functor, typename DeviceContext, typename T>
-void ElementwiseCompute(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<T>(ctx.GetPlace());
-
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                    "Rank of first input must >= rank of second input.");
-
-  if (x_dims == y_dims) {
-    functor f;
-    f.template Run<DeviceContext, T>(x, y, z, ctx);
-    return;
-  }
-
-  int axis = ctx.Attr<int>("axis");
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                 "Axis should be in range [0, x_dims)");
-
-  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-  if (post == 1) {
-    functor f;
-    f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
-    return;
-  } else {
-    functor f;
-    f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
-    return;
-  }
-}
-
 #define EIGEN_ADD(x, y) ((x) + (y))
 EIGEN_FUNCTOR(Add, EIGEN_ADD);

@@ -361,13 +337,10 @@ template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
    const T* x, const T* y, const T* out, const T* dout, int h, int w,
    DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-
  int j = blockIdx.x;
  int i = threadIdx.x;
  int tid = threadIdx.x;
-  shm[tid] = 0;
+  T val = 0;

  do {
    int x_offset = i * w + j;
@@ -375,22 +348,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
      dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
    }
    if (dy) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
    }
    i += ELEMWISE_MAX_BLOCK_DIM;
  } while (i < h);

  if (dy) {
-    __syncthreads();
-
    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
+    val = platform::reduceSum(val, tid, h);
    if (threadIdx.x == 0) {
-      for (int k = 1; k < h; ++k) {
-        shm[0] += shm[k];
-      }
-      dy[j] = shm[0];
+      dy[j] = val;
    }
  }
 }
@@ -402,10 +369,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
                                       T* dx, T* dy) {
  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
  int gird_size = w;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, h, w, dx_op,
-                                               dy_op, dx, dy);
+  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
 }

 #endif
@@ -436,7 +401,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
 }

 #ifdef __NVCC__
-
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast2CUDAKernel(
    const T* x, const T* y, const T* out, const T* dout, int pre, int n,
@@ -444,9 +408,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
  int tid = threadIdx.x;
  int j = blockIdx.x;

-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-  shm[tid] = 0;
+  T val = 0;
  int ttid = tid;

  while (true) {
@@ -461,23 +423,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
    }

    if (dy != nullptr) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
    }

    ttid += ELEMWISE_MAX_BLOCK_DIM;
  }

  if (dy) {
-    __syncthreads();
    int h = pre * post;
    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
-    if (tid == 0) {
-      for (int i = 1; i < h; ++i) {
-        shm[0] += shm[i];
-      }
-      dy[j] = shm[0];
+    val = platform::reduceSum(val, tid, h);
+    if (threadIdx.x == 0) {
+      dy[j] = val;
    }
  }
 }
@@ -489,10 +446,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
                                       DY_OP dy_op, T* dx, T* dy) {
  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
  int gird_size = n;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, pre, n, post,
-                                               dx_op, dy_op, dx, dy);
+  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
 }

 #endif
@@ -516,14 +471,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
    auto x_dim = x.dims();
    auto y_dim = y.dims();

-    if (y_dim.size() == 1 && y_dim[0] == 1) {
-      // y is a scalar
-      auto extended_dims = framework::vectorize(x_dim);
-      extended_dims.push_back(1);
-      x_dim = framework::make_ddim(extended_dims);
-    }
-
    axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
+    trim_trailing_singular_dims(y_dim);
+    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
    int pre, n, post;
    get_mid_dims(x_dim, y_dim, axis, pre, n, post);
    if (post == 1) {
@@ -591,14 +542,9 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
    return;
  }

-  if (y_dims.size() == 1 && y_dims[0] == 1) {
-    // y is a scalar
-    auto extended_dims = framework::vectorize(x_dims);
-    extended_dims.push_back(1);
-    x_dims = framework::make_ddim(extended_dims);
-  }
-
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  trim_trailing_singular_dims(y_dims);
+  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post;
  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
@@ -633,16 +579,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
    return;
  }

-  if (y_dims.size() == 1 && y_dims[0] == 1) {
-    // y is a scalar
-    auto extended_dims = framework::vectorize(x_dims);
-    extended_dims.push_back(1);
-    x_dims = framework::make_ddim(extended_dims);
-  }
-
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                 "Axis should be in range [0, x_dims)");
+  trim_trailing_singular_dims(y_dims);
+  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post;
  get_mid_dims(x_dims, y_dims, axis, pre, n, post);

--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector<framework::Scope *>;
+
+static constexpr char kBlock[] = "sub_block";
+static constexpr char kX[] = "X";
+
+class GoOp : public framework::OperatorBase {
+ public:
+  GoOp(const std::string &type, const framework::VariableNameMap &inputs,
+       const framework::VariableNameMap &outputs,
+       const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void ExecuteOnThread(framework::Executor *executor,
+                       framework::BlockDesc *block,
+                       framework::Scope *scope) const {
+    framework::ProgramDesc *program = block->Program();
+    executor->Run(*program, scope, block->ID(), false /*create_local_scope*/);
+  }
+
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    /*
+     * Determine the global scope. Create a new child scope.
+     * Within the child scope, add all the local variables relevant
+     * to that scope.
+     *
+     * Now go through all the inputs to the op to ensure that
+     * all of them are in the newly created scope. This is important
+     * to ensure that they don't get destroyed when the parent scope
+     * is deleted.
+     * */
+
+    // TODO(varunarora): Consider moving this root scope lookup to scope.h.
+    const framework::Scope *root_scope = &scope;
+    const framework::Scope *parent_scope = &(root_scope->parent());
+
+    while (parent_scope != nullptr) {
+      root_scope = parent_scope;
+      parent_scope = &(parent_scope->parent());
+    }
+
+    framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
+    framework::Executor executor(dev_place);
+    framework::Scope &new_scope = root_scope->NewScope();
+
+    for (auto &var : block->AllVars()) {
+      new_scope.Var(var->Name());
+    }
+
+    auto &inputs = Inputs(kX);
+    for (size_t i = 0; i < inputs.size(); i++) {
+      PADDLE_ENFORCE_NOT_NULL(new_scope.FindVar(inputs.at(i)),
+                              "All variables used in the go block "
+                              "should be created in the global scope");
+    }
+
+    // Now execute the go op with the newly created scope.
+    std::thread go_thread([dev_place, block, &new_scope, this]() {
+      framework::Executor executor(dev_place);
+      ExecuteOnThread(&executor, block, &new_scope);
+    });
+    go_thread.detach();
+  }
+};
+
+class GoOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kX,
+             "A set of variables, which are required by operators inside the "
+             "block of Go Op.")
+        .AsDuplicable();
+    AddAttr<framework::BlockDesc *>(kBlock, "The block inside GoOp");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+// TODO(thuan): Look into Gradient Operator for GO_OP
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(go, paddle::operators::GoOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::GoOpMaker);
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -247,7 +247,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };


--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
    return framework::OpKernelType(
        framework::ToDataType(
            ctx.Input<framework::LoDTensor>("Scores")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
  }
 };


--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        platform::CPUPlace());
+  }
 };

 class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -59,6 +59,16 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
  }
 };

+class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
+    }
+  }
+};
+
 class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
@@ -80,7 +90,8 @@ class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
                  ops::SplitSelectedRowsOpMaker,
-                  ops::SplitSelectedRowsGradMaker);
+                  ops::SplitSelectedRowsGradMaker,
+                  ops::SplitSelectedRowsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
    split_selected_rows,
    ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
 proto_library(profiler_proto SRCS profiler.proto)
+py_proto_compile(profiler_py_proto SRCS profiler.proto)
+
+add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+
+add_dependencies(profiler_py_proto profiler_py_proto_init)
+
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 if(WITH_GPU)
  cc_library(enforce SRCS enforce.cc DEPS)

--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif

+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the
+  // parameters of the GPU but not specified as 32 simply.
+  // To make the reduceSum more efficiently,
+  // I use Warp-Level Parallelism and assume the Warp size
+  // is 32 which may be different for different GPU,
+  // but most card's warp size is 32.
+  __shared__ T shm[32];
+  const int warpSize = 32;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += __shfl_down_sync(mask, val, offset);
+  }
+
+  return val;
+}
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/device_tracer.h"
+#include <google/protobuf/text_format.h>
+#include <fstream>
 #include <map>
 #include <mutex>
+#include <numeric>
+#include <thread>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
@@ -51,6 +55,36 @@ uint64_t kAlignSize = 8;
    }                                                                      \
  } while (0)

+std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
+  switch (kind) {
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+      return "MEMCPY_HtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+      return "MEMCPY_DtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+      return "MEMCPY_HtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+      return "MEMCPY_AtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
+      return "MEMCPY_AtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+      return "MEMCPY_AtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+      return "MEMCPY_DtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+      return "MEMCPY_DtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+      return "MEMCPY_HtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
+      return "MEMCPY_PtoP";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
+      return "MEMCPY_FORCE_INT";
+    default:
+      break;
+  }
+  return "MEMCPY";
+}
+
 void EnableActivity() {
  // Device activity record is created when CUDA initializes, so we
  // want to enable it before cuInit() or any CUDA runtime call.
@@ -107,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                                     kernel->correlationId);
            break;
          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
          default: { break; }
        }
      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -137,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
    correlations_[id] = anno;
  }

+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    cpu_records_.push_back(
+        CPURecord{anno, start_ns, end_ns,
+                  std::hash<std::thread::id>{}(std::this_thread::get_id())});
+  }
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {
+    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
+                                     stream_id, correlation_id, bytes});
+  }
+
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {
    std::lock_guard<std::mutex> l(trace_mu_);
@@ -172,17 +240,15 @@ class DeviceTracerImpl : public DeviceTracer {
    CUPTI_CALL(
        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
-
    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
    enabled_ = true;
  }

-  proto::Profile GenProfile() {
+  proto::Profile GenProfile(const std::string &profile_path) {
    std::lock_guard<std::mutex> l(trace_mu_);
    proto::Profile profile_pb;
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
-    std::map<std::string, std::vector<uint64_t>> event_times;
    for (const KernelRecord &r : kernel_records_) {
      if (correlations_.find(r.correlation_id) == correlations_.end()) {
        fprintf(stderr, "cannot relate a kernel activity\n");
@@ -194,15 +260,31 @@ class DeviceTracerImpl : public DeviceTracer {
      event->set_end_ns(r.end_ns);
      event->set_stream_id(r.stream_id);
      event->set_device_id(r.device_id);
-      event_times[event->name()].push_back(r.end_ns - r.start_ns);
    }
-    for (const auto &et : event_times) {
-      fprintf(
-          stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
-          et.first.c_str(),
-          std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0,
-          et.second.size());
+
+    for (const CPURecord &r : cpu_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.thread_id);
+      event->set_device_id(-1);
    }
+    for (const MemRecord &r : mem_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event->mutable_memcopy()->set_bytes(r.bytes);
+    }
+    std::string profile_str;
+    google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
+    std::ofstream profile_f;
+    profile_f.open(profile_path, std::ios::out | std::ios::trunc);
+    profile_f << profile_str;
+    profile_f.close();
    return profile_pb;
  }

@@ -240,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
  uint64_t start_ns_;
  uint64_t end_ns_;
  std::vector<KernelRecord> kernel_records_;
+  std::vector<MemRecord> mem_records_;
+  std::vector<CPURecord> cpu_records_;
  std::unordered_map<uint32_t, std::string> correlations_;
  CUpti_SubscriberHandle subscriber_;
 };
@@ -252,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {

  void AddAnnotation(uint64_t id, const std::string &anno) {}

+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {}
+
  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                        uint32_t stream_id, uint32_t correlation_id) {}

@@ -259,7 +349,9 @@ class DeviceTracerDummy : public DeviceTracer {

  void Enable() {}

-  proto::Profile GenProfile() { return proto::Profile(); }
+  proto::Profile GenProfile(const std::string &profile_path) {
+    return proto::Profile();
+  }

  void Disable() {}
 };
@@ -281,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }

 void ClearCurAnnotation() { cur_annotation = nullptr; }

+const char *CurAnnotation() { return cur_annotation; }
+
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -36,6 +36,21 @@ class DeviceTracer {
    uint32_t stream_id;
    uint32_t correlation_id;
  };
+  struct CPURecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint64_t thread_id;
+  };
+  struct MemRecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint32_t device_id;
+    uint32_t stream_id;
+    uint32_t correlation_id;
+    uint64_t bytes;
+  };

  virtual ~DeviceTracer() {}
  // Needs to be called once before use.
@@ -48,6 +63,14 @@ class DeviceTracer {
  // human-readable annotations.
  virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;

+  virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
+                             uint64_t end_ns, uint32_t device_id,
+                             uint32_t stream_id, uint32_t correlation_id,
+                             uint64_t bytes) = 0;
+
+  virtual void AddCPURecords(const char* anno, uint64_t start_ns,
+                             uint64_t end_ns) = 0;
+
  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
  virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -55,7 +78,7 @@ class DeviceTracer {
                                uint32_t correlation_id) = 0;

  // Generate a proto after done (Disabled).
-  virtual proto::Profile GenProfile() = 0;
+  virtual proto::Profile GenProfile(const std::string& profile_path) = 0;

  virtual bool IsEnabled() = 0;
 };
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
 void SetCurAnnotation(const char* anno);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
-
+// Current name of the operation being run in the thread.
+const char* CurAnnotation();
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)

 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)

--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
  __macro(cuptiFinalize);                     \
  __macro(cuptiSubscribe);                    \
  __macro(cuptiUnsubscribe);                  \
-  __macro(cuptiEnableCallback);
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);

 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);


--- a/paddle/fluid/platform/dynload/cupti_lib_path.h.in
+++ b/paddle/fluid/platform/dynload/cupti_lib_path.h.in
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
+#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"

 DEFINE_string(cudnn_dir, "",
@@ -45,8 +46,7 @@ DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 namespace paddle {
 namespace platform {
 namespace dynload {
-
-static const char* cupti_lib_path = CUPTI_LIB_PATH;
+static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;

 static inline std::string join(const std::string& part1,
                               const std::string& part2) {

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+#include <sys/time.h>
+#include <time.h>
 #include <iomanip>
 #include <map>
 #ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
      .count();
 }

+inline uint64_t PosixInNsec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
+}
+
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
             const DeviceContext* dev_ctx)
    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }

-RecordEvent::RecordEvent(const std::string& name,
-                         const DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
+    : start_ns_(PosixInNsec()) {
  if (g_state == ProfilerState::kDisabled) return;
  dev_ctx_ = dev_ctx;
  name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,

 RecordEvent::~RecordEvent() {
  if (g_state == ProfilerState::kDisabled) return;
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer) {
+    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
+  }
  ClearCurAnnotation();
  PopEvent(name_, dev_ctx_);
 }
@@ -199,22 +211,22 @@ std::vector<std::vector<Event>> GetAllEvents() {
  return result;
 }

-void DisableProfiler(EventSortingKey sorted_key) {
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path) {
  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                 "Can't disable profiling, since it's not starting.");
  // Mark the profiling stop.
  Mark("_stop_profiler_", nullptr);
  g_state = ProfilerState::kDisabled;

+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
  DeviceTracer* tracer = GetDeviceTracer();
  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
    tracer->Disable();
-    tracer->GenProfile();
+    tracer->GenProfile(profile_path);
  }
-
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
 }

 void ParseEvents(std::vector<std::vector<Event>>& events,

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -108,6 +108,7 @@ struct RecordEvent {

  ~RecordEvent();

+  uint64_t start_ns_;
  // The device context is used by Event to get the current cuda stream.
  const DeviceContext* dev_ctx_;
  // Event name
@@ -140,7 +141,8 @@ void EnableProfiler(ProfilerState state);
 // Clear the g_all_event_lists, which is total event lists of all threads.
 void ResetProfiler();

-void DisableProfiler(EventSortingKey sorted_key);
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path);

 // Parse the event list and output the profiling report
 void ParseEvents(std::vector<std::vector<Event>>&,

--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -15,12 +15,17 @@ limitations under the License. */
 syntax = "proto2";
 package paddle.platform.proto;

+message MemCopy { optional uint64 bytes = 3; }
+
 message Event {
  optional string name = 1;
  optional uint64 start_ns = 2;
  optional uint64 end_ns = 3;
-  optional uint32 device_id = 5;
+  // When positive, it represents gpu id. When -1, it represents CPU.
+  optional int64 device_id = 5;
  optional uint32 stream_id = 6;
+
+  optional MemCopy memcopy = 7;
 }

 message Profile {

--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -125,5 +125,5 @@ TEST(RecordEvent, RecordEvent) {
  EXPECT_EQ(start_profiler_count, 1);

  // Will remove parsing-related code from test later
-  DisableProfiler(EventSortingKey::kTotal);
+  DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -31,7 +31,6 @@ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
 BUILD_ROOT=/paddle/build_android
 DEST_ROOT=/paddle/install_android

-rm -rf $BUILD_ROOT 2>/dev/null || true
 mkdir -p $BUILD_ROOT
 cd $BUILD_ROOT


--- a/paddle/scripts/travis/build_android.sh
+++ b/paddle/scripts/travis/build_android.sh
-#!/bin/bash
-set -e
-
-ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
-TMP_DIR=$HOME/$JOB/tmp
-mkdir -p $TMP_DIR
-cd $TMP_DIR
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
-$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-cd $HOME
-rm -rf $TMP_DIR
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_android
-cd $TRAVIS_BUILD_DIR/build_android
-
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      ..
-
-make -j `nproc`
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -57,7 +57,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})

 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
 if(WITH_SWIG_PY)

--- a/python/paddle/fluid/tests/notest_csp.py
+++ b/python/paddle/fluid/tests/notest_csp.py
@@ -11,27 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-
-
-class TestCSPFramework(unittest.TestCase):
-    def daisy_chain(self):
-        n = 10000
-        leftmost = fluid.make_channel(dtype=int)
-        right = leftmost
-        left = leftmost
-        with fluid.While(steps=n):
-            right = fluid.make_channel(dtype=int)
-            with fluid.go():
-                fluid.send(left, 1 + fluid.recv(right))
-            left = right
-
-        with fluid.go():
-            fluid.send(right, 1)
-        fluid.Print(fluid.recv(leftmost))
-
-
-if __name__ == '__main__':
-    unittest.main()
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -220,7 +220,10 @@ def _callback_lookup_(op):
    :return: callback function
    """
    if op.type == 'parallel_do' and op.attr('use_nccl'):
+        all_vars = op.block.vars
        param_names = set(op.input('parameters'))
+        param_names = filter(lambda name: all_vars[name].stop_gradient is False,
+                             param_names)
        param_grad_names = [n + "@GRAD" for n in param_names]

        class ParallelDoCallBack(object):

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# TODO: Variables: make_channel
-# TODO: Operators: send, close_channel, recv, go, select
 from layers.control_flow import BlockGuard
-from layer_helper import LayerHelper
+from layer_helper import LayerHelper, unique_name
+from layers import fill_constant
+import core

 __all__ = [
    'Go',
@@ -46,41 +46,200 @@ class Go(BlockGuard):
        parent_block = main_program.block(main_program.current_block()
                                          .parent_idx)

+        inner_outputs = set()
        x_name_list = set()
-        out_vars = set()
        for op in go_block.ops:
            # Iterate over all operators, get all the inputs
            # and add as input to the Go operator.
            for iname in op.input_names:
                for in_var_name in op.input(iname):
-                    x_name_list.add(in_var_name)
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)

-            # Iterate over all operators , get all the outputs
-            # add to the output list of Go operator only if
-            # they exist in the parent block.
            for oname in op.output_names:
                for out_var_name in op.output(oname):
-                    if out_var_name in parent_block.vars:
-                        out_vars.add(parent_block.var(out_var_name))
+                    inner_outputs.add(out_var_name)
+
+        # Iterate over all operators , get all the outputs
+        # add to the output list of Go operator only if
+        # they exist in the parent block.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))

        parent_block.append_op(
            type='go',
-            inputs={'X': [parent_block.var(x_name) for x_name in x_name_list]},
-            outputs={'Out': out_vars},
+            inputs={
+                'X':
+                [parent_block.var_recursive(x_name) for x_name in x_name_list]
+            },
+            outputs={},
            attrs={'sub_block': go_block})


-def make_channel(dtype, size=0):
-    return True
+def make_channel(dtype, capacity=0):
+    """
+    Helps implementation of a concurrent program by creating a "channel" of
+    a defined data type. Channels allow for the passing of data in
+    concurrent scenarios - such as when using threads to divide computation.
+    Channels can be used to "send" and "receive" such data concurrently.

+    There are two kinds of channels: unbuffered and buffered. Unbuffered
+    channels have no capacity - and thus, block on send and only unblock only
+    once what they have sent has been received.

-def channel_send(channel, value):
-    return True
+    On the other hand, buffered channels are initialized with a capacity -
+    and do not block on sends.
+
+    Use this method in combination with `channel_send`, `channel_recv`,
+    `channel_close`, and `Go` to design a concurrent Paddle program.
+
+    Args:
+        dtype (ParamAttr|string): Data type of the data sent in the channel.
+        This data type should be the string name of a numpy data type.
+        capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
+        to create an unbuffered channel.
+
+    Returns:
+        Variable: The channel variable that can be used to send an receive data
+                  of the defined dtype.
+
+    Examples:
+        .. code-block:: python

+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to execute in a Go block, which receives the channel data.
+          fluid.channel_send(ch, 100)
+          fluid.channel_close(ch)
+    """
+    helper = LayerHelper('channel_create', **locals())
+    main_program = helper.main_program
+    make_channel_block = main_program.current_block()

-def channel_recv(channel):
-    return True
+    # Make a channel variable (using the channel data type) and make sure it
+    # persists into the global scope.
+    channel = helper.create_variable(
+        name=unique_name.generate('channel'),
+        type=core.VarDesc.VarType.CHANNEL,
+        persistable=True)
+
+    create_channel_op = make_channel_block.append_op(
+        type="channel_create",
+        outputs={"Out": channel},
+        attrs={"data_type": dtype,
+               "capacity": capacity})
+
+    return channel
+
+
+def channel_send(channel, value):
+    """
+    Sends a value through a channel variable. Used by an unbuffered or buffered
+    channel to pass data from within or to a concurrent Go block, where
+    `channel_recv` to used to get the passed value.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+        value (Variable): Value to send to channel
+    Returns:
+        Variable: The boolean status on whether or not the channel
+                  successfully sent the passed value.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to execute in a Go block, which receives the channel data.
+          fluid.channel_send(ch, 100)
+    """
+    helper = LayerHelper('channel_send', **locals())
+    main_program = helper.main_program
+    channel_send_block = main_program.current_block()
+
+    status = helper.create_variable(
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
+
+    channel_send_op = channel_send_block.append_op(
+        type="channel_send",
+        inputs={
+            "Channel": channel,
+            "X": value,
+        },
+        outputs={"Status": status})
+
+    return status
+
+
+def channel_recv(channel, return_value):
+    """
+    Receives a value through a channel variable. Used by an unbuffered or
+    buffered channel within a concurrent Go block to get data from originally
+    sent using `channel_send`, or from outside such a block where
+    `channel_send` is used to send the value.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+        return_value (Variable): Variable to set as a result of running channel_recv_op
+
+    Returns:
+        Variable: The received value from the channel.
+        Variable: The boolean status on whether or not the channel
+                  successfully received the passed value.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          with fluid.Go():
+            returned_value = fluid.channel_recv(ch, 'int32')
+
+          # Code to send data through the channel.
+    """
+    helper = LayerHelper('channel_recv', **locals())
+    main_program = helper.main_program
+    channel_recv_block = main_program.current_block()
+
+    status = helper.create_variable(
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
+
+    channel_recv_op = channel_recv_block.append_op(
+        type="channel_recv",
+        inputs={"Channel": channel},
+        outputs={"Out": return_value,
+                 "Status": status})
+
+    return return_value, status


 def channel_close(channel):
-    return True
+    """
+    Closes a channel created using `make_channel`.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to receive and send data through a channel
+          ...
+          fluid.channel_close(ch)
+    """
+    helper = LayerHelper('channel_close', **locals())
+    main_program = helper.main_program
+    channel_close_block = main_program.current_block()
+
+    channel_close_op = channel_close_block.append_op(
+        type="channel_close", inputs={"Channel": channel})
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -276,17 +276,17 @@ class DistributeTranspiler:
            pserver_program.global_block().create_var(
                name=orig_var_name,
                persistable=True,
+                type=v.type,
                dtype=v.dtype,
                shape=v.shape)
-            print("create origin var: ", orig_var_name)
            for trainer_id in xrange(self.trainers):
                var = pserver_program.global_block().create_var(
                    name="%s.trainer_%d" % (orig_var_name, trainer_id),
                    persistable=False,
+                    type=v.type,
                    dtype=v.dtype,
                    shape=v.shape)
                recv_inputs.append(var)
-                print("create per trainer var: ", var.name)
        # step3
        optimize_block = pserver_program.create_block(0)
        # step 4
@@ -551,11 +551,12 @@ class DistributeTranspiler:
                        type="sum",
                        inputs={"X": vars2merge},
                        outputs={"Out": merged_var})
-                    optimize_block.append_op(
-                        type="scale",
-                        inputs={"X": merged_var},
-                        outputs={"Out": merged_var},
-                        attrs={"scale": 1.0 / float(self.trainers)})
+                    if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                        optimize_block.append_op(
+                            type="scale",
+                            inputs={"X": merged_var},
+                            outputs={"Out": merged_var},
+                            attrs={"scale": 1.0 / float(self.trainers)})
                new_inputs[key] = merged_var
            elif key == "Param":
                # param is already created on global program

--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -22,6 +22,7 @@ from layer_helper import LayerHelper
 __all__ = [
    'Accuracy',
    'ChunkEvaluator',
+    'EditDistance',
 ]


@@ -173,7 +174,7 @@ class ChunkEvaluator(Evaluator):
 class EditDistance(Evaluator):
    """
    Accumulate edit distance sum and sequence number from mini-batches and
-    compute the average edit_distance of all batches.
+    compute the average edit_distance and instance error of all batches.

    Args:
        input: the sequences predicted by network.
@@ -189,14 +190,12 @@ class EditDistance(Evaluator):
        for epoch in PASS_NUM:
            distance_evaluator.reset(exe)
            for data in batches:
-                loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics)
-                avg_distance = distance_evaluator.eval(exe)
-            pass_distance = distance_evaluator.eval(exe)
+                loss = exe.run(fetch_list=[cost])
+            distance, instance_error = distance_evaluator.eval(exe)

        In the above example:
-        'sum_distance' is the sum of the batch's edit distance.
-        'avg_distance' is the average of edit distance from the firt batch to the current batch.
-        'pass_distance' is the average of edit distance from all the pass.
+        'distance' is the average of the edit distance in a pass.
+        'instance_error' is the instance error rate in a pass.

    """

@@ -206,25 +205,45 @@ class EditDistance(Evaluator):
        if main_program.current_block().idx != 0:
            raise ValueError("You can only invoke Evaluator in root block")

-        self.total_error = self.create_state(
-            dtype='float32', shape=[1], suffix='total_error')
+        self.total_distance = self.create_state(
+            dtype='float32', shape=[1], suffix='total_distance')
        self.seq_num = self.create_state(
            dtype='int64', shape=[1], suffix='seq_num')
-        error, seq_num = layers.edit_distance(
+        self.instance_error = self.create_state(
+            dtype='int64', shape=[1], suffix='instance_error')
+        distances, seq_num = layers.edit_distance(
            input=input, label=label, ignored_tokens=ignored_tokens)
-        #error = layers.cast(x=error, dtype='float32')
-        sum_error = layers.reduce_sum(error)
-        layers.sums(input=[self.total_error, sum_error], out=self.total_error)
+
+        zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
+        compare_result = layers.equal(distances, zero)
+        compare_result_int = layers.cast(x=compare_result, dtype='int')
+        seq_right_count = layers.reduce_sum(compare_result_int)
+        instance_error_count = layers.elementwise_sub(
+            x=seq_num, y=seq_right_count)
+        total_distance = layers.reduce_sum(distances)
+        layers.sums(
+            input=[self.total_distance, total_distance],
+            out=self.total_distance)
        layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
-        self.metrics.append(sum_error)
+        layers.sums(
+            input=[self.instance_error, instance_error_count],
+            out=self.instance_error)
+        self.metrics.append(total_distance)
+        self.metrics.append(instance_error_count)

    def eval(self, executor, eval_program=None):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
        with program_guard(main_program=eval_program):
-            total_error = _clone_var_(block, self.total_error)
+            total_distance = _clone_var_(block, self.total_distance)
            seq_num = _clone_var_(block, self.seq_num)
+            instance_error = _clone_var_(block, self.instance_error)
            seq_num = layers.cast(x=seq_num, dtype='float32')
-            out = layers.elementwise_div(x=total_error, y=seq_num)
-        return np.array(executor.run(eval_program, fetch_list=[out])[0])
+            instance_error = layers.cast(x=instance_error, dtype='float32')
+            avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
+            avg_instance_error = layers.elementwise_div(
+                x=instance_error, y=seq_num)
+            result = executor.run(
+                eval_program, fetch_list=[avg_distance, avg_instance_error])
+        return np.array(result[0]), np.array(result[1])
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,7 +14,7 @@

 import numpy as np
 import contextlib
-from framework import Program, default_main_program
+from framework import Program, default_main_program, Variable
 from . import core

 __all__ = [
@@ -281,6 +281,8 @@ class Executor(object):

        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
            for i, var in enumerate(fetch_list):
+                assert isinstance(var, Variable) or isinstance(var, str), (
+                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
                global_block.append_op(
                    type='fetch',
                    inputs={'X': [var]},

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -141,6 +141,8 @@ class Variable(object):
        dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
        lod_level(int): The level of lod tensor. 0 means it is not a time
            series data.
+        capacity(int): The capacity of Channel variable. Ignored
+            for other types.
        persistable(bool): True if the variable should be saved as check point.
            Defaults to False.
        stop_gradient(bool): True if the variable will stop to calculate
@@ -154,6 +156,7 @@ class Variable(object):
                 shape=None,
                 dtype=None,
                 lod_level=None,
+                 capacity=None,
                 persistable=None,
                 error_clip=None,
                 stop_gradient=False,
@@ -224,6 +227,14 @@ class Variable(object):
                        "persistable is {2}. They are not matched".format(
                            self.name, self.persistable, persistable))

+        if capacity is not None:
+            if is_new_var:
+                self.desc.set_capacity(capacity)
+            else:
+                # TODO(abhinavarora) : Compare with set capacity once,
+                # get_capacity is implemented
+                pass
+
        self.block.vars[name] = self
        self.op = None
        self.stop_gradient = stop_gradient
@@ -472,10 +483,11 @@ class Operator(object):

        self.desc.check_attrs()
        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent',
+            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit'
+            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
+            'channel_send', 'channel_recv'
        }
        if type not in no_kernel_op_set:
            self.desc.infer_var_type(self.block.desc)
@@ -761,7 +773,7 @@ class Block(object):
            stop_gradient = v.stop_gradient
        else:
            raise ValueError("unsupported var type: %s", type(v))
-
+        orig_var_type = v.type
        self.desc.rename_var(name, new_name)
        # NOTE: v is destroyed by C++ after calling rename_var.
        d = self.desc.find_var(new_name)
@@ -770,6 +782,7 @@ class Block(object):
                self,
                d.shape(),
                d.dtype(),
+                type=orig_var_type,
                name=new_name,
                stop_gradient=stop_gradient,
                trainable=trainable,
@@ -780,7 +793,7 @@ class Block(object):
        elif var_type == "Variable":
            var = Variable(
                self,
-                type=v.type,
+                type=orig_var_type,
                name=new_name,
                error_clip=error_clip,
                stop_gradient=stop_gradient)

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -330,9 +330,28 @@ class LayerHelper(object):
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
+        """
+        create global variable, note that there is no initializer for this global variable.
+        Args:
+            persistable(bool): True if it is a checkpoint value.
+            *args: See create_var's documentation
+            **kwargs: See create_var's documentation
+
+        Returns(Variable): the created variable.
+        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

+    def create_or_get_global_variable(self, name, *args, **kwargs):
+        """
+        Creates a global variable if not exists and returns the variable and
+        a boolean flag which is true when it is a new variable.
+        """
+        if self.main_program.global_block().has_var(name):
+            return self.main_program.global_block().var(name), False
+        else:
+            return self.create_global_variable(name=name, *args, **kwargs), True
+
    def set_variable_initializer(self, var, initializer):
        assert isinstance(var, Variable)
        self.startup_program.global_block().create_var(

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -43,8 +43,8 @@ for _OP in set(__auto__):
    globals()[_OP] = generate_layer_fn(_OP)


-def detection_output(scores,
-                     loc,
+def detection_output(loc,
+                     scores,
                     prior_box,
                     prior_box_var,
                     background_label=0,
@@ -54,21 +54,27 @@ def detection_output(scores,
                     score_threshold=0.01,
                     nms_eta=1.0):
    """
-    **Detection Output Layer**
+    **Detection Output Layer for Single Shot Multibox Detector (SSD).**

-    This layer applies the NMS to the output of network and computes the
-    predict bounding box location. The output's shape of this layer could
-    be zero if there is no valid bounding box.
+    This operation is to get the detection results by performing following
+    two steps:
+    
+    1. Decode input bounding box predictions according to the prior boxes.
+    2. Get the final detection results by applying multi-class non maximum
+       suppression (NMS).
+
+    Please note, this operation doesn't clip the final output bounding boxes
+    to the image window.

    Args:
-        scores(Variable): A 3-D Tensor with shape [N, C, M] represents the
-            predicted confidence predictions. N is the batch size, C is the
-            class number, M is number of bounding boxes. For each category
-            there are total M scores which corresponding M bounding boxes.
        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
            predicted locations of M bounding bboxes. N is the batch size,
            and each bounding box has four coordinate values and the layout
            is [xmin, ymin, xmax, ymax].
+        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which corresponding M bounding boxes.
        prior_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
            each box is represented as [xmin, ymin, xmax, ymax],
            [xmin, ymin] is the left top coordinate of the anchor box,
@@ -91,7 +97,15 @@ def detection_output(scores,
        nms_eta(float): The parameter for adaptive NMS.

    Returns:
-        The detected bounding boxes which are a Tensor.
+        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
+            `No` is the total number of detections in this mini-batch. For each
+            instance, the offsets in first dimension are called LoD, the offset
+            number is N + 1, N is the batch size. The i-th image has
+            `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
+            has no detected results. If all images have not detected results,
+            all the elements in LoD are 0, and output tensor only contains one
+            value, which is -1.

    Examples:
        .. code-block:: python
@@ -100,7 +114,7 @@ def detection_output(scores,
                         append_batch_size=False, dtype='float32')
        pbv = layers.data(name='prior_box_var', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        loc = layers.data(name='target_box', shape=[21, 4],
+        loc = layers.data(name='target_box', shape=[2, 21, 4],
                          append_batch_size=False, dtype='float32')
        scores = layers.data(name='scores', shape=[2, 21, 10],
                          append_batch_size=False, dtype='float32')
@@ -109,7 +123,6 @@ def detection_output(scores,
                                       prior_box=pb,
                                       prior_box_var=pbv)
    """
-
    helper = LayerHelper("detection_output", **locals())
    decoded_box = box_coder(
        prior_box=prior_box,
@@ -118,6 +131,7 @@ def detection_output(scores,
        code_type='decode_center_size')

    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    scores = nn.transpose(scores, perm=[0, 2, 1])
    helper.append_op(
        type="multiclass_nms",
        inputs={'Scores': scores,
@@ -595,12 +609,13 @@ def multi_box_head(inputs,
       name(str): Name of the prior box layer. Default: None.

    Returns:
-        mbox_loc(list): The predicted boxes' location of the inputs.
-             The layout of each element is [N, H, W, Priors]. Priors
-             is the number of predicted boxof each position of each input.
-        mbox_conf(list): The predicted boxes' confidence of the inputs.
-             The layout of each element is [N, H, W, Priors]. Priors
-             is the number of predicted box of each position of each input.
+        mbox_loc(Variable): The predicted boxes' location of the inputs.
+             The layout is [N, H*W*Priors, 4]. where Priors
+             is the number of predicted boxes each position of each input.
+        mbox_conf(Variable): The predicted boxes' confidence of the inputs.
+             The layout is [N, H*W*Priors, C]. where Priors
+             is the number of predicted boxes each position of each input
+             and C is the number of Classes.
        boxes(Variable): the output prior boxes of PriorBox.
             The layout is [num_priors, 4]. num_priors is the total
             box count of each position of inputs.
@@ -751,7 +766,7 @@ def multi_box_head(inputs,
        num_boxes = box.shape[2]

        # get box_loc
-        num_loc_output = num_boxes * num_classes * 4
+        num_loc_output = num_boxes * 4
        mbox_loc = nn.conv2d(
            input=input,
            num_filters=num_loc_output,
@@ -760,7 +775,12 @@ def multi_box_head(inputs,
            stride=stride)

        mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
-        mbox_locs.append(mbox_loc)
+        new_shape = [
+            mbox_loc.shape[0],
+            mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
+        ]
+        mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+        mbox_locs.append(mbox_loc_flatten)

        # get conf_loc
        num_conf_output = num_boxes * num_classes
@@ -771,11 +791,18 @@ def multi_box_head(inputs,
            padding=pad,
            stride=stride)
        conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
-        mbox_confs.append(conf_loc)
+        new_shape = [
+            conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
+            conf_loc.shape[3] / num_classes, num_classes
+        ]
+        conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+        mbox_confs.append(conf_loc_flatten)

    if len(box_results) == 1:
        box = box_results[0]
        var = var_results[0]
+        mbox_locs_concat = mbox_locs[0]
+        mbox_confs_concat = mbox_confs[0]
    else:
        reshaped_boxes = []
        reshaped_vars = []
@@ -785,5 +812,7 @@ def multi_box_head(inputs,

        box = tensor.concat(reshaped_boxes)
        var = tensor.concat(reshaped_vars)
+        mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
+        mbox_confs_concat = tensor.concat(mbox_confs, axis=1)

-    return mbox_locs, mbox_confs, box, var
+    return mbox_locs_concat, mbox_confs_concat, box, var
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -53,12 +53,22 @@ def monkey_patch_variable():
        value = float(value)
        tmp_name = unique_tmp_name()
        var = ref_var.block.create_var(name=tmp_name, dtype=dtype)
+        batch_dim = -1
+        for i, d in enumerate(ref_var.shape):
+            if d < 0:
+                batch_dim = i
+                break
+        assert batch_dim != -1
        ref_var.block.append_op(
            type='fill_constant_batch_size_like',
            outputs={'Out': [var]},
            inputs={'Input': [ref_var]},
-            attrs={'shape': ref_var.shape,
-                   'value': value})
+            attrs={
+                'shape': ref_var.shape,
+                'value': value,
+                'input_dim_idx': batch_dim,
+                'output_dim_idx': batch_dim
+            })
        return var

    def astype(self, dtype):
@@ -118,11 +128,20 @@ def monkey_patch_variable():
            tmp_name = unique_tmp_name()
            out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)

+            axis = -1
+            if other_var.shape[0] == -1:
+                axis = 0
+            assert len(self.shape) >= len(other_var.shape), (
+                "The rank of the first argument of an binary operator cannot "
+                "be smaller than the rank of its second argument: %s vs %s" %
+                (len(self.shape), len(other_var.shape)))
+
            self.block.append_op(
                type=op_type,
                inputs={'X': [self],
                        'Y': [other_var]},
-                outputs={'Out': out})
+                outputs={'Out': out},
+                attrs={'axis': axis})
            return out

        comment = OpProtoHolder.instance().get_op_proto(op_type).comment
@@ -131,7 +150,7 @@ def monkey_patch_variable():
        {0}
        Args:
            self(Variable): left hand variable
-            other_var(Variable|float|int): right hand variable 
+            other_var(Variable|float|int): right hand variable

        Returns:
            Variable

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -69,6 +69,7 @@ __all__ = [
    'softmax_with_cross_entropy',
    'smooth_l1',
    'one_hot',
+    'autoincreased_step_counter',
 ]


@@ -2443,10 +2444,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
    return out


-def edit_distance(input,
-                  label,
-                  normalized=False,
-                  ignored_tokens=None,
+def edit_distance(input, label, normalized=True, ignored_tokens=None,
                  name=None):
    """
    EditDistance operator computes the edit distances between a batch of
@@ -3201,3 +3199,34 @@ def one_hot(input, depth):
        attrs={'depth': depth},
        outputs={'Out': one_hot_out})
    return one_hot_out
+
+
+def autoincreased_step_counter(counter_name=None, begin=1, step=1):
+    """
+    NOTE: The counter will be automatically increased by 1 every mini-batch
+    Return the run counter of the main program, which is started with 1.
+
+    Args:
+        counter_name(str): The counter name, default is '@STEP_COUNTER@'.
+        begin(int): The first value of this counter.
+        step(int): The increment step between each execution.
+
+    Returns(Variable): The global run counter.
+    """
+    helper = LayerHelper('global_step_counter')
+    if counter_name is None:
+        counter_name = '@STEP_COUNTER@'
+    counter, is_new_var = helper.create_or_get_global_variable(
+        name=counter_name, dtype='int64', shape=[1], persistable=True)
+    if is_new_var:
+        helper.set_variable_initializer(
+            counter, initializer=Constant(
+                value=begin - 1, force_cpu=True))
+        helper.main_program.global_block().prepend_op(
+            type='increment',
+            inputs={'X': [counter]},
+            outputs={'Out': [counter]},
+            attrs={'step': float(step)})
+        counter.stop_gradient = True
+
+    return counter
--- a/python/paddle/fluid/learning_rate_decay.py
+++ b/python/paddle/fluid/learning_rate_decay.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 import layers
-from framework import Variable
 from initializer import init_on_cpu

 __all__ = [
@@ -30,11 +29,15 @@ strategy according to this module.
 """


-def exponential_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
+def _decay_step_counter():
+    # the first global step is zero in learning rate decay
+    global_step = layers.autoincreased_step_counter(
+        counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
+    global_step = layers.cast(global_step, 'float32')
+    return global_step
+
+
+def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """Applies exponential decay to the learning rate.

    ```python
@@ -44,7 +47,6 @@ def exponential_decay(learning_rate,
    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
-        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -52,8 +54,7 @@ def exponential_decay(learning_rate,
    Returns:
        The decayed learning rate
    """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for exponential_decay.")
+    global_step = _decay_step_counter()

    with init_on_cpu():
        # update learning_rate
@@ -65,23 +66,17 @@ def exponential_decay(learning_rate,
    return decayed_lr


-def natural_exp_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
+def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """Applies natural exponential decay to the initial learning rate.

-    ```python
-    if not staircase:
-        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-    else:
-        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-    ```
+    >>> if not staircase:
+    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+
    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
-        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -89,8 +84,7 @@ def natural_exp_decay(learning_rate,
    Returns:
        The decayed learning rate
    """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for natural_exp_decay.")
+    global_step = _decay_step_counter()

    with init_on_cpu():
        div_res = global_step / decay_steps
@@ -101,23 +95,17 @@ def natural_exp_decay(learning_rate,
    return decayed_lr


-def inverse_time_decay(learning_rate,
-                       global_step,
-                       decay_steps,
-                       decay_rate,
-                       staircase=False):
+def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
    """Applies inverse time decay to the initial learning rate.

-    ```python
-    if staircase:
-      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    else:
-      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
-    ```
+    >>> if staircase:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+
    Args:
        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        global_step: A Variable that record the training step.
+          will be the initial learning rate during training.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -125,8 +113,7 @@ def inverse_time_decay(learning_rate,
    Returns:
        The decayed learning rate
    """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for inverse_time_decay.")
+    global_step = _decay_step_counter()

    with init_on_cpu():
        div_res = global_step / decay_steps
@@ -139,26 +126,22 @@ def inverse_time_decay(learning_rate,


 def polynomial_decay(learning_rate,
-                     global_step,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False):
    """Applies polynomial decay to the initial learning rate.

-    ```python
-    if cycle:
-        decay_steps = decay_steps * ceil(global_step / decay_steps)
-    else:
-        global_step = min(global_step, decay_steps)
-    decayed_learning_rate = (learning_rate - end_learning_rate) *
-                      (1 - global_step / decay_steps) ^ power +
-                      end_learning_rate
-    ```
+    >>> if cycle:
+    >>>     decay_steps = decay_steps * ceil(global_step / decay_steps)
+    >>> else:
+    >>>     global_step = min(global_step, decay_steps)
+    >>> decayed_learning_rate = (learning_rate - end_learning_rate) *
+    >>>                   (1 - global_step / decay_steps) ^ power +
+    >>>                   end_learning_rate
    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
-        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        end_learning_rate: A Python `float` number.
        power: A Python `float` number
@@ -167,8 +150,7 @@ def polynomial_decay(learning_rate,
    Returns:
        The decayed learning rate
    """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for inverse_time_decay.")
+    global_step = _decay_step_counter()

    with init_on_cpu():
        if cycle:
@@ -193,27 +175,24 @@ def polynomial_decay(learning_rate,
    return decayed_lr


-def piecewise_decay(global_step, boundaries, values):
+def piecewise_decay(boundaries, values):
    """Applies piecewise decay to the initial learning rate.

-    ```python
-    boundaries = [10000, 20000]
-    values = [1.0, 0.5, 0.1]
-
-    if step < 10000:
-        learning_rate = 1.0
-    elif step >= 10000 and step < 20000:
-        learning_rate = 0.5
-    else:
-        learning_rate = 0.1
-    ```
+    >>> boundaries = [10000, 20000]
+    >>> values = [1.0, 0.5, 0.1]
+    >>>
+    >>> if step < 10000:
+    >>>     learning_rate = 1.0
+    >>> elif 10000 <= step < 20000:
+    >>>     learning_rate = 0.5
+    >>> else:
+    >>>     learning_rate = 0.1
    """

    if len(values) - len(boundaries) != 1:
        raise ValueError("len(values) - len(boundaries) should be 1")

-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for piecewise_decay.")
+    global_step = _decay_step_counter()

    with init_on_cpu():
        lr = layers.create_global_var(

--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -29,6 +29,8 @@ dtype_to_size = {
    core.VarDesc.VarType.BOOL: 1
 }

+sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
+

 class ControlFlowGraph(object):
    def __init__(self, Program, ops, forward_num, skip_opt):
@@ -141,7 +143,7 @@ class ControlFlowGraph(object):
        self.pool = []
        for i in range(self.op_size):
            op = self._ops[i]
-            if op.type() == "while" or op.type() == "while_grad":
+            if op.type() in sub_block_ops:
                continue
            block_desc = op.block()
            is_forward = i < self._forward_num
@@ -198,67 +200,75 @@ class ControlFlowGraph(object):
                        block_desc, var_name, is_forward).shape()))


-def get_cfgs(input_program):
+def _process_sub_block_pair(pdesc, sub_block_pair):
    ops_list = []
-    pdesc = input_program.get_desc()
    block_desc = pdesc.block(0)
    op_size = block_desc.op_size()
-    # Get global block ops
-    ops_list.append(
-        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
-
-    while_sub_block_ids = []
-    while_grad_sub_block_ids = []
-    while_block_id_pair = []
-    while_op_dict = {}
+    for fwd_op, bwd_op in sub_block_pair:
+        sub_block_ids = []
+        grad_sub_block_ids = []
+        sub_block_id_pair = []
+        sub_op_dict = {}
+        for i in range(op_size):
+            op = block_desc.op(i)
+            if op.type() == fwd_op:
+                sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+            elif op.type() == bwd_op:
+                grad_sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op

-    for i in range(op_size):
-        op = block_desc.op(i)
-        if op.type() == "while":
-            while_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_dict[op.attr("sub_block").id] = op
-        elif op.type() == "while_grad":
-            while_grad_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_dict[op.attr("sub_block").id] = op
+        # Find fwd_op/bwd_op block pair
+        for grad_id in grad_sub_block_ids:
+            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
+            if fwd_id in sub_block_ids:
+                sub_block_id_pair.append((fwd_id, grad_id))
+                sub_block_ids.remove(fwd_id)

-    # Find while/while_grad block pair
-    for grad_id in while_grad_sub_block_ids:
-        forward_id = pdesc.block(grad_id).get_forward_block_idx()
-        if forward_id in while_sub_block_ids:
-            while_block_id_pair.append((forward_id, grad_id))
-            while_sub_block_ids.remove(forward_id)
+        # Get fwd_op/bwd_op block ops
+        for fwd_id, grad_id in sub_block_id_pair:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            block_op_size = sub_block.op_size()
+            for i in range(block_op_size):
+                sub_block_ops.append(sub_block.op(i))

-    # Get while/while_grad block ops
-    for forward_id, grad_id in while_block_id_pair:
-        while_block_ops = []
-        while_block = pdesc.block(forward_id)
-        while_block_op_size = while_block.op_size()
-        for i in range(while_block_op_size):
-            while_block_ops.append(while_block.op(i))
+            grad_sub_block = pdesc.block(grad_id)
+            grad_sub_block_op_size = grad_sub_block.op_size()
+            for i in range(grad_sub_block_op_size):
+                sub_block_ops.append(grad_sub_block.op(i))

-        while_grad_block = pdesc.block(grad_id)
-        while_grad_block_op_size = while_grad_block.op_size()
-        for i in range(while_grad_block_op_size):
-            while_block_ops.append(while_grad_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
+            ops_list.append((sub_block_ops, block_op_size, sub_op_output))

-        while_op_output = set()
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
-        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+        # Process rest fwd_op block ops
+        for fwd_id in sub_block_ids:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            sub_block_op_size = sub_block.op_size()
+            for i in range(sub_block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
+    return ops_list

-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))

-    # Process rest while block ops
-    for forward_id in while_sub_block_ids:
-        while_block_ops = []
-        while_block = pdesc.block(forward_id)
-        while_block_op_size = while_block.op_size()
-        for i in range(while_block_op_size):
-            while_block_ops.append(while_block.op(i))
+def _get_cfgs(input_program):
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))

-        while_op_output = set()
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
+    sub_block_pair = [("while", "while_grad"), ("parallel_do",
+                                                "parallel_do_grad")]

-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
+    ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))

    cfgs = [
        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
@@ -268,6 +278,6 @@ def get_cfgs(input_program):


 def memory_optimize(input_program):
-    cfgs = get_cfgs(input_program)
+    cfgs = _get_cfgs(input_program)
    for cfg in cfgs:
        cfg.memory_optimize()
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -35,11 +35,10 @@ class Optimizer(object):
    but need to use one of it's implementation.
    """

-    def __init__(self, learning_rate, global_step=None, regularization=None):
+    def __init__(self, learning_rate, regularization=None):
        if not isinstance(learning_rate, float) and \
                not isinstance(learning_rate, framework.Variable):
            raise TypeError("learning rate should be float or Variable")
-        self._global_step = global_step
        self.regularization = regularization
        self._learning_rate = learning_rate
        # each program should have a independent learning rate
@@ -159,26 +158,6 @@ class Optimizer(object):
                            format(name, param.name))
        return self._accumulators[name][param.name]

-    def _increment_global_step(self, block):
-        """Increment the global step by 1 after every iteration
-
-        Args:
-            block: the block in which the loss variable is present
-
-        Returns:
-            list with global_step increment op as its only element
-        """
-        assert isinstance(block, framework.Block)
-        assert self._global_step is not None
-        # create the increment op
-        increment_op = block.append_op(
-            type="increment",
-            inputs={"X": self._global_step},
-            outputs={"Out": self._global_step},
-            attrs={"step": 1.0})
-
-        return increment_op
-
    def create_optimization_pass(self,
                                 parameters_and_grads,
                                 loss,
@@ -225,8 +204,6 @@ class Optimizer(object):
            # FIXME: Need to fix this once we figure out how to handle dependencies
            self._finish_update(loss.block)

-            if self._global_step is not None:
-                self._increment_global_step(loss.block)
            end = len(global_block.ops)
            return global_block.slice_ops(start, end)


--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -73,7 +73,7 @@ def reset_profiler():


 @contextmanager
-def profiler(state, sorted_key=None):
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    """The profiler interface.
    Different from cuda_profiler, this profiler can be used to profile both CPU
    and GPU program. By defalut, it records the CPU and GPU operator kernels,
@@ -95,8 +95,9 @@ def profiler(state, sorted_key=None):
            The `max` means sorting by the maximum execution time.
            The `min` means sorting by the minimum execution time.
            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
    """
-
    if state not in ['CPU', 'GPU', "All"]:
        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
    if state == "GPU":
@@ -122,4 +123,4 @@ def profiler(state, sorted_key=None):
    }
    # TODO(qingqing) : redirect C++ ostream to Python stream.
    # with core.ostream_redirect(stdout=True, stderr=True):
-    core.disable_profiler(key_map[sorted_key])
+    core.disable_profiler(key_map[sorted_key], profile_path)
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
--- a/python/paddle/fluid/tests/test_concurrency.py
+++ b/python/paddle/fluid/tests/test_concurrency.py
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/__init__.py
+++ b/tools/__init__.py
--- a/tools/timeline.py
+++ b/tools/timeline.py