diff --git a/.travis.yml b/.travis.yml
index e2d49daa1981396628efa5d16459eb70e9e76884..ed566016ec6e0d30a0761356580114020ada66a0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,13 +4,17 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
+services:
+  - docker
 os:
   - linux
 env:
   - JOB=build_doc
   - JOB=check_style
+  - JOB=build_android
 addons:
   apt:
     packages:
@@ -41,8 +45,10 @@ before_install:
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
   - |
-    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
+    # 43min timeout
+    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
   - |
     if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/AUTHORS.md b/AUTHORS.md
index 389a7ad8bcfbb1d294bdd1f16905a7a0d2fbb932..9c6821d9f8681c5907c2fc9938fdb62ba64b9a92 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,5 +1,6 @@
 | Github account | name |
 |---|---|
+| abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
 | beckett1124 | Bin Qi |
 | JiayiFeng | Jia-Yi Feng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2f440c2d089b5d596ab59d5099c0066ef325614..469af0f7859b9ea79d1fc4c53e19cc29bfe28ce8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,11 +166,11 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 
 set(EXTERNAL_LIBS
-    ${GFLAGS_LIBRARIES}
-    ${GLOG_LIBRARIES}
+    gflags
+    glog
     ${CBLAS_LIBRARIES}
-    ${PROTOBUF_LIBRARY}
-    ${ZLIB_LIBRARIES}
+    protobuf
+    zlib
     ${PYTHON_LIBRARIES}
 )
 
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 80eee112ddfb97bea57e302d76e0fef8fc9df9eb..786f224608f7d41c438411de0e09fedbcf2264b8 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -106,10 +106,10 @@ def vgg16_bn_drop(input):
     conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
     bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
     return fc2
 
 
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 7730453fc9292015465713232abda155a18a1aad..0f76f55270592c5625a9624b33f4c0f82efdc627 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -59,7 +59,6 @@ endif(NOT WITH_GOLANG)
 
 if(NOT WITH_GPU)
     add_definitions(-DHPPL_STUB_FUNC)
-    add_definitions("-DCUPTI_LIB_PATH=\"\"")
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
@@ -77,9 +76,7 @@ else()
     if(CUPTI_FOUND)
         include_directories(${CUPTI_INCLUDE_DIR})
         add_definitions(-DPADDLE_WITH_CUPTI)
-        add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
     else()
-        add_definitions("-DCUPTI_LIB_PATH=\"\"")
         message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
     endif()
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index e568880632c885ca77e8e609918ff643e7ed6e47..20b8506e678af4db6ccb65bef99d28e085a67bf2 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -28,7 +28,7 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
 
 ExternalProject_Add(
-    zlib
+    extern_zlib
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/madler/zlib.git"
     GIT_TAG         "v1.2.8"
@@ -49,9 +49,11 @@ ExternalProject_Add(
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
+ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+ADD_DEPENDENCIES(zlib extern_zlib)
+
 LIST(APPEND external_project_dependencies zlib)
-ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 
 IF(WITH_C_API)
   INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 12e07bd5f883d2cf100ece391bdf7d2bbcafc8f7..356da582d1f8b6a8858af90ccdf5af2100e5db87 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
   foreach(lib ${libs})
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
-  list(REMOVE_DUPLICATES libs_deps)
+  if(libs_deps)
+    list(REMOVE_DUPLICATES libs_deps)
+  endif()
 
   # To produce a library we need at least one source file.
   # It is created by add_custom_command below and will helps
@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
-      # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-      target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
         list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+      else()
+        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       endif()
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
     endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index df186637726f60ee1b69cec7291477f3efcd059c..4471df36b0717171da4dff92ca0ec98b4f981028 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -72,7 +72,7 @@ copy(inference_lib DEPENDS paddle_fluid_shared
 )
 
 set(module "platform")
-copy(platform_lib
+copy(platform_lib DEPS profiler_py_proto
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
 )
diff --git a/doc/api/overview.rst b/doc/api/overview.rst
index 16b6cf42660c51feee09c689c671d5ef06663efb..a6f21428de1e4906e4af9433bc1c994f2b2c8b8e 100644
--- a/doc/api/overview.rst
+++ b/doc/api/overview.rst
@@ -7,10 +7,6 @@ it proposes some high-level concepts such as `Layers <http://www.paddlepaddle.or
 A model is composed of the computation described by a group of `Layers`, with `Evaluator` to define the error, `Optimizer` to update the parameters and `Data Reader` to feed in the data.
 
 We also provide the `interface for Training and Inference <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html>`_ to help control the training and inference phrase,
-it has several easy to use methods
+it has several easy to use methods to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
 
-- `paddle.train` 
-- `paddle.test`
-- `paddle.infer`
-
-to better expose the internal running details, different `events <http://www.paddlepaddle.org/docs/develop/api/en/v2/run_logic.html#event>`_ are available to users by writing some callbacks.
+All in all, the V2 API gives a higher abstraction and make PaddlePaddle programs require fiew lines of code.
diff --git a/doc/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
index cb766c3838133740892928b587edcf3843b7abce..115b92a33888abf1e1be400e1abbb58b632a2976 100644
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -9,7 +9,7 @@
 为了编译PaddlePaddle，我们需要
 
 1. 一台电脑，可以装的是 Linux, Windows 或者 MacOS 操作系统
-1. Docker
+2. Docker
 
 不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要，因为我们会把所有编译工具都安装进一个 Docker 镜像里。
 
diff --git a/doc/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
index 556cbfdf087c340a7f7a1760f92325ab87eeea89..8fef9e7347e8d924026999bfda985381750c6b51 100644
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -9,7 +9,7 @@ Requirements
 To build PaddlePaddle, you need
 
 1. A computer -- Linux, Windows, MacOS.
-1. Docker.
+2. Docker.
 
 Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
 We run all the tools by running this image.
diff --git a/doc/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
index c1e806c0fe5f03139c0dff985f9ae0856eaa2e98..0d4c925b6e2731bdfd76582309aa7e8adbafa6ae 100644
--- a/doc/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
@@ -1,4 +1,4 @@
-Install Using pip
+Install using pip
 ================================
 
 You can use current widely used Python package management
@@ -8,7 +8,7 @@ most of current Linux systems or MacOS.
 
 .. _pip_install:
 
-Install Using pip
+Install using pip
 ------------------------------
 
 Run the following command to install PaddlePaddle on the current
diff --git a/doc/design/parallel_do.md b/doc/design/parallel_do.md
index 45f7731996519b87fc591ee8ce831341d7ab2965..42bd136f825986d94fafaeaa5f58edb02848a74c 100644
--- a/doc/design/parallel_do.md
+++ b/doc/design/parallel_do.md
@@ -39,15 +39,16 @@ In the backward pass
 This implementation allows to write mixed device program like this
 
 ```python
-# get embedding feature on CPU
-feature = some_cpu_only_op(data)
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)
 
-gpu_places = get_place(use_gpu=True)
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
 # parallel processing on multiple GPUs
 pd = ParallelDo(gpu_places)
-with pd.do():
-    read_input(feature)
-    prediction = my_net(feature)
+with pd.do(input=data):
+    prediction = softmax(fc(fc(data, W1), W2))
     write_output(prediction)
 prediction = pd()
 loss = cross_entropy(prediction, label)
@@ -66,20 +67,20 @@ start_program
 main_program
 {
 block0 {
-  vars: data, places, w1, w2
+  vars: data, places, w1, w2, w1_grad, w2_grad,
   ops: data, get_place, parallel_do(block1),
        parallel_do_grad(block2),
        sgd(w2, w2_grad),
        sgd(w1, w1_grad)
 }
-block1 {
+block1 { # the forward pass
   parent_block: 0
   vars: data, h1, h2, loss
   ops: fc, fc, softmax
 }
-block2 {
+block2 { # the backward pass
   parent_block: 1
-  vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
+  vars: data_grad, h1_grad, h2_grad, loss_gard, local_w1_grad, local_w2_grad
   ops: softmax_grad,
        fc_grad
        fc_grad
diff --git a/doc/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
index 3e0bf7b3973079a2063d33b6be4fe8a9dc5c07bb..d8bf093e09b53b302225739fa67146adc7976e4b 100644
--- a/doc/dev/contribute_to_paddle_cn.md
+++ b/doc/dev/contribute_to_paddle_cn.md
@@ -222,6 +222,7 @@ upstream
 ## 提交代码的一些约定
 
 为了使评审人在评审代码时更好地专注于代码本身，请您每次提交代码时，遵守以下约定：
+
 1. 请保证Travis-CI 中单元测试能顺利通过。如果没过，说明提交的代码存在问题，评审人一般不做评审。
 2. 提交PUll Request前：
    - 请注意commit的数量：
@@ -231,6 +232,7 @@ upstream
 3. 如果解决了某个Issue的问题，请在该PUll Request的**第一个**评论框中加上：`fix #issue_number`，这样当该PUll Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
 
 此外，在回复评审人意见时，请您遵守以下约定：
+
 1. 评审人的每个意见都必须回复（这是开源社区的基本礼貌，别人帮了忙，应该说谢谢）：
    - 对评审意见同意且按其修改完的，给个简单的`Done`即可；
    - 对评审意见不同意的，请给出您自己的反驳理由。
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
index 5dd12d2233cff20e021b90beb94571a2817bd1ad..5fdc30a2d688a6a269b4972c2591af60893e7dfb 100644
--- a/doc/dev/index_en.rst
+++ b/doc/dev/index_en.rst
@@ -4,6 +4,5 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  new_layer_en.rst
   contribute_to_paddle_en.md
   write_docs_en.rst
diff --git a/doc/faq/build_and_install/index_en.rst b/doc/faq/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..614db457d715665073cec1a495d4d7df6887532f
--- /dev/null
+++ b/doc/faq/build_and_install/index_en.rst
@@ -0,0 +1,5 @@
+############################
+Install, Build and Unit test
+############################
+
+TBD
diff --git a/doc/faq/cluster/index_en.rst b/doc/faq/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..855b7e8e53307b82a72c156be4ef509e27edf822
--- /dev/null
+++ b/doc/faq/cluster/index_en.rst
@@ -0,0 +1,5 @@
+###############################
+Cluster Training and Prediction
+###############################
+
+TBD
diff --git a/doc/faq/index_en.rst b/doc/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..57df868f760038b25fae30df7ab20a68875ad36a
--- /dev/null
+++ b/doc/faq/index_en.rst
@@ -0,0 +1,12 @@
+FAQ
+====
+
+ 
+..  toctree::
+  :maxdepth: 1
+
+  build_and_install/index_en.rst
+  model/index_en.rst
+  parameter/index_en.rst
+  local/index_en.rst
+  cluster/index_en.rst
diff --git a/doc/faq/local/index_en.rst b/doc/faq/local/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4cb43031933a8bbe9aebae04bc3e9c74a6d21b95
--- /dev/null
+++ b/doc/faq/local/index_en.rst
@@ -0,0 +1,5 @@
+#############################
+Local Training and Prediction
+#############################
+
+TBD
diff --git a/doc/faq/model/index_en.rst b/doc/faq/model/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cb26f59655f97dc28a2047994643ae16b8857964
--- /dev/null
+++ b/doc/faq/model/index_en.rst
@@ -0,0 +1,5 @@
+###################
+Model Configuration
+###################
+
+TBD
diff --git a/doc/faq/parameter/index_en.rst b/doc/faq/parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..61c7845af7e531013a06125f7c35b59081dafb42
--- /dev/null
+++ b/doc/faq/parameter/index_en.rst
@@ -0,0 +1,5 @@
+#################
+Parameter Setting
+#################
+
+TBD
diff --git a/doc/getstarted/concepts/use_concepts_en.rst b/doc/getstarted/concepts/use_concepts_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..406b0cbb913894dc333d8e4561c207793c33e475
--- /dev/null
+++ b/doc/getstarted/concepts/use_concepts_en.rst
@@ -0,0 +1,3 @@
+Basic Concept
+=============
+TBD
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index c680e1903750117073bee64cb4d4f4ccfff5ac3d..33f299be5680e0aa4a3f36638f51135503193d94 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -5,3 +5,4 @@ GET STARTED
   :maxdepth: 1
 
   quickstart_en.rst
+  concepts/use_concepts_en.rst
diff --git a/doc/howto/capi/compile_paddle_lib_en.md b/doc/howto/capi/compile_paddle_lib_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..11d69b9b79c1a41898d3060d3fe25a31330334a3
--- /dev/null
+++ b/doc/howto/capi/compile_paddle_lib_en.md
@@ -0,0 +1,3 @@
+## Install and Build
+
+TBD
diff --git a/doc/howto/capi/index_en.rst b/doc/howto/capi/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2cbbe362fd8e06abe9866d998f60fbb3458a80b5
--- /dev/null
+++ b/doc/howto/capi/index_en.rst
@@ -0,0 +1,9 @@
+C-API Prediction Library
+========================
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_en.md
+  organization_of_the_inputs_en.md
+  workflow_of_capi_en.md
diff --git a/doc/howto/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
index a889ae4ffab7be02468b4a5ac5a18e3cc77803c9..343526c213110cb9c6abaf9a12b3d634ad3fabe9 100644
--- a/doc/howto/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/capi/organization_of_the_inputs_cn.md
@@ -4,12 +4,15 @@
 
 ### 输入/输出数据类型
 在C-API中，按照基本数据类型在PaddlePaddle内部的定义和实现，输入数据可分为：
+
 1. 一维整型数组
 1. 二维浮点型矩阵
+
     - 稠密矩阵
     - 稀疏矩阵
 
 说明：
+
 1. 一维数组**仅支持整型值**；
     - 常用于自然语言处理任务，例如：表示词语在词典中的序号；
     - 分类任务中类别标签；
@@ -274,6 +277,7 @@ PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织
 如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。
 
 于是，在组织神经网络输入时，需要思考完成以下工作：
+
 1. 为每一个输入/输出创建`argument`。
     - C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
 1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
diff --git a/doc/howto/capi/organization_of_the_inputs_en.md b/doc/howto/capi/organization_of_the_inputs_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..250d3b2f749aed018e63527e817899c843dff996
--- /dev/null
+++ b/doc/howto/capi/organization_of_the_inputs_en.md
@@ -0,0 +1,3 @@
+## Input/Output Data Organization
+
+TBD
diff --git a/doc/howto/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
index 1ccc72eefbc730b2eab2d51f5b04e50728b735d7..1968c1099ac5734cd68b437f2f7aa428d7b5265e 100644
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -11,6 +11,7 @@
 </p>
 
 - 准备预测模型
+
     1. 只将神经网络结构进行序列化。
         - 只对神经网络结构进行序列化，加载模型需同时指定：网络结构的序列化结果和模型参数存储目录。
     1. 将网络结构定义和训练结束存储下来的模型参数文件（多个）合并入一个文件。
@@ -18,6 +19,7 @@
         - 预测时只需加载一个文件便于发布。
     - **注意**：以上两种方式只需选择其一即可。
 - 调用 C-API 开发预测序
+
     1. 初始化PaddlePaddle运行环境。
     1. 加载预测模型。
     1. 创建神经网络输入，组织输入数据。
@@ -90,6 +92,7 @@
 1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口，与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
 
 - 注意事项
+
     1. 使用PaddlePaddle V2 API训练，模型中所有可学习参数会被存为一个压缩文件，需要手动进行解压，将它们放在同一目录中，C-API不会直接加载 V2 API 存储的压缩文件。
     1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件，请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
     1. 通过灵活使用以上两个接口，加载模型可其它多种方式，例如也可在程序运行过程中再加载另外一个模型。
@@ -106,6 +109,7 @@ C-API支持的所有输入数据类型和他们的组织方式，请参考“输
 这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出，使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
 
 在组织神经网络输入，获取输出时，需要思考完成以下工作：
+
 1. 为每一个输入/输出创建`argument`；
 1. 为每一个`argument`创建`paddle_matrix`来存储数据；
 
diff --git a/doc/howto/capi/workflow_of_capi_en.md b/doc/howto/capi/workflow_of_capi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..1692ecd56520675f02ad25ef73761330ebd0e740
--- /dev/null
+++ b/doc/howto/capi/workflow_of_capi_en.md
@@ -0,0 +1,3 @@
+## C-API Workflow
+
+TBD
diff --git a/doc/howto/cluster/cmd_argument_cn.md b/doc/howto/cluster/cmd_argument_cn.md
index 5c575dd5b53f6e4ea025a8fbaebdb2d1a1f1c9ed..40e1dde4858b802c2e703bcca4b71730facde5ef 100644
--- a/doc/howto/cluster/cmd_argument_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,14 +1,17 @@
-## 启动参数说明
+# 启动参数说明
 
 下面以`doc/howto/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-### 启动参数服务器
+## 启动参数服务器
+
 执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+
 ```bash
 $ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
 ```
 
 如果希望可以在后台运行pserver程序，并保存输出到一个日志文件，可以运行：
+
 ```bash
 $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
 ```
@@ -20,8 +23,10 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
 - ports_num_for_sparse：**必选，默认0**，用于稀疏类型参数通信的端口个数
 - num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
 
-### 启动计算节点
+## 启动计算节点
+
 执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
+
 ```bash
 $ python train.py
 ```
@@ -67,7 +72,7 @@ paddle.init(
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
 
 
-### 准备数据集
+## 准备数据集
 
 参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)，准备训练数据和验证数据集，我们使用paddle.dataset.imikolov数据集，并根据分布式训练并发数（trainer节点个数），在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
 
@@ -84,7 +89,8 @@ for f in flist:
 ```
 
 示例程序`prepare.py`会把训练集和测试集分别分割成多个文件（例子中为3个，后缀为`-00000`、`-00001`和`-00002`）:
-```
+
+```bash
 train.txt
 train.txt-00000
 train.txt-00001
@@ -99,12 +105,13 @@ test.txt-00002
 
 对于不同的训练任务，训练数据格式和训练程序的`reader()`会大不相同，所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
 
-### 准备训练程序
+## 准备训练程序
 
 我们会对每个训练任务都会在每个节点上创建一个工作空间（workspace），其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
 
 最后，工作空间应如下所示：
-```
+
+```bash
 .
 |-- my_lib.py
 |-- word_dict.pickle
@@ -133,3 +140,21 @@ test.txt-00002
 
 - `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
 - `test_data_dir`：包含测试数据集的目录。
+
+## 异步 SGD 更新
+
+我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。
+例如，设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数：
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: 是否为异步SGD更新模式。
+- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制，接收到足够的gradient(
+  `async_lagged_grad_discard_ratio * num_gradient_servers`)之后，后面的gradient
+  将会被抛弃。
diff --git a/doc/howto/cluster/cmd_argument_en.md b/doc/howto/cluster/cmd_argument_en.md
index 06fd5717564c99e3bb46835a2bd5071dff665f23..40179c28f83800c1c74a6045f8fac6841bdafeaa 100644
--- a/doc/howto/cluster/cmd_argument_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,18 +1,19 @@
-## Command-line arguments
+# Command-line arguments
 
 We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
 
-### Starting parameter server
+## Starting parameter server
 
 Type the below command to start a parameter server which will wait for trainers to connect:
 
 ```bash
-$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
 ```
 
 If you wish to run parameter servers in background, and save a log file, you can type:
+
 ```bash
-$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
 ```
 
 Parameter Description
@@ -21,8 +22,10 @@ Parameter Description
 - ports_num: **required, default 1**, total number of ports will listen on.
 - ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
 - num_gradient_servers: **required, default 1**, total number of gradient servers.
+- nics: **optional, default xgbe0,xgbe1**, network device name which paramter server will listen on.
+
+## Starting trainer
 
-### Starting trainer
 Type the command below to start the trainer(name the file whatever you want, like "train.py")
 
 ```bash
@@ -70,7 +73,7 @@ Parameter Description
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
 
-### Prepare Training Dataset
+## Prepare Training Dataset
 
 Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.
 
@@ -88,7 +91,7 @@ for f in flist:
 
 Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`:
 
-```
+```bash
 train.txt
 train.txt-00000
 train.txt-00001
@@ -103,13 +106,13 @@ When job started, every trainer needs to get it's own part of data. In some dist
 
 Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.
 
-### Prepare Training program
+## Prepare Training program
 
 We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
 
-
 Your workspace may looks like:
-```
+
+```bash
 .
 |-- my_lib.py
 |-- word_dict.pickle
@@ -138,3 +141,21 @@ Your workspace may looks like:
 
 - `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
 - `test_data_dir`: containing testing data.
+
+## Async SGD Update
+
+We can set some parameters of the optimizer to make it support async SGD update.
+For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+    is_async=True,
+    async_lagged_grad_discard_ratio=1.6,
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: Is Async-SGD or not.
+- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control.
+  when `async_lagged_grad_discard_ratio * num_gradient_servers` commit passed,
+  current async gradient will be discard silently.
diff --git a/doc/howto/cluster/multi_cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
index bf270d89ab8514801ca4629cf412f73257429df9..bac9ffe1526a06a3a23b1d8acf33a5fb74b7e50d 100644
--- a/doc/howto/cluster/multi_cluster/fabric_en.md
+++ b/doc/howto/cluster/multi_cluster/fabric_en.md
@@ -1,4 +1,4 @@
-# Cluster Training Using Fabric
+# Fabric
 
 ## Prepare a Linux cluster
 
diff --git a/doc/howto/cluster/multi_cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
index 0dfa8237a3fa2c9c3ee11e873c9fbbed3cd6018f..8e8e87be711bd45177ed77c81c531606e801d1f0 100644
--- a/doc/howto/cluster/multi_cluster/k8s_aws_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
@@ -1,5 +1,4 @@
-
-# Distributed PaddlePaddle Training on AWS with Kubernetes
+# Kubernetes on AWS
 
 We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
 
diff --git a/doc/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc3d50b3ffd3b703a3a656caa1f96bdcf683f68b
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -0,0 +1,3 @@
+# Kubernetes Distributed
+
+TBD
diff --git a/doc/howto/cluster/multi_cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
index c374f00a495d705ceddf8d3d930768ceeb93282b..96ff652705726fc56fa0078593cd2a695fcdb5e2 100644
--- a/doc/howto/cluster/multi_cluster/k8s_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_en.md
@@ -1,4 +1,4 @@
-# PaddlePaddle On Kubernetes
+# Kubernetes
 
 In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.
 
diff --git a/doc/howto/cluster/multi_cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
index 831cafdc03c6a908f31769d0467de022df42dab5..954b2215cc3136ec5b3e1cdc2f6d3f508f814516 100644
--- a/doc/howto/cluster/multi_cluster/openmpi_cn.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_cn.md
@@ -1,4 +1,4 @@
-# 在OpenMPI集群中提交训练作业
+# 在OpenMPI集群中启动训练
 
 ## 准备OpenMPI集群
 
diff --git a/doc/howto/cluster/multi_cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
index 09af46e25ebe1f843dc7c7be0997dc706413b65c..a5c02b336b8a974f546499acae32edac24219be9 100644
--- a/doc/howto/cluster/multi_cluster/openmpi_en.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_en.md
@@ -1,4 +1,4 @@
-# Cluster Training Using OpenMPI
+# OpenMPI
 
 ## Prepare an OpenMPI cluster
 
diff --git a/doc/howto/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
index 17b379f6295d66d864e2b53108012eff5895d96b..6900bb1443e611d326e8d5640e794ac2b9079beb 100644
--- a/doc/howto/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -2,10 +2,25 @@
 
 命令行参数设置
 ===============
+深度学习算法的实现有着多样化的特点，运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数，以实现对模型训练或预测流程的控制。
+
+在这一部分，首先以几个实际场景为例，展示了部分命令行参数的使用:
 
 ..  toctree::
   :maxdepth: 1
 
   use_case_cn.md
+
+接着对所有参数的使用场合进行概述和分类:
+
+..  toctree::
+  :maxdepth: 1
+
   arguments_cn.md
+
+最后给出细节描述，详细解释这些参数的属性和意义:
+
+..  toctree::
+  :maxdepth: 1
+
   detail_introduction_cn.md
diff --git a/doc/howto/rnn/hierarchical_layer_en.rst b/doc/howto/rnn/hierarchical_layer_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..236f58a160c7f77c28e4b1216b83b3d3cdaaa459
--- /dev/null
+++ b/doc/howto/rnn/hierarchical_layer_en.rst
@@ -0,0 +1,4 @@
+Layers supporting hierarchical sequence as input
+================================================
+
+TBD
diff --git a/doc/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e5aa05c117393e81c557ba67609f787b38587efd
--- /dev/null
+++ b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -0,0 +1,4 @@
+API comparision between RNN and hierarchical RNN
+================================================
+
+TBD
diff --git a/doc/howto/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
index 7adc79873d699fdfd5a85034bcef964dd1f19132..e1b20ef2e7bf4c521b613e54577ff6a3feaa8936 100644
--- a/doc/howto/rnn/index_en.rst
+++ b/doc/howto/rnn/index_en.rst
@@ -5,3 +5,6 @@ RNN Models
   :maxdepth: 1
 
   rnn_config_en.rst
+  recurrent_group_en.md
+  hierarchical_layer_en.rst
+  hrnn_rnn_api_compare_en.rst
diff --git a/doc/howto/rnn/recurrent_group_en.md b/doc/howto/rnn/recurrent_group_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d264b0a9f85faffd49c1982117cb5a3ac6ffc015
--- /dev/null
+++ b/doc/howto/rnn/recurrent_group_en.md
@@ -0,0 +1,3 @@
+# Recurrent Group Tutorial
+
+TBD
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 166f56c28f464563a0b36007f58cebb58c286916..909f035cca3db2a02fd38462acc451375eceff40 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
   build_and_install/index_en.rst
   howto/index_en.rst
   dev/index_en.rst
+  faq/index_en.rst
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 0b4c6db6f98d8d73b362d3c98f52a3914a031c68..82c7d4a2ec648449fc65ca2ae0de397b2f6fa120 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -96,3 +96,6 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
 cc_test(channel_test SRCS channel_test.cc)
+cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+        channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index be578059388e627778d485365e5a735158603fe5..bda1bfb23b18f8c6b9f1c3eded461a9322a154af 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -91,6 +91,11 @@ class ChannelHolder {
 
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
+  inline const std::type_index Type() {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    return holder_->Type();
+  }
+
  private:
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a template
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 2c4e622bd789c691c38a8810fe5e09e464a8cf1f..695169fcb9e93b5e69d3d4ae6f63f8e4c2d1605f 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -542,3 +542,341 @@ TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
   ChannelHolderSendReceive(ch);
   delete ch;
 }
+
+TEST(ChannelHolder, ChannelUninitializedTest) {
+  ChannelHolder *ch = new ChannelHolder();
+  EXPECT_EQ(ch->IsInitialized(), false);
+  int i = 10;
+  EXPECT_EQ(ch->Send(&i), false);
+  EXPECT_EQ(ch->Receive(&i), false);
+  bool is_exception = false;
+  try {
+    ch->Type();
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+TEST(ChannelHolder, ChannelInitializedTest) {
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(2);
+  EXPECT_EQ(ch->IsInitialized(), true);
+  // Channel should remain intialized even after close
+  ch->close();
+  EXPECT_EQ(ch->IsInitialized(), true);
+  delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchSendTest) {
+  // Test with unbuffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  bool is_exception = false;
+  bool boolean_data = true;
+  try {
+    ch->Send(&boolean_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+
+  // Test with Buffered Channel
+  ch = new ChannelHolder();
+  ch->Reset<float>(10);
+  is_exception = false;
+  int int_data = 23;
+  try {
+    ch->Send(&int_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchReceiveTest) {
+  // Test with unbuffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  bool is_exception = false;
+  bool float_data;
+  try {
+    ch->Receive(&float_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+
+  // Test with Buffered Channel
+  ch = new ChannelHolder();
+  ch->Reset<float>(10);
+  is_exception = false;
+  int int_data = 23;
+  try {
+    ch->Receive(&int_data);
+  } catch (paddle::platform::EnforceNotMet e) {
+    is_exception = true;
+  }
+  EXPECT_EQ(is_exception, true);
+  delete ch;
+}
+
+void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  ch->close();
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  if (isBuffered) {
+    // If ch is Buffered, atleast 4 threads must be blocked.
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (!thread_ended[i]) ct++;
+    }
+    EXPECT_GE(ct, 4);
+  } else {
+    // If ch is UnBuffered, all the threads should be blocked.
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly close the thread
+  // This should unblock all senders
+  ch->close();
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  if (isBuffered) {
+    // Verify that only 1 send was successful
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (send_success[i]) ct++;
+    }
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a channelholder unblocks
+//  any receivers waiting on the channel
+TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
+  // Check for buffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderCloseUnblocksReceiversTest(ch);
+  delete ch;
+
+  // Check for unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderCloseUnblocksReceiversTest(ch);
+  delete ch;
+}
+
+// This tests that closing a channelholder unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
+  // Check for buffered channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderCloseUnblocksSendersTest(ch, true);
+  delete ch;
+
+  // Check for unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderCloseUnblocksSendersTest(ch, false);
+  delete ch;
+}
+
+// This tests that destroying a channelholder unblocks
+//  any senders waiting for channel
+void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  if (isBuffered) {
+    // If channel is buffered, verify that atleast 4 threads are blocked
+    int ct = 0;
+    for (size_t i = 0; i < num_threads; i++) {
+      if (thread_ended[i] == false) ct++;
+    }
+    // Atleast 4 threads must be blocked
+    EXPECT_GE(ct, 4);
+  } else {
+    // Verify that all the threads are blocked
+    for (size_t i = 0; i < num_threads; i++) {
+      EXPECT_EQ(thread_ended[i], false);
+    }
+  }
+  // Explicitly destroy the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Count number of successfuld sends
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+
+  if (isBuffered) {
+    // Only 1 send must be successful
+    EXPECT_EQ(ct, 1);
+  } else {
+    // In unbuffered channel, no send should be successful
+    EXPECT_EQ(ct, 0);
+  }
+
+  // Join all threads
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channelholder also unblocks
+//  any receivers waiting on the channel
+void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+  // delete the channel
+  delete ch;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderDestroyUnblockReceivers(ch);
+  // ch is already deleted already deleted in
+  // ChannelHolderDestroyUnblockReceivers
+
+  // Check for Unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderDestroyUnblockReceivers(ch);
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(1);
+  ChannelHolderDestroyUnblockSenders(ch, true);
+  // ch is already deleted already deleted in
+  // ChannelHolderDestroyUnblockReceivers
+
+  // Check for Unbuffered channel
+  ch = new ChannelHolder();
+  ch->Reset<int>(0);
+  ChannelHolderDestroyUnblockSenders(ch, false);
+}
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5770b0a5a18659e615e80a7c48113d8b543b69ec
--- /dev/null
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -0,0 +1,122 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(go);
+USE_NO_KERNEL_OP(channel_close);
+USE_NO_KERNEL_OP(channel_create);
+USE_NO_KERNEL_OP(channel_recv);
+USE_NO_KERNEL_OP(channel_send);
+USE_NO_KERNEL_OP(elementwise_add);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
+                       T value) {
+  // Create LoDTensor<int> of dim [1,1]
+  auto var = scope.Var(name);
+  auto tensor = var->GetMutable<LoDTensor>();
+  tensor->Resize({1, 1});
+  T *expect = tensor->mutable_data<T>(place);
+  expect[0] = value;
+}
+
+void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+
+  // Create channel variable
+  scope.Var("Channel");
+
+  // Create Variables, x0 will be put into channel,
+  // result will be pulled from channel
+  CreateIntVariable(scope, place, "Status", false);
+  CreateIntVariable(scope, place, "x0", 99);
+  CreateIntVariable(scope, place, "result", 0);
+}
+
+void AddOp(const std::string &type, const VariableNameMap &inputs,
+           const VariableNameMap &outputs, AttributeMap attrs,
+           BlockDesc *block) {
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+TEST(Concurrency, Go_Op) {
+  Scope scope;
+  p::CPUPlace place;
+
+  // Initialize scope variables
+  InitTensorsInScope(scope, place);
+
+  framework::Executor executor(place);
+  ProgramDesc program;
+  BlockDesc *block = program.MutableBlock(0);
+
+  // Create channel OP
+  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
+        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
+        block);
+
+  // Create Go Op routine
+  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
+  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
+        {{"Status", {"Status"}}}, {}, goOpBlock);
+
+  // Create Go Op
+  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
+        block);
+
+  // Create Channel Receive Op
+  AddOp("channel_recv", {{"Channel", {"Channel"}}},
+        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
+
+  // Create Channel Close Op
+  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
+
+  // Check the result tensor to make sure it is set to 0
+  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
+  auto *initialData = tensor.data<int>();
+  EXPECT_EQ(initialData[0], 0);
+
+  executor.Run(program, &scope, 0, true, true);
+
+  // After we call executor.run, the Go operator should do a channel_send to set
+  // the
+  // "result" variable to 99
+  auto *finalData = tensor.data<int>();
+  EXPECT_EQ(finalData[0], 99);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
index 97afd366387e9ba6476be59a4d73d53a38834d0e..05e423b8a52962d47a6615d48243444374b470e3 100644
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -26,12 +26,15 @@ Dim<i> make_dim(const int64_t* d) {
 }
 
 template <>
-Dim<1> make_dim<1>(const int64_t* d) {
-  return Dim<1>(*d);
+Dim<0> make_dim<0>(const int64_t* d) {
+  return Dim<0>(*d);
 }
 
 void make_ddim(DDim& ddim, const int64_t* dims, int n) {
   switch (n) {
+    case 0:
+      ddim = make_dim<0>(dims);
+      break;
     case 1:
       ddim = make_dim<1>(dims);
       break;
@@ -190,7 +193,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
     this->operator()(t.tail);
   }
 
-  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
+  void operator()(const Dim<0>& t) {}
 };
 /// @endcond
 
@@ -247,9 +250,8 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
     }
   }
 
-  void operator()(const Dim<1>& dim) {
-    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
-    vector.push_back(dim.head);
+  void operator()(const Dim<0>& dim) {
+    PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
   }
 };
 
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index 5aff10d3b95902fdb9fe432d9f31830304dd3d07..f05b5ee3faee856a41f1376e5952710b550e7c42 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -30,8 +30,8 @@ namespace framework {
  * The number of dimensions must be between [1, 9].
  */
 struct DDim {
-  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                         Dim<8>, Dim<9>>
+  typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
+                         Dim<7>, Dim<8>, Dim<9>>
       DDimVar;
   DDimVar var;
 
diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h
index 08b708006aadc4769bde7b37347ac1adfeca2bf7..58b75ba4b5a1973d97aedcea0f76681b625afa65 100644
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -72,38 +72,36 @@ struct Dim {
 
 // Base case specialization
 template <>
-struct Dim<1> {
-  static constexpr int dimensions = 1;
+struct Dim<0> {
+  static constexpr int dimensions = 0;
 
   HOSTDEVICE
-  Dim(int64_t _head) : head(_head) {}
+  Dim(int64_t _head) {}
 
   HOSTDEVICE
-  Dim() : head(0) {}
+  Dim() {}
 
   HOSTDEVICE
-  Dim(int idx, const Dim<1>& size) : head(idx) {
+  Dim(int idx, const Dim<0>& size) {
 #ifndef __CUDA_ARCH__
-    if (idx >= size.head) {
+    if (idx > 0) {
       throw std::invalid_argument("Index out of range.");
     }
 #else
-    PADDLE_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx == 0);
 #endif
   }
 
   HOSTDEVICE
-  bool operator==(const Dim<1>& o) const { return (head == o.head); }
+  bool operator==(const Dim<0>& o) const { return true; }
 
   HOSTDEVICE
-  bool operator!=(const Dim<1>& o) const { return !(*this == o); }
+  bool operator!=(const Dim<0>& o) const { return false; }
 
   HOSTDEVICE
   int64_t& operator[](int idx);
   HOSTDEVICE
   int64_t operator[](int idx) const;
-
-  int64_t head;
 };
 
 namespace {
@@ -154,15 +152,20 @@ HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx != 0) {
-    throw std::invalid_argument("Invalid index");
-  }
+  throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(idx == 0);
+  PADDLE_ASSERT(false);
 #endif
-  return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
+  static int64_t head = 0;
+#endif
+  return head;
 }
 
 template <int D>
@@ -181,15 +184,20 @@ HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
 #ifndef __CUDA_ARCH__
-  if (idx != 0) {
-    throw std::invalid_argument("Invalid index");
-  }
+  throw std::invalid_argument("Invalid index");
 #else
-  PADDLE_ASSERT(idx == 0);
+  PADDLE_ASSERT(false);
 #endif
-  return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+  // On CUDA versions previous to 8.0, only __shared__ variables
+  // could be declared as static in the device code.
+  int64_t head = 0;
+#else
+  static int64_t head = 0;
+#endif
+  return head;
 }
 
 }  // namespace
@@ -218,12 +226,12 @@ HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
 }
 
 // Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
+inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
   return indexer(*this, i);
 }
 
@@ -251,8 +259,8 @@ HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
 // Base case dot product of two Dims
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
-  return a.head * b.head;
+HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
+  return 0;
 }
 
 // Product of a Dim
@@ -264,8 +272,8 @@ HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
 // Base case product of a Dim
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
-  return prod * a.head;
+HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
+  return prod;
 }
 
 // Is 0 <= idx_i < size_i for all i?
@@ -278,8 +286,8 @@ HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
 // Base case of is 0 <= idx_i < size_i ?
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
-  return ((0 <= idx.head) && (idx.head < size.head));
+HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
+  return true;
 }
 
 /**
@@ -294,8 +302,8 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
 // Base case of ex_prefix_mul
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
-  return Dim<1>(mul);
+HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
+  return Dim<0>();
 }
 ///\endcond
 
@@ -309,8 +317,8 @@ HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {
 
 // Base case
 template <>
-HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) {
-  return Dim<1>(a.head + b.head);
+HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
+  return Dim<0>();
 }
 
 template <int i>
@@ -328,8 +336,8 @@ HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {
 
 // Base case
 template <>
-HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) {
-  return Dim<1>(a.head * b.head);
+HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
+  return Dim<0>();
 }
 
 template <int i>
@@ -356,10 +364,9 @@ HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
 ///\cond HIDDEN
 
 template <>
-HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
-                                           const Dim<1>& stride) {
-  int norm_stride = size.head == 1 ? 0 : stride.head;
-  return Dim<1>(norm_stride);
+HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
+                                           const Dim<0>& stride) {
+  return Dim<0>();
 }
 
 ///\endcond
@@ -394,6 +401,10 @@ typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
   return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
+  return os;
+}
+
 template <int i>
 HOST std::string Dim<i>::to_string() const {
   std::stringstream stream;
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 38f22b89143c3e23c8368b9281ccc757a892a373..96f53dc1bc8747e1b8ea84166614f98ff363ae5e 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -117,6 +117,7 @@ message VarType {
     // raw variables should manage their own allocations
     // in operators like nccl_op
     RAW = 17;
+    TUPLE = 18;
   }
 
   required Type type = 1;
@@ -148,6 +149,9 @@ message VarType {
     required int64 capacity = 2;
   }
   optional ChannelDesc channel = 6;
+
+  message Tuple { repeated Type element_type = 1; }
+  optional Tuple tuple = 7;
 }
 
 message VarDesc {
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
new file mode 100644
index 0000000000000000000000000000000000000000..78996908b18a5a0935d8de9920e8ccef9069e74b
--- /dev/null
+++ b/paddle/fluid/framework/tuple.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+typedef boost::variant<int, int64_t, float, double, std::string, Tensor,
+                       LoDTensor /*, ChannelHolder*/>
+    ElementVar;
+
+class Tuple {
+ public:
+  using ElementVars = std::vector<ElementVar>;
+
+  Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
+      : var_(var), var_desc_(var_desc) {}
+  Tuple(std::vector<ElementVar>& var) : var_(var) {}
+
+  ElementVar get(int idx) const { return var_[idx]; };
+
+  ElementVar& get(int idx) { return var_[idx]; };
+
+  bool isSameType(Tuple& t) const;
+
+  size_t getSize() const { return var_.size(); };
+
+ private:
+  ElementVars var_;
+  std::vector<VarDesc> var_desc_;
+};
+
+bool Tuple::isSameType(Tuple& t) const {
+  size_t tuple_size = getSize();
+  if (tuple_size != t.getSize()) {
+    return false;
+  }
+  for (size_t j = 0; j < tuple_size; ++j) {
+    auto type1 = get(j).which();
+    auto type2 = t.get(j).which();
+    if (type1 != type2) return false;
+  }
+  return true;
+}
+
+Tuple* make_tuple(std::vector<ElementVar> tuple) { return new Tuple(tuple); }
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..810900f161ccc08234e28b982bdd962e4cded9ae
--- /dev/null
+++ b/paddle/fluid/framework/tuple_test.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <sstream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tuple.h"
+
+TEST(Tuple, Make) {
+  std::vector<paddle::framework::ElementVar> element_type;
+  element_type.push_back(12);
+  element_type.push_back(12.0f);
+  element_type.push_back("ElementVar");
+
+  paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
+
+  EXPECT_EQ(boost::get<int>(tuple->get(0)), 12);
+  EXPECT_EQ(boost::get<float>(tuple->get(1)), 12.0f);
+  EXPECT_EQ(boost::get<std::string>(tuple->get(2)), "ElementVar");
+
+  delete tuple;
+}
+
+TEST(Tuple, IsTheSameType) {
+  std::vector<paddle::framework::ElementVar> element_type1;
+  std::vector<paddle::framework::ElementVar> element_type2;
+  std::vector<paddle::framework::ElementVar> element_type3;
+
+  element_type1.push_back(12);
+  element_type1.push_back(12.0f);
+  element_type1.push_back("Tuple1");
+
+  element_type2.push_back(13);
+  element_type2.push_back(13.0f);
+  element_type2.push_back("Tuple2");
+
+  element_type3.push_back(14.0f);
+  element_type3.push_back(14);
+  element_type3.push_back("Tuple3");
+
+  paddle::framework::Tuple* tuple1 =
+      paddle::framework::make_tuple(element_type1);
+  paddle::framework::Tuple* tuple2 =
+      paddle::framework::make_tuple(element_type2);
+  paddle::framework::Tuple* tuple3 =
+      paddle::framework::make_tuple(element_type3);
+
+  EXPECT_TRUE(tuple1->isSameType(*tuple2));
+  EXPECT_FALSE(tuple1->isSameType(*tuple3));
+
+  delete tuple1;
+  delete tuple2;
+  delete tuple3;
+}
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4da46e94c5cd979507fed80b35ebedf0cc6791d0..e1c02ec1613a7c31258f2d305b040ea627cebcdd 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -130,7 +130,7 @@ endif()
 
 if(WITH_DISTRIBUTE)
     add_subdirectory(detail)
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
     op_library(send_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc
index 2b3f26c0a890c33f9b4f4c8a5a271123d7ff0b31..1218d9fdc1e6101d17bc09a4ae769f5fbf8e7b15 100644
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ColToRowMatchIndices", dims);
     ctx->SetOutputDim("ColToRowMatchDist", dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
+        platform::CPUPlace());
+  }
 };
 
 template <typename T>
diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc
index 1fc201286f335f9b3e1ef87c85c0b02fc0b29e4b..eccdd408a17a07a541480705242b137f8207c139 100644
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -37,12 +37,19 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                       "The rank of Input of PriorBoxVar must be 2");
     PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
     PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
-    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                      "The rank of Input of TargetBox must be 2");
-    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                      "The shape of TargetBox is [M, 4]");
 
-    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                        "The rank of Input of TargetBox must be 2");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                        "The shape of TargetBox is [M, 4]");
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+                        "The rank of Input of TargetBox must be 3");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+    }
 
     ctx->SetOutputDim(
         "OutputBox",
@@ -70,25 +77,28 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
              "of variance.");
     AddInput(
         "TargetBox",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
-        "[xmin, ymin] is the left top coordinate of the box if the input "
-        "is image feature map, they are close to the origin of the coordinate "
-        "system. [xmax, ymax] is the right bottom coordinate of the box. "
-        "This tensor can contain LoD information to represent a batch "
-        "of inputs. One instance of this batch can contain different "
-        "numbers of entities.");
+        "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
+        "[N, 4] when code_type is 'encode_center_size'. This input also can "
+        "be a 3-D Tensor with shape [N, M, 4] when code_type is "
+        "'decode_center_size'. [N, 4], each box is represented as "
+        "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
+        "of the box if the input is image feature map, they are close to "
+        "the origin of the coordinate system. [xmax, ymax] is the right "
+        "bottom coordinate of the box. This tensor can contain LoD "
+        "information to represent a batch of inputs. One instance of this "
+        "batch can contain different numbers of entities.");
     AddAttr<std::string>("code_type",
                          "(string, default encode_center_size) "
                          "the code type used with the target box")
         .SetDefault("encode_center_size")
         .InEnum({"encode_center_size", "decode_center_size"});
-    AddOutput(
-        "OutputBox",
-        "(LoDTensor or Tensor) "
-        "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
-        "representing the result of N target boxes encoded/decoded with "
-        "M Prior boxes and variances.");
+    AddOutput("OutputBox",
+              "(LoDTensor or Tensor) "
+              "When code_type is 'encode_center_size', the output tensor of "
+              "box_coder_op with shape [N, M, 4] representing the result of N "
+              "target boxes encoded with M Prior boxes and variances. When "
+              "code_type is 'decode_center_size', N represents the batch size "
+              "and M represents the number of deocded boxes.");
 
     AddComment(R"DOC(
 Bounding Box Coder Operator.
diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu
index 7ab242edfa3bae0c394d2ca8c0b32e2c5761fa70..0944e9c95d4a66cc4a51751a8c70cd7a3fefaf1a 100644
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
@@ -66,7 +66,6 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                        T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
-    const int row_idx = idx / col;
     const int col_idx = idx % col;
     T prior_box_width =
         prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
@@ -79,17 +78,16 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                            2;
 
     T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
-                             target_box_data[row_idx * len + 2]) *
+                             target_box_data[idx * len + 2]) *
                          prior_box_width;
     T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
-                              target_box_data[row_idx * len + 3]) *
+                              target_box_data[idx * len + 3]) *
                           prior_box_height;
     T target_box_center_x = prior_box_var_data[col_idx * len] *
-                                target_box_data[row_idx * len] *
-                                prior_box_width +
+                                target_box_data[idx * len] * prior_box_width +
                             prior_box_center_x;
     T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
-                                target_box_data[row_idx * len + 1] *
+                                target_box_data[idx * len + 1] *
                                 prior_box_height +
                             prior_box_center_y;
 
diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h
index 5e105aff52673af54970b5d7f05dc1bc39bbc901..3c7cac1cd17042994287effc31a918ebd4353c4c 100644
--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
@@ -89,6 +89,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
+        size_t offset = i * col * len + j * len;
         T prior_box_width =
             prior_box_data[j * len + 2] - prior_box_data[j * len];
         T prior_box_height =
@@ -99,20 +100,19 @@ class BoxCoderKernel : public framework::OpKernel<T> {
             (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
 
         T target_box_center_x = prior_box_var_data[j * len] *
-                                    target_box_data[i * len] * prior_box_width +
+                                    target_box_data[offset] * prior_box_width +
                                 prior_box_center_x;
         T target_box_center_y = prior_box_var_data[j * len + 1] *
-                                    target_box_data[i * len + 1] *
+                                    target_box_data[offset + 1] *
                                     prior_box_height +
                                 prior_box_center_y;
         T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                      target_box_data[i * len + 2]) *
+                                      target_box_data[offset + 2]) *
                              prior_box_width;
         T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                       target_box_data[i * len + 3]) *
+                                       target_box_data[offset + 3]) *
                               prior_box_height;
 
-        size_t offset = i * col * len + j * len;
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
         output[offset + 2] = target_box_center_x + target_box_width / 2;
diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5892650c49e2e9d7345fb94465d124cff57f0a6f
--- /dev/null
+++ b/paddle/fluid/operators/channel_close_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace pf = paddle::framework;
+static constexpr char kChannel[] = "Channel";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCloseOp : public framework::OperatorBase {
+ public:
+  ChannelCloseOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto &inp = *scope.FindVar(Input(kChannel));
+
+    // Get the mutable version of the channel variable and closes it.
+    pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
+    ch->close();
+  }
+};
+
+class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("Channel"),
+                   "The input of ChannelClose op must be set");
+  }
+};
+
+class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kChannel,
+             "The Channel Variable that should be closed by"
+             " the ChannelClose Op.");
+    AddComment(R"DOC(
+Channel Close Operator.
+
+This operator closes an open channel.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelCloseOpMaker);
diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2fdfd0e1f24ed071bb57b7de8f99b2d5e1d3196
--- /dev/null
+++ b/paddle/fluid/operators/channel_create_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace pf = paddle::framework;
+
+static constexpr char kOutput[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCreateOp : public framework::OperatorBase {
+ public:
+  ChannelCreateOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto &out = *scope.FindVar(Output(kOutput));
+
+    // Determine the datatype and capacity of the channel to be created
+    // from the attributes provided.
+    auto dtype =
+        static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
+    auto capacity = Attr<int>("capacity");
+
+    // Based on the datatype, create a new channel holder initialized with
+    // the given capacity. When capacity is 0, an unbuffered channel is
+    // created.
+    pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
+    if (dtype == framework::proto::VarType::LOD_TENSOR) {
+      ch->Reset<pf::LoDTensor>(capacity);
+    } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
+      ch->Reset<pf::SelectedRows>(capacity);
+    } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
+      ch->Reset<pf::LoDRankTable>(capacity);
+    } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+      ch->Reset<pf::LoDTensorArray>(capacity);
+    } else if (dtype == framework::proto::VarType::READER) {
+      ch->Reset<pf::ReaderHolder>(capacity);
+    } else if (dtype == framework::proto::VarType::CHANNEL) {
+      ch->Reset<pf::ChannelHolder>(capacity);
+    } else if (dtype == framework::proto::VarType::BOOL) {
+      ch->Reset<bool>(capacity);
+    } else if (dtype == framework::proto::VarType::INT32) {
+      ch->Reset<int>(capacity);
+    } else if (dtype == framework::proto::VarType::INT64) {
+      ch->Reset<int64_t>(capacity);
+    } else if (dtype == framework::proto::VarType::FP32) {
+      ch->Reset<float>(capacity);
+    } else if (dtype == framework::proto::VarType::FP64) {
+      ch->Reset<double>(capacity);
+    } else {
+      PADDLE_THROW(
+          "Data type %d is not in "
+          "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
+          "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
+          dtype);
+    }
+  }
+};
+
+class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasOutput(kOutput),
+                   "The output of ChannelCreate op must be set");
+    context->SetOutputDim(kOutput, {1});
+  }
+};
+
+class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput(kOutput,
+              "The object of a Channel type created by ChannelCreate Op.");
+    AddAttr<int>("capacity", "The size of the buffer of Channel.")
+        .SetDefault(0);
+    AddAttr<int>("data_type", "The data type of elements inside the Channel.");
+    AddComment(R"DOC(
+Channel Create Operator.
+
+This operator creates an object of the VarType Channel and returns it.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelCreateOpMaker);
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c12b88e7a91c4ea7044223464a2f902db494d1a8
--- /dev/null
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include <paddle/fluid/framework/lod_rank_table.h>
+#include <paddle/fluid/framework/lod_tensor_array.h>
+#include <paddle/fluid/framework/reader.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char Status[] = "Status";
+static constexpr char Out[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+void SetReceiveStatus(const platform::Place &dev_place,
+                      framework::Variable &status_var, bool status) {
+  auto cpu = platform::CPUPlace();
+  auto status_tensor =
+      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                        cpu);
+  status_tensor[0] = status;
+}
+
+bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
+  // Get type of channel and use that to call mutable data for Variable
+  auto type = framework::ToVarType(ch->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    return ch->Receive(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    return ch->Receive(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelReceive:Unsupported type");
+}
+
+class ChannelRecvOp : public framework::OperatorBase {
+ public:
+  ChannelRecvOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const {
+    PADDLE_ENFORCE(ctx->HasInput(Channel),
+                   "Input(Channel) of ChannelRecvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Out),
+                   "Input(Channel) of ChannelRecvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Status),
+                   "Output(Status) of ChannelRecvOp should not be null.");
+    ctx->SetOutputDim("Status", {1});
+  }
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    // Get the channel holder created by channel_create op, passed as input.
+    framework::ChannelHolder *ch =
+        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
+    auto output_var = scope.FindVar(Output(Out));
+    // Receive the data from the channel.
+    bool ok = ChannelReceive(ch, output_var);
+
+    // Set the status output of the `ChannelReceive` call.
+    SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+  }
+};
+
+class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(Channel,
+             "(Channel) A variable which \"receives\" the a value sent"
+             "to it by a channel_send op.")
+        .AsDuplicable();
+    AddOutput(Out,
+              "(Variable) Output Variable that will hold the data received"
+              " from the Channel")
+        .AsDuplicable();
+    AddOutput(Status,
+              "(Tensor) An LoD Tensor that returns a boolean status of the"
+              "result of the receive operation.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelRecvOpMaker);
diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d7715ad229e821f02437246e3326063cb1ee757
--- /dev/null
+++ b/paddle/fluid/operators/channel_send_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include <paddle/fluid/framework/lod_rank_table.h>
+#include <paddle/fluid/framework/lod_tensor_array.h>
+#include <paddle/fluid/framework/reader.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char X[] = "X";
+static constexpr char Status[] = "Status";
+static constexpr char copy[] = "copy";
+
+namespace paddle {
+namespace operators {
+
+void SetSendStatus(const platform::Place &dev_place,
+                   framework::Variable &status_var, bool status) {
+  auto cpu = platform::CPUPlace();
+  auto status_tensor =
+      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                        cpu);
+  status_tensor[0] = status;
+}
+
+bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
+  auto type = framework::ToVarType(var->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    return ch->Send(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    return ch->Send(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    return ch->Send(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    return ch->Send(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    return ch->Send(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    return ch->Send(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelSend:Unsupported type");
+}
+
+class ChannelSendOp : public framework::OperatorBase {
+ public:
+  ChannelSendOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const {
+    PADDLE_ENFORCE(ctx->HasInput(Channel),
+                   "Input(Channel) of ChannelSendOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(X),
+                   "Input(X) of ChannelSendOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(Status),
+                   "Output(Status) of ChannelSendOp should not be null.");
+    ctx->SetOutputDim("Status", {1});
+  }
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    // Get the channel holder created by channel_create op, passed as input.
+    framework::ChannelHolder *ch =
+        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
+    auto input_var = scope.FindVar(Input(X));
+
+    // Send the input data through the channel.
+    bool ok = ChannelSend(ch, input_var);
+
+    // Set the status output of the `ChannelSend` call.
+    SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+  }
+};
+
+class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(Channel,
+             "(Channel) A variable which \"sends\" the passed in value to "
+             "a listening receiver.")
+        .AsDuplicable();
+    AddInput(X, "(Variable) The value which gets sent by the channel.")
+        .AsDuplicable();
+    AddOutput(Status,
+              "(Tensor) An LoD Tensor that returns a boolean status of the"
+              "result of the send operation.")
+        .AsDuplicable();
+    AddAttr<bool>(copy, "(bool, default false) Should copy before send")
+        .SetDefault(false);
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::ChannelSendOpMaker);
diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h
index bac5cdc99c0133b1e6da3f6a23bc0512ca4177f5..0b7c470fe72eb4270b8d5b2d227642d85683c16d 100644
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -24,6 +24,29 @@ namespace detail {
 template <typename T, int Rank>
 struct StridedMemcpyFunctor;
 
+template <typename T>
+struct StridedMemcpyFunctor<T, 0> {
+  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
+                  framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
+                  framework::Dim<0> dst_stride, T* dst) const {
+    auto place = dev_ctx.GetPlace();
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
+      auto& cuda_ctx =
+          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
+                   cuda_ctx.stream());
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+    }
+  }
+};
+
 template <typename T>
 struct StridedMemcpyFunctor<T, 1> {
   void operator()(const platform::DeviceContext& dev_ctx, const T* src,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 06bcd0be646e7dff72b46b1c9031464de21b3c6a..fe31bbaed44fced68b7b51dd2c2031950ec4247d 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -65,12 +65,17 @@ smaller than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
 1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a subset of $X$.
+2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
+   of size 1 for $Y$ will be ignored for the consideration of subsequence.
+
 
 For case 2:
+
 $Y$ will be broadcasted to match the shape of $X$ and axis should be
 set to index of the start dimension to broadcast $Y$ onto $X$.
 
+If axis is -1, it is treated as axis=rank(X)-rank(Y).
+
 For example
   .. code-block:: python
 
@@ -79,6 +84,7 @@ For example
     shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
 Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
 information. However, the output only shares the LoD information with input $X$.
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 5c7830353093ebbf6a5216be497ea6f1f1c21909..ffda53a383ced411415e528886a23f28f6a62648 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -20,6 +20,7 @@ limitations under the License. */
 
 #ifdef __NVCC__
 #include <thrust/iterator/iterator_adaptor.h>
+#include "paddle/fluid/platform/cuda_helper.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif
 
@@ -61,6 +62,19 @@ inline void get_mid_dims(const framework::DDim& x_dims,
   }
 }
 
+inline void trim_trailing_singular_dims(framework::DDim& dims) {
+  // Remove trailing dimensions of size 1 for y
+  auto actual_dims_size = dims.size();
+  for (; actual_dims_size != 0; --actual_dims_size) {
+    if (dims[actual_dims_size - 1] != 1) break;
+  }
+  if (actual_dims_size != dims.size()) {
+    auto actual_dims = framework::vectorize(dims);
+    actual_dims.resize(actual_dims_size);
+    dims = framework::make_ddim(actual_dims);
+  }
+}
+
 template <typename T, typename DeviceContext>
 class RowwiseTransformIterator;
 template <typename T, typename DeviceContext>
@@ -263,44 +277,6 @@ class TransformFunctor {
     }                                                                          \
   }
 
-template <class functor, typename DeviceContext, typename T>
-void ElementwiseCompute(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<T>(ctx.GetPlace());
-
-  auto x_dims = x->dims();
-  auto y_dims = y->dims();
-  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                    "Rank of first input must >= rank of second input.");
-
-  if (x_dims == y_dims) {
-    functor f;
-    f.template Run<DeviceContext, T>(x, y, z, ctx);
-    return;
-  }
-
-  int axis = ctx.Attr<int>("axis");
-  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
-                 "Axis should be in range [0, x_dims)");
-
-  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
-  if (post == 1) {
-    functor f;
-    f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
-    return;
-  } else {
-    functor f;
-    f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
-    return;
-  }
-}
-
 #define EIGEN_ADD(x, y) ((x) + (y))
 EIGEN_FUNCTOR(Add, EIGEN_ADD);
 
@@ -361,13 +337,10 @@ template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int h, int w,
     DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-
   int j = blockIdx.x;
   int i = threadIdx.x;
   int tid = threadIdx.x;
-  shm[tid] = 0;
+  T val = 0;
 
   do {
     int x_offset = i * w + j;
@@ -375,22 +348,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
       dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
     if (dy) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
     i += ELEMWISE_MAX_BLOCK_DIM;
   } while (i < h);
 
   if (dy) {
-    __syncthreads();
-
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
+    val = platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
-      for (int k = 1; k < h; ++k) {
-        shm[0] += shm[k];
-      }
-      dy[j] = shm[0];
+      dy[j] = val;
     }
   }
 }
@@ -402,10 +369,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
                                        T* dx, T* dy) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
   int gird_size = w;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, h, w, dx_op,
-                                               dy_op, dx, dy);
+  ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
 }
 
 #endif
@@ -436,7 +401,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
 }
 
 #ifdef __NVCC__
-
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     const T* x, const T* y, const T* out, const T* dout, int pre, int n,
@@ -444,9 +408,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   int tid = threadIdx.x;
   int j = blockIdx.x;
 
-  extern __shared__ char shm_buffer[];
-  T* shm = reinterpret_cast<T*>(shm_buffer);
-  shm[tid] = 0;
+  T val = 0;
   int ttid = tid;
 
   while (true) {
@@ -461,23 +423,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
     }
 
     if (dy != nullptr) {
-      shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+      val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
     }
 
     ttid += ELEMWISE_MAX_BLOCK_DIM;
   }
 
   if (dy) {
-    __syncthreads();
     int h = pre * post;
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
-    // Sum, could be optimized
-    if (tid == 0) {
-      for (int i = 1; i < h; ++i) {
-        shm[0] += shm[i];
-      }
-      dy[j] = shm[0];
+    val = platform::reduceSum(val, tid, h);
+    if (threadIdx.x == 0) {
+      dy[j] = val;
     }
   }
 }
@@ -489,10 +446,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
                                        DY_OP dy_op, T* dx, T* dy) {
   int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
   int gird_size = n;
-  int shared_mem_size = block_size * sizeof(T);
-  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, shared_mem_size,
-                                     stream>>>(x, y, out, dout, pre, n, post,
-                                               dx_op, dy_op, dx, dy);
+  ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
+      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
 }
 
 #endif
@@ -516,14 +471,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
     auto x_dim = x.dims();
     auto y_dim = y.dims();
 
-    if (y_dim.size() == 1 && y_dim[0] == 1) {
-      // y is a scalar
-      auto extended_dims = framework::vectorize(x_dim);
-      extended_dims.push_back(1);
-      x_dim = framework::make_ddim(extended_dims);
-    }
-
     axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
+    trim_trailing_singular_dims(y_dim);
+    axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
     int pre, n, post;
     get_mid_dims(x_dim, y_dim, axis, pre, n, post);
     if (post == 1) {
@@ -591,14 +542,9 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
     return;
   }
 
-  if (y_dims.size() == 1 && y_dims[0] == 1) {
-    // y is a scalar
-    auto extended_dims = framework::vectorize(x_dims);
-    extended_dims.push_back(1);
-    x_dims = framework::make_ddim(extended_dims);
-  }
-
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  trim_trailing_singular_dims(y_dims);
+  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
   get_mid_dims(x_dims, y_dims, axis, pre, n, post);
@@ -633,16 +579,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
     return;
   }
 
-  if (y_dims.size() == 1 && y_dims[0] == 1) {
-    // y is a scalar
-    auto extended_dims = framework::vectorize(x_dims);
-    extended_dims.push_back(1);
-    x_dims = framework::make_ddim(extended_dims);
-  }
-
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
+  trim_trailing_singular_dims(y_dims);
+  axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
   get_mid_dims(x_dims, y_dims, axis, pre, n, post);
diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfa797717d78aa72e1b931b6db6e153270b3424e
--- /dev/null
+++ b/paddle/fluid/operators/go_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector<framework::Scope *>;
+
+static constexpr char kBlock[] = "sub_block";
+static constexpr char kX[] = "X";
+
+class GoOp : public framework::OperatorBase {
+ public:
+  GoOp(const std::string &type, const framework::VariableNameMap &inputs,
+       const framework::VariableNameMap &outputs,
+       const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void ExecuteOnThread(framework::Executor *executor,
+                       framework::BlockDesc *block,
+                       framework::Scope *scope) const {
+    framework::ProgramDesc *program = block->Program();
+    executor->Run(*program, scope, block->ID(), false /*create_local_scope*/);
+  }
+
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    /*
+     * Determine the global scope. Create a new child scope.
+     * Within the child scope, add all the local variables relevant
+     * to that scope.
+     *
+     * Now go through all the inputs to the op to ensure that
+     * all of them are in the newly created scope. This is important
+     * to ensure that they don't get destroyed when the parent scope
+     * is deleted.
+     * */
+
+    // TODO(varunarora): Consider moving this root scope lookup to scope.h.
+    const framework::Scope *root_scope = &scope;
+    const framework::Scope *parent_scope = &(root_scope->parent());
+
+    while (parent_scope != nullptr) {
+      root_scope = parent_scope;
+      parent_scope = &(parent_scope->parent());
+    }
+
+    framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
+    framework::Executor executor(dev_place);
+    framework::Scope &new_scope = root_scope->NewScope();
+
+    for (auto &var : block->AllVars()) {
+      new_scope.Var(var->Name());
+    }
+
+    auto &inputs = Inputs(kX);
+    for (size_t i = 0; i < inputs.size(); i++) {
+      PADDLE_ENFORCE_NOT_NULL(new_scope.FindVar(inputs.at(i)),
+                              "All variables used in the go block "
+                              "should be created in the global scope");
+    }
+
+    // Now execute the go op with the newly created scope.
+    std::thread go_thread([dev_place, block, &new_scope, this]() {
+      framework::Executor executor(dev_place);
+      ExecuteOnThread(&executor, block, &new_scope);
+    });
+    go_thread.detach();
+  }
+};
+
+class GoOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kX,
+             "A set of variables, which are required by operators inside the "
+             "block of Go Op.")
+        .AsDuplicable();
+    AddAttr<framework::BlockDesc *>(kBlock, "The block inside GoOp");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+// TODO(thuan): Look into Gradient Operator for GO_OP
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(go, paddle::operators::GoOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::GoOpMaker);
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
index b7e9f4e2248882103a3926c309d6fc455442c917..0e81d60878dce747b047abbe4641b71462373b2b 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -247,7 +247,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
index 2565e7e9efad415c5e4db2489afa9553683b7b0a..c4e70cde6f8c6bdf1f28b010b0b90091772fdffb 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
     return framework::OpKernelType(
         framework::ToDataType(
             ctx.Input<framework::LoDTensor>("Scores")->type()),
-        ctx.device_context());
+        platform::CPUPlace());
   }
 };
 
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index 922b2bd237a1ec54aea895a05ccd78cd624e88ae..be7898c22190339e0717317807b91e038f4949f6 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
     ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        platform::CPUPlace());
+  }
 };
 
 class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index b0e21e01ecc55d666f43aee621e9b5a05b347e1d..e1ce3d0c1bf11e9a623e4e9adc8f08f5069f4d94 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -59,6 +59,16 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
   }
 };
 
+class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
+    }
+  }
+};
+
 class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
@@ -80,7 +90,8 @@ class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
                   ops::SplitSelectedRowsOpMaker,
-                  ops::SplitSelectedRowsGradMaker);
+                  ops::SplitSelectedRowsGradMaker,
+                  ops::SplitSelectedRowsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
     split_selected_rows,
     ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 28a668c86aa322803a65b916b4273181f5652e21..a1a743d94e204ca506c994f8fabb6bbf8c22cea5 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,15 @@
 proto_library(profiler_proto SRCS profiler.proto)
+py_proto_compile(profiler_py_proto SRCS profiler.proto)
+
+add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+
+add_dependencies(profiler_py_proto profiler_py_proto_init)
+
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 if(WITH_GPU)
   cc_library(enforce SRCS enforce.cc DEPS)
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 881d611d4ac26f992036f639097815aff625227b..a4ea4f21e3c16c9292cf67863616924e9d9f8aba 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 }
 #endif
 
+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the
+  // parameters of the GPU but not specified as 32 simply.
+  // To make the reduceSum more efficiently,
+  // I use Warp-Level Parallelism and assume the Warp size
+  // is 32 which may be different for different GPU,
+  // but most card's warp size is 32.
+  __shared__ T shm[32];
+  const int warpSize = 32;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += __shfl_down_sync(mask, val, offset);
+  }
+
+  return val;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 87bbdfa5fd5d9781d5f2b310d2142b1b4decbf9b..265343573b3c85335f0b1c9a55e60cf9cf13f3c3 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/device_tracer.h"
+#include <google/protobuf/text_format.h>
+#include <fstream>
 #include <map>
 #include <mutex>
+#include <numeric>
+#include <thread>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
@@ -51,6 +55,36 @@ uint64_t kAlignSize = 8;
     }                                                                      \
   } while (0)
 
+std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
+  switch (kind) {
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
+      return "MEMCPY_HtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
+      return "MEMCPY_DtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
+      return "MEMCPY_HtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
+      return "MEMCPY_AtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
+      return "MEMCPY_AtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
+      return "MEMCPY_AtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
+      return "MEMCPY_DtoA";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
+      return "MEMCPY_DtoD";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
+      return "MEMCPY_HtoH";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
+      return "MEMCPY_PtoP";
+    case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
+      return "MEMCPY_FORCE_INT";
+    default:
+      break;
+  }
+  return "MEMCPY";
+}
+
 void EnableActivity() {
   // Device activity record is created when CUDA initializes, so we
   // want to enable it before cuInit() or any CUDA runtime call.
@@ -107,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
                                      kernel->correlationId);
             break;
           }
+          case CUPTI_ACTIVITY_KIND_MEMCPY: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
+          case CUPTI_ACTIVITY_KIND_MEMCPY2: {
+            auto *memcpy =
+                reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
+            tracer->AddMemRecords(
+                MemcpyKind(
+                    static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
+                memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
+                memcpy->correlationId, memcpy->bytes);
+            break;
+          }
           default: { break; }
         }
       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
@@ -137,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_[id] = anno;
   }
 
+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    cpu_records_.push_back(
+        CPURecord{anno, start_ns, end_ns,
+                  std::hash<std::thread::id>{}(std::this_thread::get_id())});
+  }
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {
+    mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
+                                     stream_id, correlation_id, bytes});
+  }
+
   void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                         uint32_t stream_id, uint32_t correlation_id) {
     std::lock_guard<std::mutex> l(trace_mu_);
@@ -172,17 +240,15 @@ class DeviceTracerImpl : public DeviceTracer {
     CUPTI_CALL(
         dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
                                      CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
-
     CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
     enabled_ = true;
   }
 
-  proto::Profile GenProfile() {
+  proto::Profile GenProfile(const std::string &profile_path) {
     std::lock_guard<std::mutex> l(trace_mu_);
     proto::Profile profile_pb;
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
-    std::map<std::string, std::vector<uint64_t>> event_times;
     for (const KernelRecord &r : kernel_records_) {
       if (correlations_.find(r.correlation_id) == correlations_.end()) {
         fprintf(stderr, "cannot relate a kernel activity\n");
@@ -194,15 +260,31 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_end_ns(r.end_ns);
       event->set_stream_id(r.stream_id);
       event->set_device_id(r.device_id);
-      event_times[event->name()].push_back(r.end_ns - r.start_ns);
     }
-    for (const auto &et : event_times) {
-      fprintf(
-          stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
-          et.first.c_str(),
-          std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0,
-          et.second.size());
+
+    for (const CPURecord &r : cpu_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.thread_id);
+      event->set_device_id(-1);
     }
+    for (const MemRecord &r : mem_records_) {
+      auto *event = profile_pb.add_events();
+      event->set_name(r.name);
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event->mutable_memcopy()->set_bytes(r.bytes);
+    }
+    std::string profile_str;
+    google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
+    std::ofstream profile_f;
+    profile_f.open(profile_path, std::ios::out | std::ios::trunc);
+    profile_f << profile_str;
+    profile_f.close();
     return profile_pb;
   }
 
@@ -240,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
   uint64_t start_ns_;
   uint64_t end_ns_;
   std::vector<KernelRecord> kernel_records_;
+  std::vector<MemRecord> mem_records_;
+  std::vector<CPURecord> cpu_records_;
   std::unordered_map<uint32_t, std::string> correlations_;
   CUpti_SubscriberHandle subscriber_;
 };
@@ -252,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
 
   void AddAnnotation(uint64_t id, const std::string &anno) {}
 
+  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
+
+  void AddMemRecords(const std::string &name, uint64_t start_ns,
+                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint32_t correlation_id, uint64_t bytes) {}
+
   void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                         uint32_t stream_id, uint32_t correlation_id) {}
 
@@ -259,7 +349,9 @@ class DeviceTracerDummy : public DeviceTracer {
 
   void Enable() {}
 
-  proto::Profile GenProfile() { return proto::Profile(); }
+  proto::Profile GenProfile(const std::string &profile_path) {
+    return proto::Profile();
+  }
 
   void Disable() {}
 };
@@ -281,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
 
 void ClearCurAnnotation() { cur_annotation = nullptr; }
 
+const char *CurAnnotation() { return cur_annotation; }
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 06cea84cc80ebefe9f5c396673cc9a35673f718f..23f7cdbdffc9f48ac5555455bf745233c81dd0cb 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -36,6 +36,21 @@ class DeviceTracer {
     uint32_t stream_id;
     uint32_t correlation_id;
   };
+  struct CPURecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint64_t thread_id;
+  };
+  struct MemRecord {
+    std::string name;
+    uint64_t start_ns;
+    uint64_t end_ns;
+    uint32_t device_id;
+    uint32_t stream_id;
+    uint32_t correlation_id;
+    uint64_t bytes;
+  };
 
   virtual ~DeviceTracer() {}
   // Needs to be called once before use.
@@ -48,6 +63,14 @@ class DeviceTracer {
   // human-readable annotations.
   virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
 
+  virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
+                             uint64_t end_ns, uint32_t device_id,
+                             uint32_t stream_id, uint32_t correlation_id,
+                             uint64_t bytes) = 0;
+
+  virtual void AddCPURecords(const char* anno, uint64_t start_ns,
+                             uint64_t end_ns) = 0;
+
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.
   virtual void AddKernelRecords(uint64_t start, uint64_t end,
@@ -55,7 +78,7 @@ class DeviceTracer {
                                 uint32_t correlation_id) = 0;
 
   // Generate a proto after done (Disabled).
-  virtual proto::Profile GenProfile() = 0;
+  virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
 
   virtual bool IsEnabled() = 0;
 };
@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
 void SetCurAnnotation(const char* anno);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
-
+// Current name of the operation being run in the thread.
+const char* CurAnnotation();
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 567c137a55e4e0cb0b5080893be305e847bb61e1..84dac2937de02b3374156ebc83e19dac9f9a3e7a 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,7 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index a79868c18c14b6bcdf85d60e766c7ec8be993f28..c1bf88f8cb690861b97686d99d36410143445243 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
   __macro(cuptiFinalize);                     \
   __macro(cuptiSubscribe);                    \
   __macro(cuptiUnsubscribe);                  \
-  __macro(cuptiEnableCallback);
+  __macro(cuptiEnableCallback);               \
+  __macro(cuptiEnableDomain);
 
 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
 
diff --git a/paddle/fluid/platform/dynload/cupti_lib_path.h.in b/paddle/fluid/platform/dynload/cupti_lib_path.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..017384bfbb7eb6db3927894f652c11ddb07cebc5
--- /dev/null
+++ b/paddle/fluid/platform/dynload/cupti_lib_path.h.in
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 8eb5966e5776004a03fee17b74ae72614331a694..e590e81bab51fd9fe12309335522614263d8e21d 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
+#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DEFINE_string(cudnn_dir, "",
@@ -45,8 +46,7 @@ DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 namespace paddle {
 namespace platform {
 namespace dynload {
-
-static const char* cupti_lib_path = CUPTI_LIB_PATH;
+static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
 
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 0076762d2f8c3840497ad354234680c4e41607cb..094f9224f719f6656fd0c44aa4a620730432ccac 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include <sys/time.h>
+#include <time.h>
 #include <iomanip>
 #include <map>
 #ifdef PADDLE_WITH_CUDA
@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
       .count();
 }
 
+inline uint64_t PosixInNsec() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
+}
+
 Event::Event(EventKind kind, std::string name, uint32_t thread_id,
              const DeviceContext* dev_ctx)
     : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
@@ -132,8 +140,8 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
   GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
 }
 
-RecordEvent::RecordEvent(const std::string& name,
-                         const DeviceContext* dev_ctx) {
+RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
+    : start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
   dev_ctx_ = dev_ctx;
   name_ = name;
@@ -144,6 +152,10 @@ RecordEvent::RecordEvent(const std::string& name,
 
 RecordEvent::~RecordEvent() {
   if (g_state == ProfilerState::kDisabled) return;
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer) {
+    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
+  }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
 }
@@ -199,22 +211,22 @@ std::vector<std::vector<Event>> GetAllEvents() {
   return result;
 }
 
-void DisableProfiler(EventSortingKey sorted_key) {
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path) {
   PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
                  "Can't disable profiling, since it's not starting.");
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
   g_state = ProfilerState::kDisabled;
 
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
   DeviceTracer* tracer = GetDeviceTracer();
   if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
     tracer->Disable();
-    tracer->GenProfile();
+    tracer->GenProfile(profile_path);
   }
-
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
 }
 
 void ParseEvents(std::vector<std::vector<Event>>& events,
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 775edb85c0c3b4b6d2cf2e86c527af0722c72fea..3542ce6cda87e3b013d60393e4ba93da61921940 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -108,6 +108,7 @@ struct RecordEvent {
 
   ~RecordEvent();
 
+  uint64_t start_ns_;
   // The device context is used by Event to get the current cuda stream.
   const DeviceContext* dev_ctx_;
   // Event name
@@ -140,7 +141,8 @@ void EnableProfiler(ProfilerState state);
 // Clear the g_all_event_lists, which is total event lists of all threads.
 void ResetProfiler();
 
-void DisableProfiler(EventSortingKey sorted_key);
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path);
 
 // Parse the event list and output the profiling report
 void ParseEvents(std::vector<std::vector<Event>>&,
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index bdd86a0440d2b00eaee14195030456d0ad217f9a..06db7ed63895d5385952169e0672bf9888d7abae 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -15,12 +15,17 @@ limitations under the License. */
 syntax = "proto2";
 package paddle.platform.proto;
 
+message MemCopy { optional uint64 bytes = 3; }
+
 message Event {
   optional string name = 1;
   optional uint64 start_ns = 2;
   optional uint64 end_ns = 3;
-  optional uint32 device_id = 5;
+  // When positive, it represents gpu id. When -1, it represents CPU.
+  optional int64 device_id = 5;
   optional uint32 stream_id = 6;
+
+  optional MemCopy memcopy = 7;
 }
 
 message Profile {
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index dae4d2206e0a1ec6ef99122460a15c064efe58fd..4a86d8ec622772d0b62d5f4755a7b6138529b1d1 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -125,5 +125,5 @@ TEST(RecordEvent, RecordEvent) {
   EXPECT_EQ(start_profiler_count, 1);
 
   // Will remove parsing-related code from test later
-  DisableProfiler(EventSortingKey::kTotal);
+  DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index cd13073a0c200a62c4762ef13106abb50cf903b8..3d5e775fafb6b94a3429dbf3368a8949bca3d612 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -31,7 +31,6 @@ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
 BUILD_ROOT=/paddle/build_android
 DEST_ROOT=/paddle/install_android
 
-rm -rf $BUILD_ROOT 2>/dev/null || true
 mkdir -p $BUILD_ROOT
 cd $BUILD_ROOT
 
diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh
deleted file mode 100755
index 9da71d1e8cdec4047167fe354973e6bac85fb9f0..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_android.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-set -e
-
-ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
-TMP_DIR=$HOME/$JOB/tmp
-mkdir -p $TMP_DIR
-cd $TMP_DIR
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
-$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-cd $HOME
-rm -rf $TMP_DIR
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_android
-cd $TRAVIS_BUILD_DIR/build_android
-
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      ..
-
-make -j `nproc`
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 0d497dcfce909d969081ac49dd760e70f025f73d..6e24cbdd3f6a4f05c1691dc643d880f6f454429d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -57,7 +57,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
 if(WITH_SWIG_PY)
diff --git a/python/paddle/fluid/tests/notest_csp.py b/python/__init__.py
similarity index 51%
rename from python/paddle/fluid/tests/notest_csp.py
rename to python/__init__.py
index f4be833deebd2c82e060e66a8bcf590020625cf8..eca2dce114b069bf9b455d77ce670d73b5047fd2 100644
--- a/python/paddle/fluid/tests/notest_csp.py
+++ b/python/__init__.py
@@ -11,27 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-
-
-class TestCSPFramework(unittest.TestCase):
-    def daisy_chain(self):
-        n = 10000
-        leftmost = fluid.make_channel(dtype=int)
-        right = leftmost
-        left = leftmost
-        with fluid.While(steps=n):
-            right = fluid.make_channel(dtype=int)
-            with fluid.go():
-                fluid.send(left, 1 + fluid.recv(right))
-            left = right
-
-        with fluid.go():
-            fluid.send(right, 1)
-        fluid.Print(fluid.recv(leftmost))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 58fa7f1bebc082df11e07f5f97927b417af3e4e8..e02245d05dd9fd5d82a26dcbc00a8b832f93842e 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -220,7 +220,10 @@ def _callback_lookup_(op):
     :return: callback function
     """
     if op.type == 'parallel_do' and op.attr('use_nccl'):
+        all_vars = op.block.vars
         param_names = set(op.input('parameters'))
+        param_names = filter(lambda name: all_vars[name].stop_gradient is False,
+                             param_names)
         param_grad_names = [n + "@GRAD" for n in param_names]
 
         class ParallelDoCallBack(object):
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index 5f868b6e86d349eed137cfef36e5438d3db019c2..dec224fc886cd0739add0ebb6488625ef5063b8d 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: Variables: make_channel
-# TODO: Operators: send, close_channel, recv, go, select
 from layers.control_flow import BlockGuard
-from layer_helper import LayerHelper
+from layer_helper import LayerHelper, unique_name
+from layers import fill_constant
+import core
 
 __all__ = [
     'Go',
@@ -46,41 +46,200 @@ class Go(BlockGuard):
         parent_block = main_program.block(main_program.current_block()
                                           .parent_idx)
 
+        inner_outputs = set()
         x_name_list = set()
-        out_vars = set()
         for op in go_block.ops:
             # Iterate over all operators, get all the inputs
             # and add as input to the Go operator.
             for iname in op.input_names:
                 for in_var_name in op.input(iname):
-                    x_name_list.add(in_var_name)
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
 
-            # Iterate over all operators , get all the outputs
-            # add to the output list of Go operator only if
-            # they exist in the parent block.
             for oname in op.output_names:
                 for out_var_name in op.output(oname):
-                    if out_var_name in parent_block.vars:
-                        out_vars.add(parent_block.var(out_var_name))
+                    inner_outputs.add(out_var_name)
+
+        # Iterate over all operators , get all the outputs
+        # add to the output list of Go operator only if
+        # they exist in the parent block.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
 
         parent_block.append_op(
             type='go',
-            inputs={'X': [parent_block.var(x_name) for x_name in x_name_list]},
-            outputs={'Out': out_vars},
+            inputs={
+                'X':
+                [parent_block.var_recursive(x_name) for x_name in x_name_list]
+            },
+            outputs={},
             attrs={'sub_block': go_block})
 
 
-def make_channel(dtype, size=0):
-    return True
+def make_channel(dtype, capacity=0):
+    """
+    Helps implementation of a concurrent program by creating a "channel" of
+    a defined data type. Channels allow for the passing of data in
+    concurrent scenarios - such as when using threads to divide computation.
+    Channels can be used to "send" and "receive" such data concurrently.
 
+    There are two kinds of channels: unbuffered and buffered. Unbuffered
+    channels have no capacity - and thus, block on send and only unblock only
+    once what they have sent has been received.
 
-def channel_send(channel, value):
-    return True
+    On the other hand, buffered channels are initialized with a capacity -
+    and do not block on sends.
+
+    Use this method in combination with `channel_send`, `channel_recv`,
+    `channel_close`, and `Go` to design a concurrent Paddle program.
+
+    Args:
+        dtype (ParamAttr|string): Data type of the data sent in the channel.
+        This data type should be the string name of a numpy data type.
+        capacity (ParamAttr|int): Size of the channel. Defaults to 0 for
+        to create an unbuffered channel.
+
+    Returns:
+        Variable: The channel variable that can be used to send an receive data
+                  of the defined dtype.
+
+    Examples:
+        .. code-block:: python
 
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to execute in a Go block, which receives the channel data.
+          fluid.channel_send(ch, 100)
+          fluid.channel_close(ch)
+    """
+    helper = LayerHelper('channel_create', **locals())
+    main_program = helper.main_program
+    make_channel_block = main_program.current_block()
 
-def channel_recv(channel):
-    return True
+    # Make a channel variable (using the channel data type) and make sure it
+    # persists into the global scope.
+    channel = helper.create_variable(
+        name=unique_name.generate('channel'),
+        type=core.VarDesc.VarType.CHANNEL,
+        persistable=True)
+
+    create_channel_op = make_channel_block.append_op(
+        type="channel_create",
+        outputs={"Out": channel},
+        attrs={"data_type": dtype,
+               "capacity": capacity})
+
+    return channel
+
+
+def channel_send(channel, value):
+    """
+    Sends a value through a channel variable. Used by an unbuffered or buffered
+    channel to pass data from within or to a concurrent Go block, where
+    `channel_recv` to used to get the passed value.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+        value (Variable): Value to send to channel
+    Returns:
+        Variable: The boolean status on whether or not the channel
+                  successfully sent the passed value.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to execute in a Go block, which receives the channel data.
+          fluid.channel_send(ch, 100)
+    """
+    helper = LayerHelper('channel_send', **locals())
+    main_program = helper.main_program
+    channel_send_block = main_program.current_block()
+
+    status = helper.create_variable(
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
+
+    channel_send_op = channel_send_block.append_op(
+        type="channel_send",
+        inputs={
+            "Channel": channel,
+            "X": value,
+        },
+        outputs={"Status": status})
+
+    return status
+
+
+def channel_recv(channel, return_value):
+    """
+    Receives a value through a channel variable. Used by an unbuffered or
+    buffered channel within a concurrent Go block to get data from originally
+    sent using `channel_send`, or from outside such a block where
+    `channel_send` is used to send the value.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+        return_value (Variable): Variable to set as a result of running channel_recv_op
+
+    Returns:
+        Variable: The received value from the channel.
+        Variable: The boolean status on whether or not the channel
+                  successfully received the passed value.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          with fluid.Go():
+            returned_value = fluid.channel_recv(ch, 'int32')
+
+          # Code to send data through the channel.
+    """
+    helper = LayerHelper('channel_recv', **locals())
+    main_program = helper.main_program
+    channel_recv_block = main_program.current_block()
+
+    status = helper.create_variable(
+        name=unique_name.generate('status'),
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        dtype=core.VarDesc.VarType.BOOL)
+
+    channel_recv_op = channel_recv_block.append_op(
+        type="channel_recv",
+        inputs={"Channel": channel},
+        outputs={"Out": return_value,
+                 "Status": status})
+
+    return return_value, status
 
 
 def channel_close(channel):
-    return True
+    """
+    Closes a channel created using `make_channel`.
+
+    Args:
+        channel (Variable|Channel): Channel variable created using
+        `make_channel`.
+
+    Examples:
+        .. code-block:: python
+
+          ch = fluid.make_channel(dtype='int32', capacity=10)
+          ...
+          # Code to receive and send data through a channel
+          ...
+          fluid.channel_close(ch)
+    """
+    helper = LayerHelper('channel_close', **locals())
+    main_program = helper.main_program
+    channel_close_block = main_program.current_block()
+
+    channel_close_op = channel_close_block.append_op(
+        type="channel_close", inputs={"Channel": channel})
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 8da9ca290b22ae69b1fd195d8614c31dc4e13e00..bb2ce4d45d5da6b2fbd097a94479f3696271e5ec 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -276,17 +276,17 @@ class DistributeTranspiler:
             pserver_program.global_block().create_var(
                 name=orig_var_name,
                 persistable=True,
+                type=v.type,
                 dtype=v.dtype,
                 shape=v.shape)
-            print("create origin var: ", orig_var_name)
             for trainer_id in xrange(self.trainers):
                 var = pserver_program.global_block().create_var(
                     name="%s.trainer_%d" % (orig_var_name, trainer_id),
                     persistable=False,
+                    type=v.type,
                     dtype=v.dtype,
                     shape=v.shape)
                 recv_inputs.append(var)
-                print("create per trainer var: ", var.name)
         # step3
         optimize_block = pserver_program.create_block(0)
         # step 4
@@ -551,11 +551,12 @@ class DistributeTranspiler:
                         type="sum",
                         inputs={"X": vars2merge},
                         outputs={"Out": merged_var})
-                    optimize_block.append_op(
-                        type="scale",
-                        inputs={"X": merged_var},
-                        outputs={"Out": merged_var},
-                        attrs={"scale": 1.0 / float(self.trainers)})
+                    if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                        optimize_block.append_op(
+                            type="scale",
+                            inputs={"X": merged_var},
+                            outputs={"Out": merged_var},
+                            attrs={"scale": 1.0 / float(self.trainers)})
                 new_inputs[key] = merged_var
             elif key == "Param":
                 # param is already created on global program
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index d8caecb3fc49169294a18b5748d389739f35a224..9bc42a63da9d80df3b36f8c59e2a5c2f3ebcfd31 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -22,6 +22,7 @@ from layer_helper import LayerHelper
 __all__ = [
     'Accuracy',
     'ChunkEvaluator',
+    'EditDistance',
 ]
 
 
@@ -173,7 +174,7 @@ class ChunkEvaluator(Evaluator):
 class EditDistance(Evaluator):
     """
     Accumulate edit distance sum and sequence number from mini-batches and
-    compute the average edit_distance of all batches.
+    compute the average edit_distance and instance error of all batches.
 
     Args:
         input: the sequences predicted by network.
@@ -189,14 +190,12 @@ class EditDistance(Evaluator):
         for epoch in PASS_NUM:
             distance_evaluator.reset(exe)
             for data in batches:
-                loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics)
-                avg_distance = distance_evaluator.eval(exe)
-            pass_distance = distance_evaluator.eval(exe)
+                loss = exe.run(fetch_list=[cost])
+            distance, instance_error = distance_evaluator.eval(exe)
 
         In the above example:
-        'sum_distance' is the sum of the batch's edit distance.
-        'avg_distance' is the average of edit distance from the firt batch to the current batch.
-        'pass_distance' is the average of edit distance from all the pass.
+        'distance' is the average of the edit distance in a pass.
+        'instance_error' is the instance error rate in a pass.
 
     """
 
@@ -206,25 +205,45 @@ class EditDistance(Evaluator):
         if main_program.current_block().idx != 0:
             raise ValueError("You can only invoke Evaluator in root block")
 
-        self.total_error = self.create_state(
-            dtype='float32', shape=[1], suffix='total_error')
+        self.total_distance = self.create_state(
+            dtype='float32', shape=[1], suffix='total_distance')
         self.seq_num = self.create_state(
             dtype='int64', shape=[1], suffix='seq_num')
-        error, seq_num = layers.edit_distance(
+        self.instance_error = self.create_state(
+            dtype='int64', shape=[1], suffix='instance_error')
+        distances, seq_num = layers.edit_distance(
             input=input, label=label, ignored_tokens=ignored_tokens)
-        #error = layers.cast(x=error, dtype='float32')
-        sum_error = layers.reduce_sum(error)
-        layers.sums(input=[self.total_error, sum_error], out=self.total_error)
+
+        zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32')
+        compare_result = layers.equal(distances, zero)
+        compare_result_int = layers.cast(x=compare_result, dtype='int')
+        seq_right_count = layers.reduce_sum(compare_result_int)
+        instance_error_count = layers.elementwise_sub(
+            x=seq_num, y=seq_right_count)
+        total_distance = layers.reduce_sum(distances)
+        layers.sums(
+            input=[self.total_distance, total_distance],
+            out=self.total_distance)
         layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
-        self.metrics.append(sum_error)
+        layers.sums(
+            input=[self.instance_error, instance_error_count],
+            out=self.instance_error)
+        self.metrics.append(total_distance)
+        self.metrics.append(instance_error_count)
 
     def eval(self, executor, eval_program=None):
         if eval_program is None:
             eval_program = Program()
         block = eval_program.current_block()
         with program_guard(main_program=eval_program):
-            total_error = _clone_var_(block, self.total_error)
+            total_distance = _clone_var_(block, self.total_distance)
             seq_num = _clone_var_(block, self.seq_num)
+            instance_error = _clone_var_(block, self.instance_error)
             seq_num = layers.cast(x=seq_num, dtype='float32')
-            out = layers.elementwise_div(x=total_error, y=seq_num)
-        return np.array(executor.run(eval_program, fetch_list=[out])[0])
+            instance_error = layers.cast(x=instance_error, dtype='float32')
+            avg_distance = layers.elementwise_div(x=total_distance, y=seq_num)
+            avg_instance_error = layers.elementwise_div(
+                x=instance_error, y=seq_num)
+            result = executor.run(
+                eval_program, fetch_list=[avg_distance, avg_instance_error])
+        return np.array(result[0]), np.array(result[1])
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index e2749593057067ec04284940fff8f6a5284806ef..e5fb0e5d628b2df14355aec2718cf46aa641b6cf 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,7 +14,7 @@
 
 import numpy as np
 import contextlib
-from framework import Program, default_main_program
+from framework import Program, default_main_program, Variable
 from . import core
 
 __all__ = [
@@ -281,6 +281,8 @@ class Executor(object):
 
         if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
             for i, var in enumerate(fetch_list):
+                assert isinstance(var, Variable) or isinstance(var, str), (
+                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
                 global_block.append_op(
                     type='fetch',
                     inputs={'X': [var]},
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 2ca8c320842a1d8dce04d190bb200f3d49c78154..f921b93f1bd38544a9cd7be533dd1c87122ef279 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -141,6 +141,8 @@ class Variable(object):
         dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
         lod_level(int): The level of lod tensor. 0 means it is not a time
             series data.
+        capacity(int): The capacity of Channel variable. Ignored
+            for other types.
         persistable(bool): True if the variable should be saved as check point.
             Defaults to False.
         stop_gradient(bool): True if the variable will stop to calculate
@@ -154,6 +156,7 @@ class Variable(object):
                  shape=None,
                  dtype=None,
                  lod_level=None,
+                 capacity=None,
                  persistable=None,
                  error_clip=None,
                  stop_gradient=False,
@@ -224,6 +227,14 @@ class Variable(object):
                         "persistable is {2}. They are not matched".format(
                             self.name, self.persistable, persistable))
 
+        if capacity is not None:
+            if is_new_var:
+                self.desc.set_capacity(capacity)
+            else:
+                # TODO(abhinavarora) : Compare with set capacity once,
+                # get_capacity is implemented
+                pass
+
         self.block.vars[name] = self
         self.op = None
         self.stop_gradient = stop_gradient
@@ -472,10 +483,11 @@ class Operator(object):
 
         self.desc.check_attrs()
         no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent',
+            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
             'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit'
+            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
+            'channel_send', 'channel_recv'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
@@ -761,7 +773,7 @@ class Block(object):
             stop_gradient = v.stop_gradient
         else:
             raise ValueError("unsupported var type: %s", type(v))
-
+        orig_var_type = v.type
         self.desc.rename_var(name, new_name)
         # NOTE: v is destroyed by C++ after calling rename_var.
         d = self.desc.find_var(new_name)
@@ -770,6 +782,7 @@ class Block(object):
                 self,
                 d.shape(),
                 d.dtype(),
+                type=orig_var_type,
                 name=new_name,
                 stop_gradient=stop_gradient,
                 trainable=trainable,
@@ -780,7 +793,7 @@ class Block(object):
         elif var_type == "Variable":
             var = Variable(
                 self,
-                type=v.type,
+                type=orig_var_type,
                 name=new_name,
                 error_clip=error_clip,
                 stop_gradient=stop_gradient)
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 6437dbb446e13d9ba24098dd936bfba395c6a566..da7e74c901e1f5be709c5f9d73f048bfda0c5549 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -330,9 +330,28 @@ class LayerHelper(object):
         return self.main_program.current_block().create_var(*args, **kwargs)
 
     def create_global_variable(self, persistable=False, *args, **kwargs):
+        """
+        create global variable, note that there is no initializer for this global variable.
+        Args:
+            persistable(bool): True if it is a checkpoint value.
+            *args: See create_var's documentation
+            **kwargs: See create_var's documentation
+
+        Returns(Variable): the created variable.
+        """
         return self.main_program.global_block().create_var(
             *args, persistable=persistable, **kwargs)
 
+    def create_or_get_global_variable(self, name, *args, **kwargs):
+        """
+        Creates a global variable if not exists and returns the variable and
+        a boolean flag which is true when it is a new variable.
+        """
+        if self.main_program.global_block().has_var(name):
+            return self.main_program.global_block().var(name), False
+        else:
+            return self.create_global_variable(name=name, *args, **kwargs), True
+
     def set_variable_initializer(self, var, initializer):
         assert isinstance(var, Variable)
         self.startup_program.global_block().create_var(
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index f380f5c00cfe709f625c82439577248687dceb7c..d16b4dc3a482d657158636f0e3a2c1d07bae7d8c 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -43,8 +43,8 @@ for _OP in set(__auto__):
     globals()[_OP] = generate_layer_fn(_OP)
 
 
-def detection_output(scores,
-                     loc,
+def detection_output(loc,
+                     scores,
                      prior_box,
                      prior_box_var,
                      background_label=0,
@@ -54,21 +54,27 @@ def detection_output(scores,
                      score_threshold=0.01,
                      nms_eta=1.0):
     """
-    **Detection Output Layer**
+    **Detection Output Layer for Single Shot Multibox Detector (SSD).**
 
-    This layer applies the NMS to the output of network and computes the
-    predict bounding box location. The output's shape of this layer could
-    be zero if there is no valid bounding box.
+    This operation is to get the detection results by performing following
+    two steps:
+    
+    1. Decode input bounding box predictions according to the prior boxes.
+    2. Get the final detection results by applying multi-class non maximum
+       suppression (NMS).
+
+    Please note, this operation doesn't clip the final output bounding boxes
+    to the image window.
 
     Args:
-        scores(Variable): A 3-D Tensor with shape [N, C, M] represents the
-            predicted confidence predictions. N is the batch size, C is the
-            class number, M is number of bounding boxes. For each category
-            there are total M scores which corresponding M bounding boxes.
         loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
             predicted locations of M bounding bboxes. N is the batch size,
             and each bounding box has four coordinate values and the layout
             is [xmin, ymin, xmax, ymax].
+        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
+            predicted confidence predictions. N is the batch size, C is the
+            class number, M is number of bounding boxes. For each category
+            there are total M scores which corresponding M bounding boxes.
         prior_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
             each box is represented as [xmin, ymin, xmax, ymax],
             [xmin, ymin] is the left top coordinate of the anchor box,
@@ -91,7 +97,15 @@ def detection_output(scores,
         nms_eta(float): The parameter for adaptive NMS.
 
     Returns:
-        The detected bounding boxes which are a Tensor.
+        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
+            `No` is the total number of detections in this mini-batch. For each
+            instance, the offsets in first dimension are called LoD, the offset
+            number is N + 1, N is the batch size. The i-th image has
+            `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image
+            has no detected results. If all images have not detected results,
+            all the elements in LoD are 0, and output tensor only contains one
+            value, which is -1.
 
     Examples:
         .. code-block:: python
@@ -100,7 +114,7 @@ def detection_output(scores,
                          append_batch_size=False, dtype='float32')
         pbv = layers.data(name='prior_box_var', shape=[10, 4],
                           append_batch_size=False, dtype='float32')
-        loc = layers.data(name='target_box', shape=[21, 4],
+        loc = layers.data(name='target_box', shape=[2, 21, 4],
                           append_batch_size=False, dtype='float32')
         scores = layers.data(name='scores', shape=[2, 21, 10],
                           append_batch_size=False, dtype='float32')
@@ -109,7 +123,6 @@ def detection_output(scores,
                                        prior_box=pb,
                                        prior_box_var=pbv)
     """
-
     helper = LayerHelper("detection_output", **locals())
     decoded_box = box_coder(
         prior_box=prior_box,
@@ -118,6 +131,7 @@ def detection_output(scores,
         code_type='decode_center_size')
 
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
+    scores = nn.transpose(scores, perm=[0, 2, 1])
     helper.append_op(
         type="multiclass_nms",
         inputs={'Scores': scores,
@@ -595,12 +609,13 @@ def multi_box_head(inputs,
        name(str): Name of the prior box layer. Default: None.
 
     Returns:
-        mbox_loc(list): The predicted boxes' location of the inputs.
-             The layout of each element is [N, H, W, Priors]. Priors
-             is the number of predicted boxof each position of each input.
-        mbox_conf(list): The predicted boxes' confidence of the inputs.
-             The layout of each element is [N, H, W, Priors]. Priors
-             is the number of predicted box of each position of each input.
+        mbox_loc(Variable): The predicted boxes' location of the inputs.
+             The layout is [N, H*W*Priors, 4]. where Priors
+             is the number of predicted boxes each position of each input.
+        mbox_conf(Variable): The predicted boxes' confidence of the inputs.
+             The layout is [N, H*W*Priors, C]. where Priors
+             is the number of predicted boxes each position of each input
+             and C is the number of Classes.
         boxes(Variable): the output prior boxes of PriorBox.
              The layout is [num_priors, 4]. num_priors is the total
              box count of each position of inputs.
@@ -751,7 +766,7 @@ def multi_box_head(inputs,
         num_boxes = box.shape[2]
 
         # get box_loc
-        num_loc_output = num_boxes * num_classes * 4
+        num_loc_output = num_boxes * 4
         mbox_loc = nn.conv2d(
             input=input,
             num_filters=num_loc_output,
@@ -760,7 +775,12 @@ def multi_box_head(inputs,
             stride=stride)
 
         mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
-        mbox_locs.append(mbox_loc)
+        new_shape = [
+            mbox_loc.shape[0],
+            mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
+        ]
+        mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+        mbox_locs.append(mbox_loc_flatten)
 
         # get conf_loc
         num_conf_output = num_boxes * num_classes
@@ -771,11 +791,18 @@ def multi_box_head(inputs,
             padding=pad,
             stride=stride)
         conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
-        mbox_confs.append(conf_loc)
+        new_shape = [
+            conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
+            conf_loc.shape[3] / num_classes, num_classes
+        ]
+        conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+        mbox_confs.append(conf_loc_flatten)
 
     if len(box_results) == 1:
         box = box_results[0]
         var = var_results[0]
+        mbox_locs_concat = mbox_locs[0]
+        mbox_confs_concat = mbox_confs[0]
     else:
         reshaped_boxes = []
         reshaped_vars = []
@@ -785,5 +812,7 @@ def multi_box_head(inputs,
 
         box = tensor.concat(reshaped_boxes)
         var = tensor.concat(reshaped_vars)
+        mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
+        mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
 
-    return mbox_locs, mbox_confs, box, var
+    return mbox_locs_concat, mbox_confs_concat, box, var
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index faccc3ddf827e4211c9f2e61da7138e5d43f1d11..08a0184c2c2ad5f3c3792fd0a12f0ab0c746849b 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -1,11 +1,11 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -53,12 +53,22 @@ def monkey_patch_variable():
         value = float(value)
         tmp_name = unique_tmp_name()
         var = ref_var.block.create_var(name=tmp_name, dtype=dtype)
+        batch_dim = -1
+        for i, d in enumerate(ref_var.shape):
+            if d < 0:
+                batch_dim = i
+                break
+        assert batch_dim != -1
         ref_var.block.append_op(
             type='fill_constant_batch_size_like',
             outputs={'Out': [var]},
             inputs={'Input': [ref_var]},
-            attrs={'shape': ref_var.shape,
-                   'value': value})
+            attrs={
+                'shape': ref_var.shape,
+                'value': value,
+                'input_dim_idx': batch_dim,
+                'output_dim_idx': batch_dim
+            })
         return var
 
     def astype(self, dtype):
@@ -118,11 +128,20 @@ def monkey_patch_variable():
             tmp_name = unique_tmp_name()
             out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)
 
+            axis = -1
+            if other_var.shape[0] == -1:
+                axis = 0
+            assert len(self.shape) >= len(other_var.shape), (
+                "The rank of the first argument of an binary operator cannot "
+                "be smaller than the rank of its second argument: %s vs %s" %
+                (len(self.shape), len(other_var.shape)))
+
             self.block.append_op(
                 type=op_type,
                 inputs={'X': [self],
                         'Y': [other_var]},
-                outputs={'Out': out})
+                outputs={'Out': out},
+                attrs={'axis': axis})
             return out
 
         comment = OpProtoHolder.instance().get_op_proto(op_type).comment
@@ -131,7 +150,7 @@ def monkey_patch_variable():
         {0}
         Args:
             self(Variable): left hand variable
-            other_var(Variable|float|int): right hand variable 
+            other_var(Variable|float|int): right hand variable
 
         Returns:
             Variable
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 76101fe64fa96fbd4d6eae15d9ac89086e7819b2..a844383f64512c240368a7d03eaad037d9580418 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -69,6 +69,7 @@ __all__ = [
     'softmax_with_cross_entropy',
     'smooth_l1',
     'one_hot',
+    'autoincreased_step_counter',
 ]
 
 
@@ -2443,10 +2444,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
-def edit_distance(input,
-                  label,
-                  normalized=False,
-                  ignored_tokens=None,
+def edit_distance(input, label, normalized=True, ignored_tokens=None,
                   name=None):
     """
     EditDistance operator computes the edit distances between a batch of
@@ -3201,3 +3199,34 @@ def one_hot(input, depth):
         attrs={'depth': depth},
         outputs={'Out': one_hot_out})
     return one_hot_out
+
+
+def autoincreased_step_counter(counter_name=None, begin=1, step=1):
+    """
+    NOTE: The counter will be automatically increased by 1 every mini-batch
+    Return the run counter of the main program, which is started with 1.
+
+    Args:
+        counter_name(str): The counter name, default is '@STEP_COUNTER@'.
+        begin(int): The first value of this counter.
+        step(int): The increment step between each execution.
+
+    Returns(Variable): The global run counter.
+    """
+    helper = LayerHelper('global_step_counter')
+    if counter_name is None:
+        counter_name = '@STEP_COUNTER@'
+    counter, is_new_var = helper.create_or_get_global_variable(
+        name=counter_name, dtype='int64', shape=[1], persistable=True)
+    if is_new_var:
+        helper.set_variable_initializer(
+            counter, initializer=Constant(
+                value=begin - 1, force_cpu=True))
+        helper.main_program.global_block().prepend_op(
+            type='increment',
+            inputs={'X': [counter]},
+            outputs={'Out': [counter]},
+            attrs={'step': float(step)})
+        counter.stop_gradient = True
+
+    return counter
diff --git a/python/paddle/fluid/learning_rate_decay.py b/python/paddle/fluid/learning_rate_decay.py
index 0826d3da79a96590f00159a2d2e6f069792909c4..631efa048740ea3d50947a321ae2e76c6a6048af 100644
--- a/python/paddle/fluid/learning_rate_decay.py
+++ b/python/paddle/fluid/learning_rate_decay.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import layers
-from framework import Variable
 from initializer import init_on_cpu
 
 __all__ = [
@@ -30,11 +29,15 @@ strategy according to this module.
 """
 
 
-def exponential_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
+def _decay_step_counter():
+    # the first global step is zero in learning rate decay
+    global_step = layers.autoincreased_step_counter(
+        counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
+    global_step = layers.cast(global_step, 'float32')
+    return global_step
+
+
+def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """Applies exponential decay to the learning rate.
 
     ```python
@@ -44,7 +47,6 @@ def exponential_decay(learning_rate,
     Args:
         learning_rate: A scalar float32 value or a Variable. This
           will be the initial learning rate during training
-        global_step: A Variable that record the training step.
         decay_steps: A Python `int32` number.
         decay_rate: A Python `float` number.
         staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -52,8 +54,7 @@ def exponential_decay(learning_rate,
     Returns:
         The decayed learning rate
     """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for exponential_decay.")
+    global_step = _decay_step_counter()
 
     with init_on_cpu():
         # update learning_rate
@@ -65,23 +66,17 @@ def exponential_decay(learning_rate,
     return decayed_lr
 
 
-def natural_exp_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False):
+def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """Applies natural exponential decay to the initial learning rate.
 
-    ```python
-    if not staircase:
-        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-    else:
-        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
-    ```
+    >>> if not staircase:
+    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+
     Args:
         learning_rate: A scalar float32 value or a Variable. This
           will be the initial learning rate during training
-        global_step: A Variable that record the training step.
         decay_steps: A Python `int32` number.
         decay_rate: A Python `float` number.
         staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -89,8 +84,7 @@ def natural_exp_decay(learning_rate,
     Returns:
         The decayed learning rate
     """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for natural_exp_decay.")
+    global_step = _decay_step_counter()
 
     with init_on_cpu():
         div_res = global_step / decay_steps
@@ -101,23 +95,17 @@ def natural_exp_decay(learning_rate,
     return decayed_lr
 
 
-def inverse_time_decay(learning_rate,
-                       global_step,
-                       decay_steps,
-                       decay_rate,
-                       staircase=False):
+def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """Applies inverse time decay to the initial learning rate.
 
-    ```python
-    if staircase:
-      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
-    else:
-      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
-    ```
+    >>> if staircase:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+
     Args:
         learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        global_step: A Variable that record the training step.
+          will be the initial learning rate during training.
         decay_steps: A Python `int32` number.
         decay_rate: A Python `float` number.
         staircase: Boolean. If set true, decay the learning rate every decay_steps.
@@ -125,8 +113,7 @@ def inverse_time_decay(learning_rate,
     Returns:
         The decayed learning rate
     """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for inverse_time_decay.")
+    global_step = _decay_step_counter()
 
     with init_on_cpu():
         div_res = global_step / decay_steps
@@ -139,26 +126,22 @@ def inverse_time_decay(learning_rate,
 
 
 def polynomial_decay(learning_rate,
-                     global_step,
                      decay_steps,
                      end_learning_rate=0.0001,
                      power=1.0,
                      cycle=False):
     """Applies polynomial decay to the initial learning rate.
 
-    ```python
-    if cycle:
-        decay_steps = decay_steps * ceil(global_step / decay_steps)
-    else:
-        global_step = min(global_step, decay_steps)
-    decayed_learning_rate = (learning_rate - end_learning_rate) *
-                      (1 - global_step / decay_steps) ^ power +
-                      end_learning_rate
-    ```
+    >>> if cycle:
+    >>>     decay_steps = decay_steps * ceil(global_step / decay_steps)
+    >>> else:
+    >>>     global_step = min(global_step, decay_steps)
+    >>> decayed_learning_rate = (learning_rate - end_learning_rate) *
+    >>>                   (1 - global_step / decay_steps) ^ power +
+    >>>                   end_learning_rate
     Args:
         learning_rate: A scalar float32 value or a Variable. This
           will be the initial learning rate during training
-        global_step: A Variable that record the training step.
         decay_steps: A Python `int32` number.
         end_learning_rate: A Python `float` number.
         power: A Python `float` number
@@ -167,8 +150,7 @@ def polynomial_decay(learning_rate,
     Returns:
         The decayed learning rate
     """
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for inverse_time_decay.")
+    global_step = _decay_step_counter()
 
     with init_on_cpu():
         if cycle:
@@ -193,27 +175,24 @@ def polynomial_decay(learning_rate,
     return decayed_lr
 
 
-def piecewise_decay(global_step, boundaries, values):
+def piecewise_decay(boundaries, values):
     """Applies piecewise decay to the initial learning rate.
 
-    ```python
-    boundaries = [10000, 20000]
-    values = [1.0, 0.5, 0.1]
-
-    if step < 10000:
-        learning_rate = 1.0
-    elif step >= 10000 and step < 20000:
-        learning_rate = 0.5
-    else:
-        learning_rate = 0.1
-    ```
+    >>> boundaries = [10000, 20000]
+    >>> values = [1.0, 0.5, 0.1]
+    >>>
+    >>> if step < 10000:
+    >>>     learning_rate = 1.0
+    >>> elif 10000 <= step < 20000:
+    >>>     learning_rate = 0.5
+    >>> else:
+    >>>     learning_rate = 0.1
     """
 
     if len(values) - len(boundaries) != 1:
         raise ValueError("len(values) - len(boundaries) should be 1")
 
-    if not isinstance(global_step, Variable):
-        raise ValueError("global_step is required for piecewise_decay.")
+    global_step = _decay_step_counter()
 
     with init_on_cpu():
         lr = layers.create_global_var(
diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py
index 6952ca7fe49931b9ebc84e214569c47d632d2a06..708ca08b17c85efbb25ecaf2580b7141421e25b9 100644
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -29,6 +29,8 @@ dtype_to_size = {
     core.VarDesc.VarType.BOOL: 1
 }
 
+sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
+
 
 class ControlFlowGraph(object):
     def __init__(self, Program, ops, forward_num, skip_opt):
@@ -141,7 +143,7 @@ class ControlFlowGraph(object):
         self.pool = []
         for i in range(self.op_size):
             op = self._ops[i]
-            if op.type() == "while" or op.type() == "while_grad":
+            if op.type() in sub_block_ops:
                 continue
             block_desc = op.block()
             is_forward = i < self._forward_num
@@ -198,67 +200,75 @@ class ControlFlowGraph(object):
                         block_desc, var_name, is_forward).shape()))
 
 
-def get_cfgs(input_program):
+def _process_sub_block_pair(pdesc, sub_block_pair):
     ops_list = []
-    pdesc = input_program.get_desc()
     block_desc = pdesc.block(0)
     op_size = block_desc.op_size()
-    # Get global block ops
-    ops_list.append(
-        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
-
-    while_sub_block_ids = []
-    while_grad_sub_block_ids = []
-    while_block_id_pair = []
-    while_op_dict = {}
+    for fwd_op, bwd_op in sub_block_pair:
+        sub_block_ids = []
+        grad_sub_block_ids = []
+        sub_block_id_pair = []
+        sub_op_dict = {}
+        for i in range(op_size):
+            op = block_desc.op(i)
+            if op.type() == fwd_op:
+                sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+            elif op.type() == bwd_op:
+                grad_sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
 
-    for i in range(op_size):
-        op = block_desc.op(i)
-        if op.type() == "while":
-            while_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_dict[op.attr("sub_block").id] = op
-        elif op.type() == "while_grad":
-            while_grad_sub_block_ids.append(op.attr("sub_block").id)
-            while_op_dict[op.attr("sub_block").id] = op
+        # Find fwd_op/bwd_op block pair
+        for grad_id in grad_sub_block_ids:
+            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
+            if fwd_id in sub_block_ids:
+                sub_block_id_pair.append((fwd_id, grad_id))
+                sub_block_ids.remove(fwd_id)
 
-    # Find while/while_grad block pair
-    for grad_id in while_grad_sub_block_ids:
-        forward_id = pdesc.block(grad_id).get_forward_block_idx()
-        if forward_id in while_sub_block_ids:
-            while_block_id_pair.append((forward_id, grad_id))
-            while_sub_block_ids.remove(forward_id)
+        # Get fwd_op/bwd_op block ops
+        for fwd_id, grad_id in sub_block_id_pair:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            block_op_size = sub_block.op_size()
+            for i in range(block_op_size):
+                sub_block_ops.append(sub_block.op(i))
 
-    # Get while/while_grad block ops
-    for forward_id, grad_id in while_block_id_pair:
-        while_block_ops = []
-        while_block = pdesc.block(forward_id)
-        while_block_op_size = while_block.op_size()
-        for i in range(while_block_op_size):
-            while_block_ops.append(while_block.op(i))
+            grad_sub_block = pdesc.block(grad_id)
+            grad_sub_block_op_size = grad_sub_block.op_size()
+            for i in range(grad_sub_block_op_size):
+                sub_block_ops.append(grad_sub_block.op(i))
 
-        while_grad_block = pdesc.block(grad_id)
-        while_grad_block_op_size = while_grad_block.op_size()
-        for i in range(while_grad_block_op_size):
-            while_block_ops.append(while_grad_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
+            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
 
-        while_op_output = set()
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
-        while_op_output.update(while_op_dict[grad_id].output_arg_names())
+        # Process rest fwd_op block ops
+        for fwd_id in sub_block_ids:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            sub_block_op_size = sub_block.op_size()
+            for i in range(sub_block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
+    return ops_list
 
-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
-    # Process rest while block ops
-    for forward_id in while_sub_block_ids:
-        while_block_ops = []
-        while_block = pdesc.block(forward_id)
-        while_block_op_size = while_block.op_size()
-        for i in range(while_block_op_size):
-            while_block_ops.append(while_block.op(i))
+def _get_cfgs(input_program):
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
 
-        while_op_output = set()
-        while_op_output.update(while_op_dict[forward_id].output_arg_names())
+    sub_block_pair = [("while", "while_grad"), ("parallel_do",
+                                                "parallel_do_grad")]
 
-        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
+    ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
 
     cfgs = [
         ControlFlowGraph(input_program, ops, forward_num, skip_opt)
@@ -268,6 +278,6 @@ def get_cfgs(input_program):
 
 
 def memory_optimize(input_program):
-    cfgs = get_cfgs(input_program)
+    cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize()
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 93a19de92e1654df2424019d764f1cbbe6314686..1c12d53e4f352b7e4dea980301973a69665e49f9 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -35,11 +35,10 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, learning_rate, global_step=None, regularization=None):
+    def __init__(self, learning_rate, regularization=None):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
-        self._global_step = global_step
         self.regularization = regularization
         self._learning_rate = learning_rate
         # each program should have a independent learning rate
@@ -159,26 +158,6 @@ class Optimizer(object):
                             format(name, param.name))
         return self._accumulators[name][param.name]
 
-    def _increment_global_step(self, block):
-        """Increment the global step by 1 after every iteration
-
-        Args:
-            block: the block in which the loss variable is present
-
-        Returns:
-            list with global_step increment op as its only element
-        """
-        assert isinstance(block, framework.Block)
-        assert self._global_step is not None
-        # create the increment op
-        increment_op = block.append_op(
-            type="increment",
-            inputs={"X": self._global_step},
-            outputs={"Out": self._global_step},
-            attrs={"step": 1.0})
-
-        return increment_op
-
     def create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
@@ -225,8 +204,6 @@ class Optimizer(object):
             # FIXME: Need to fix this once we figure out how to handle dependencies
             self._finish_update(loss.block)
 
-            if self._global_step is not None:
-                self._increment_global_step(loss.block)
             end = len(global_block.ops)
             return global_block.slice_ops(start, end)
 
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 59e75209d39dc0f2b72ecf832ff15df192a2898e..04fd05cc33cff3d720be75923d4af3767942669f 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -73,7 +73,7 @@ def reset_profiler():
 
 
 @contextmanager
-def profiler(state, sorted_key=None):
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     """The profiler interface.
     Different from cuda_profiler, this profiler can be used to profile both CPU
     and GPU program. By defalut, it records the CPU and GPU operator kernels,
@@ -95,8 +95,9 @@ def profiler(state, sorted_key=None):
             The `max` means sorting by the maximum execution time.
             The `min` means sorting by the minimum execution time.
             The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
     """
-
     if state not in ['CPU', 'GPU', "All"]:
         raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
     if state == "GPU":
@@ -122,4 +123,4 @@ def profiler(state, sorted_key=None):
     }
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
-    core.disable_profiler(key_map[sorted_key])
+    core.disable_profiler(key_map[sorted_key], profile_path)
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 613f4a7bf1c41f9f320ba8d310545a182f95e316..430b4fe073c2b6866023542e7b2a71d68568dd12 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -86,10 +86,10 @@ def vgg16_bn_drop(input):
     conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
     bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
     return fc2
 
 
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 13efe4efb1f2ba4ebd25868c7f3d94ca3c5fb1a1..5c6374b93175d85c49633b73b20aa5e3b64ff9f1 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -169,16 +169,12 @@ def train(use_cuda, save_dirname=None, is_local=True):
 
     # TODO(qiao)
     # check other optimizers and check why out will be NAN
-    global_step = fluid.layers.create_global_var(
-        shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.learning_rate_decay.exponential_decay(
             learning_rate=0.0001,
-            global_step=global_step,
             decay_steps=100000,
             decay_rate=0.5,
-            staircase=True),
-        global_step=global_step)
+            staircase=True))
     optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 784cfe58dfebd5451918789a3bd156b092978bd5..a99d13330b84e07a04a4499d749df24ec99e7ec0 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -24,15 +24,29 @@ import sys
 fluid.default_startup_program().random_seed = 111
 
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
-
 y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-avg_cost = fluid.layers.mean(cost)
+device_type = 'CPU'
+use_nccl = False
+place = fluid.CPUPlace()
+if fluid.core.is_compiled_with_cuda():
+    device_type = 'CUDA'
+    use_nccl = True
+    place = fluid.CUDAPlace(0)
 
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+places = fluid.layers.get_places(device_count=0, device_type=device_type)
+pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+with pd.do():
+    x_ = pd.read_input(x)
+    y_ = pd.read_input(y)
+    y_predict = fluid.layers.fc(input=x_, size=1, act=None)
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
+    avg_cost = fluid.layers.mean(x=cost)
+    pd.write_output(avg_cost)
+
+cost = pd()
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 sgd_optimizer.minimize(avg_cost)
 
 fluid.memory_optimize(fluid.default_main_program())
@@ -48,7 +62,6 @@ train_reader = paddle.batch(
 #         paddle.dataset.uci_housing.train(), buf_size=500),
 #     batch_size=BATCH_SIZE)
 
-place = fluid.CPUPlace()
 feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
 exe = fluid.Executor(place)
 
@@ -65,6 +78,7 @@ for pass_id in range(PASS_NUM):
 
         if avg_loss_value[0] < 10.0:
             exit(0)  # if avg cost less than 10.0, we think our code is good.
+        print avg_loss_value[0]
         if math.isnan(float(avg_loss_value)):
             sys.exit("got NaN loss, training failed.")
 exit(1)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index a3e0893f2dfa370a11e30fcff15e7151b4e6e0b2..80ff11f8d78b0a22fc6aefd722c9e6a2c23fbd5c 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -89,10 +89,10 @@ def vgg16_bn_drop(input):
     conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
 
     drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
     bn = fluid.layers.batch_norm(input=fc1, act='relu')
     drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
     return fc2
 
 
diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py
index 602d5f31eb311e8f87d3f5eae78cd41d64f61141..77107f8b36f31c1f494b0ade218ee047ef7eb7c6 100644
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
@@ -20,7 +20,8 @@ from paddle.fluid.executor import Executor
 
 class TestRoutineOp(unittest.TestCase):
     def test_simple_routine(self):
-        ch = fluid.make_channel(dtype=bool)
+        ch = fluid.make_channel(
+            dtype=core.VarDesc.VarType.BOOL, name="CreateChannel")
         with fluid.Go():
             fluid.channel_send(ch, True)
 
diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/test_concurrency.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7bf63c5e017251e87af94690ff32c47c538c6b
--- /dev/null
+++ b/python/paddle/fluid/tests/test_concurrency.py
@@ -0,0 +1,100 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import framework, unique_name
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import fill_constant
+
+
+class TestRoutineOp(unittest.TestCase):
+    def test_simple_routine(self):
+        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
+        # variable will be filled in and returned by fluid.channel_recv
+        result = self._create_tensor('return_value',
+                                     core.VarDesc.VarType.LOD_TENSOR,
+                                     core.VarDesc.VarType.INT64)
+
+        with fluid.Go():
+            input_value = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
+            fluid.channel_send(ch, input_value)
+
+        result, status = fluid.channel_recv(ch, result)
+        fluid.channel_close(ch)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+
+        outs = exe.run(fetch_list=[result])
+        self.assertEqual(outs[0], 1234)
+
+    def test_daisy_chain(self):
+        '''
+        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
+        '''
+        n = 100
+
+        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+        left = leftmost
+
+        # TODO(thuan): Use fluid.While() after scope capture is implemented.
+        # https://github.com/PaddlePaddle/Paddle/issues/8502
+        for i in range(n):
+            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+            with fluid.Go():
+                one_tensor = self._create_one_dim_tensor(1)
+                result = self._create_tensor('return_value',
+                                             core.VarDesc.VarType.LOD_TENSOR,
+                                             core.VarDesc.VarType.INT64)
+
+                result, status = fluid.channel_recv(right, result)
+                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
+                fluid.channel_send(left, one_added)
+            left = right
+
+        # Trigger the channel propagation by sending a "1" to rightmost channel
+        with fluid.Go():
+            one_tensor = self._create_one_dim_tensor(1)
+            fluid.channel_send(right, one_tensor)
+
+        leftmost_result = self._create_tensor('return_value',
+                                              core.VarDesc.VarType.LOD_TENSOR,
+                                              core.VarDesc.VarType.INT64)
+        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        leftmost_data = exe.run(fetch_list=[leftmost_result])
+
+        # The leftmost_data should be equal to the number of channels + 1
+        self.assertEqual(leftmost_data[0][0], n + 1)
+
+    def _create_one_dim_tensor(self, value):
+        one_dim_tensor = fill_constant(
+            shape=[1], dtype=core.VarDesc.VarType.INT64, value=value)
+        one_dim_tensor.stop_gradient = True
+        return one_dim_tensor
+
+    def _create_tensor(self, name, type, dtype):
+        return framework.default_main_program().current_block().create_var(
+            name=unique_name.generate(name), type=type, dtype=dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index fc25786499ff054a32e5503e796992d7f1e3ba02..0d2d653c01cb994bb10ea6bc5c6c8b8ce5004d7c 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -35,12 +35,12 @@ class TestDetection(unittest.TestCase):
                 dtype='float32')
             loc = layers.data(
                 name='target_box',
-                shape=[20, 4],
+                shape=[2, 10, 4],
                 append_batch_size=False,
                 dtype='float32')
             scores = layers.data(
                 name='scores',
-                shape=[2, 20, 10],
+                shape=[2, 10, 20],
                 append_batch_size=False,
                 dtype='float32')
             out = layers.detection_output(
@@ -117,9 +117,7 @@ class TestMultiBoxHead(unittest.TestCase):
         assert len(box.shape) == 2
         assert box.shape == var.shape
         assert box.shape[1] == 4
-
-        for loc, conf in zip(mbox_locs, mbox_confs):
-            assert loc.shape[1:3] == conf.shape[1:3]
+        assert mbox_locs.shape[1] == mbox_confs.shape[1]
 
     def multi_box_head_output(self, data_shape):
         images = fluid.layers.data(
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index b83917609226f3ae0b0eda177484c364009400e5..56f5af91d8e58086c12fde6948229675569aa271 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -51,8 +51,6 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
                 prior_box_var[:,:,3]
 
     elif (code_type == "DecodeCenterSize"):
-        target_box = target_box.reshape(target_box.shape[0], 1,
-                                        target_box.shape[1])
         target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
                        prior_box_width + prior_box_x
         target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
@@ -61,6 +59,7 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
                            prior_box_width
         target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
                             prior_box_height
+
         output_box[:, :, 0] = target_box_x - target_box_width / 2
         output_box[:, :, 1] = target_box_y - target_box_height / 2
         output_box[:, :, 2] = target_box_x + target_box_width / 2
@@ -72,8 +71,14 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
     for i in range(len(lod) - 1):
-        box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var,
-                  output_box[lod[i]:lod[i + 1], :, :], code_type)
+        if (code_type == "EncodeCenterSize"):
+            box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
+                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+                      code_type)
+        elif (code_type == "DecodeCenterSize"):
+            box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
+                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+                      code_type)
     return output_box
 
 
@@ -83,10 +88,10 @@ class TestBoxCoderOp(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 20]]
+        lod = [[0, 1, 2, 3, 4, 5]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
-        target_box = np.random.random((20, 4)).astype('float32')
+        target_box = np.random.random((5, 10, 4)).astype('float32')
         code_type = "DecodeCenterSize"
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
                                      lod[0], code_type)
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index c8e930dad762b867d0148ebcdb3637b8cc9560ce..5b2384e94d788342c692fcb8e33f3a2ff663ab53 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -50,6 +50,16 @@ class TestElementwiseAddOp_scalar(TestElementwiseOp):
         self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
 
 
+class TestElementwiseAddOp_scalar2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1, 1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+
+
 class TestElementwiseAddOp_Vector(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_add"
@@ -115,6 +125,20 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
         }
 
 
+class TestElementwiseAddOp_broadcast_4(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(2, 1).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1, 1)
+        }
+
+
 class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_add"
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
index 595b05168920f3b9497172e502591cef82903cdc..5c221a0325b6cdc27ec22e5a8b02ae8eec9f6d80 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-
-import math
 import copy
+import math
+import unittest
 
-import paddle.fluid.framework as framework
 import paddle.fluid as fluid
-import paddle.fluid.layers as layers
+import paddle.fluid.framework as framework
 import paddle.fluid.learning_rate_decay as lr_decay
 
 
@@ -28,7 +26,7 @@ def exponential_decay(learning_rate,
                       decay_steps,
                       decay_rate,
                       staircase=False):
-    exponent = float(global_step) / float(decay_steps)
+    exponent = global_step / decay_steps
     if staircase:
         exponent = math.floor(exponent)
     return learning_rate * decay_rate**exponent
@@ -83,22 +81,24 @@ def piecewise_decay(global_step, boundaries, values):
 
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
-        global_step = layers.create_global_var(
-            shape=[1], value=0.0, dtype='float32', persistable=True)
-
-        decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs)
-        layers.increment(global_step, 1.0)
+        decayed_lr = fluid_decay_fn(**kwargs)
 
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         exe.run(fluid.default_startup_program())
         for step in range(10):
-            step_val, lr_val = exe.run(fluid.default_main_program(),
-                                       feed=[],
-                                       fetch_list=[global_step, decayed_lr])
-            python_decayed_lr = python_decay_fn(global_step=step, **kwargs)
-            self.assertAlmostEqual(python_decayed_lr, lr_val[0])
+            lr_val, = exe.run(fluid.default_main_program(),
+                              feed=[],
+                              fetch_list=[decayed_lr])
+            python_decayed_lr = python_decay_fn(
+                global_step=float(step), **kwargs)
+            self.assertAlmostEqual(
+                python_decayed_lr,
+                lr_val[0],
+                msg='Failed fn is {0}, Python result is {1}, Fluid result is {2}'.
+                format(python_decay_fn.__name__,
+                       str(python_decayed_lr), str(lr_val[0])))
 
     def test_decay(self):
         common_kwargs_true = {
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index 6864d271e795026d59525e9f1e4d86e32df980bf..852a80261e02f5ed19e7fbe608d490be1f7798a9 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -1,11 +1,11 @@
 #   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,13 +23,21 @@ class TestMathOpPatches(unittest.TestCase):
     def test_add_scalar(self):
         a = fluid.layers.data(name="a", shape=[1])
         b = a + 10
+        ab = fluid.layers.concat(input=[a, b], axis=1)
+        c = ab + 10
+        d = ab + a
+        # e = a + ab
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         a_np = numpy.random.random(size=[10, 1]).astype('float32')
-        b_np = exe.run(fluid.default_main_program(),
-                       feed={"a": a_np},
-                       fetch_list=[b])
+        b_np, c_np, d_np = exe.run(fluid.default_main_program(),
+                                   feed={"a": a_np},
+                                   fetch_list=[b, c, d])
         self.assertTrue(numpy.allclose(a_np + 10, b_np))
+        ab_np = numpy.concatenate([a_np, b_np], axis=1)
+        self.assertTrue(numpy.allclose(ab_np + 10, c_np))
+        d_expected = ab_np + numpy.concatenate([a_np, a_np], axis=1)
+        self.assertTrue(numpy.allclose(d_expected, d_np))
 
     @decorators.prog_scope()
     def test_radd_scalar(self):
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 6ee7fc819a501eb7bf452b5d470a2e0df1b44600..9d87f4daa98da42fcc33aba4b51a4528343fb137 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -46,43 +46,6 @@ class TestOptimizer(unittest.TestCase):
         self.assertEqual([op.type for op in opts],
                          ["fill_constant", "elementwise_mul", "sgd"])
 
-    def test_sgd_optimizer_with_global_step(self):
-        init_program = framework.Program()
-        program = framework.Program()
-        block = program.global_block()
-        mul_x = block.create_parameter(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        block.append_op(
-            type="mul",
-            inputs={"X": mul_x,
-                    "Y": mul_y},
-            outputs={"Out": mul_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
-        global_step = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="step")
-        learning_rate = 0.01
-        sgd_optimizer = optimizer.SGDOptimizer(
-            learning_rate=learning_rate, global_step=global_step)
-        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
-        self.assertEqual(len(opts), 4)
-        self.assertEqual(
-            [op.type for op in opts],
-            ["fill_constant", "elementwise_mul", "sgd", "increment"])
-
-        # Check init_program
-        init_ops = init_program.global_block().ops
-        self.assertEqual(len(init_ops), 1)
-        self.assertEqual(init_ops[0].type, "fill_constant")
-        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
-
 
 class TestMomentumOptimizer(unittest.TestCase):
     class MockMomentum(optimizer.MomentumOptimizer):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index cd20b430f93498372dd706a46c3a6d9d798721f5..1a7551c57b26f576ab286e7b18177b9120261623 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -170,6 +170,7 @@ class ParallelOpTest(BaseParallelForTest):
         x = fluid.layers.data(shape=[784], dtype='float32', name='img')
         x = yield x
         hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+        hidden = fluid.layers.batch_norm(input=hidden)
         loss = fluid.layers.mean(hidden)
         yield loss
 
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index b2ce655cad004abd8bb5591f88d293786be7ab4c..bc8db255e1d1a359c2e6a9ebab6acab7eff82a08 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -22,7 +22,7 @@ import paddle.fluid.core as core
 
 
 class TestProfiler(unittest.TestCase):
-    def net_profiler(self, state):
+    def net_profiler(self, state, profile_path='/tmp/profile'):
         enable_if_gpu = state == 'GPU' or state == "All"
         if enable_if_gpu and not core.is_compiled_with_cuda():
             return
@@ -72,7 +72,9 @@ class TestProfiler(unittest.TestCase):
         self.net_profiler('GPU')
 
     def test_all_profiler(self):
-        self.net_profiler('All')
+        self.net_profiler('All', '/tmp/profile_out')
+        with open('/tmp/profile_out', 'r') as f:
+            self.assertGreater(len(f.read()), 0)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index c3cd4cf8c32e20f3ef86305489fc415397dec1b8..32698e5b2cb52acf960d83a05d2d49c632cd73fc 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -361,6 +361,7 @@ def settings(batch_size,
              learning_rate_decay_b=0.,
              learning_rate_schedule='poly',
              learning_rate_args='',
+             async_lagged_grad_discard_ratio=1.5,
              learning_method=None,
              regularization=None,
              is_async=False,
@@ -396,6 +397,10 @@ def settings(batch_size,
                                         value larger than some value, will be
                                         clipped.
     :type gradient_clipping_threshold: float
+    :param async_lagged_grad_discard_ratio: async SGD gradient commit control,
+          when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, 
+          the current async SGD gradient is discarded.
+    :type async_lagged_grad_discard_ratio: float
     """
     if isinstance(regularization, BaseRegularization):
         regularization = [regularization]
@@ -409,7 +414,7 @@ def settings(batch_size,
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
         'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'gradient_clipping_threshold'
+        'gradient_clipping_threshold', 'async_lagged_grad_discard_ratio'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
diff --git a/python/setup.py.in b/python/setup.py.in
index 6fff1bb09e4923650356a50afba0e105b3d8c0c2..f830039a3af581d593d510326f15139377cb25f1 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -73,6 +73,7 @@ packages=['paddle',
           'paddle.v2.plot',
           'paddle.fluid',
           'paddle.fluid.proto',
+          'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
           'py_paddle']
 
@@ -109,6 +110,7 @@ setup(name='${PACKAGE_NAME}',
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.fluid.proto will be generated while compiling.
           # So that package points to other directory.
+          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
           'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2
--- /dev/null
+++ b/tools/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/timeline.py b/tools/timeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1d1dae2bd85e85b069dc41bdd3e3e54579f9c49
--- /dev/null
+++ b/tools/timeline.py
@@ -0,0 +1,166 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import sys
+import unittest
+
+import google.protobuf.text_format as text_format
+import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--profile_path', type=str, default='', help='Input profile file name.')
+parser.add_argument(
+    '--timeline_path', type=str, default='', help='Output timeline file name.')
+args = parser.parse_args()
+
+
+class _ChromeTraceFormatter(object):
+    def __init__(self):
+        self._events = []
+        self._metadata = []
+
+    def _create_event(self, ph, category, name, pid, tid, timestamp):
+        """Creates a new Chrome Trace event.
+
+        For details of the file format, see:
+        https://github.com/catapult-project/catapult/blob/master/tracing/README.md
+
+        Args:
+          ph:  The type of event - usually a single character.
+          category: The event category as a string.
+          name:  The event name as a string.
+          pid:  Identifier of the process generating this event as an integer.
+          tid:  Identifier of the thread generating this event as an integer.
+          timestamp:  The timestamp of this event as a long integer.
+
+        Returns:
+          A JSON compatible event object.
+        """
+        event = {}
+        event['ph'] = ph
+        event['cat'] = category
+        event['name'] = name
+        event['pid'] = pid
+        event['tid'] = tid
+        event['ts'] = timestamp
+        return event
+
+    def emit_pid(self, name, pid):
+        """Adds a process metadata event to the trace.
+
+        Args:
+          name:  The process name as a string.
+          pid:  Identifier of the process as an integer.
+        """
+        event = {}
+        event['name'] = 'process_name'
+        event['ph'] = 'M'
+        event['pid'] = pid
+        event['args'] = {'name': name}
+        self._metadata.append(event)
+
+    def emit_region(self, timestamp, duration, pid, tid, category, name, args):
+        """Adds a region event to the trace.
+
+        Args:
+          timestamp:  The start timestamp of this region as a long integer.
+          duration:  The duration of this region as a long integer.
+          pid:  Identifier of the process generating this event as an integer.
+          tid:  Identifier of the thread generating this event as an integer.
+          category: The event category as a string.
+          name:  The event name as a string.
+          args:  A JSON-compatible dictionary of event arguments.
+        """
+        event = self._create_event('X', category, name, pid, tid, timestamp)
+        event['dur'] = duration
+        event['args'] = args
+        self._events.append(event)
+
+    def format_to_string(self, pretty=False):
+        """Formats the chrome trace to a string.
+
+        Args:
+          pretty: (Optional.)  If True, produce human-readable JSON output.
+
+        Returns:
+          A JSON-formatted string in Chrome Trace format.
+        """
+        trace = {}
+        trace['traceEvents'] = self._metadata + self._events
+        if pretty:
+            return json.dumps(trace, indent=4, separators=(',', ': '))
+        else:
+            return json.dumps(trace, separators=(',', ':'))
+
+
+class Timeline(object):
+    def __init__(self, profile_pb):
+        self._profile_pb = profile_pb
+        self._pid = 0
+        self._devices = dict()
+        self._chrome_trace = _ChromeTraceFormatter()
+
+    def _allocate_pid(self):
+        cur_pid = self._pid
+        self._pid += 1
+        return cur_pid
+
+    def _allocate_pids(self):
+        for event in self._profile_pb.events:
+            if event.device_id not in self._devices:
+                pid = self._allocate_pid()
+                self._devices[event.device_id] = pid
+                if event.device_id >= 0:
+                    self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
+                                                (pid, event.stream_id), pid)
+                elif event.device_id == -1:
+                    self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
+                                                event.stream_id, pid)
+
+    def _allocate_events(self):
+        for event in self._profile_pb.events:
+            pid = self._devices[event.device_id]
+            args = {'name': event.name}
+            if event.memcopy.bytes > 0:
+                args = {'mem_bytes': event.memcopy.bytes}
+            # TODO(panyx0718): Chrome tracing only handles ms. However, some
+            # ops takes micro-seconds. Hence, we keep the ns here.
+            self._chrome_trace.emit_region(event.start_ns,
+                                           (event.end_ns - event.start_ns) /
+                                           1.0, pid, 0, 'Op', event.name, args)
+
+    def generate_chrome_trace(self):
+        self._allocate_pids()
+        self._allocate_events()
+        return self._chrome_trace.format_to_string()
+
+
+profile_path = '/tmp/profile'
+if args.profile_path:
+    profile_path = args.profile_path
+timeline_path = '/tmp/timeline'
+if args.timeline_path:
+    timeline_path = args.timeline_path
+
+with open(profile_path, 'r') as f:
+    profile_s = f.read()
+    profile_pb = profiler_pb2.Profile()
+    text_format.Merge(profile_s, profile_pb)
+
+tl = Timeline(profile_pb)
+with open(timeline_path, 'w') as f:
+    f.write(tl.generate_chrome_trace())