提交 6540cda1 编写于 作者: D dongzhihong

Merge remote-tracking branch 'origin/develop' into feature/recordio

......@@ -4,13 +4,17 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
addons:
apt:
packages:
......@@ -41,8 +45,10 @@ before_install:
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
# 43min timeout
if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
......
| Github account | name |
|---|---|
| abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
| JiayiFeng | Jia-Yi Feng |
......
......@@ -19,7 +19,7 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
project(paddle CXX C)
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
......@@ -166,16 +166,16 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES}
${GLOG_LIBRARIES}
gflags
glog
${CBLAS_LIBRARIES}
${PROTOBUF_LIBRARY}
${ZLIB_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
if(WITH_GPU)
include(cuda)
include(cuda)
endif(WITH_GPU)
if(WITH_MKLML)
......@@ -202,17 +202,18 @@ endif()
# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
# placed after this block, because they depend on it.
if(WITH_GOLANG)
enable_language(Go)
add_subdirectory(go)
endif(WITH_GOLANG)
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
add_subdirectory(paddle)
if(WITH_PYTHON)
add_subdirectory(python)
add_subdirectory(python)
endif()
if(WITH_DOC)
......
......@@ -21,16 +21,6 @@ RUN apt-get update && \
wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
apt-get clean -y
# Install Go and glide
RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
# git credential to skip password typing
RUN git config --global credential.helper store
......
......@@ -59,7 +59,6 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
......@@ -77,9 +76,7 @@ else()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
......
......@@ -28,7 +28,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ExternalProject_Add(
zlib
extern_zlib
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/madler/zlib.git"
GIT_TAG "v1.2.8"
......@@ -49,9 +49,11 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
......
......@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
list(REMOVE_DUPLICATES libs_deps)
if(libs_deps)
list(REMOVE_DUPLICATES libs_deps)
endif()
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will help
......@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
# WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
else()
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
......
......@@ -72,7 +72,7 @@ copy(inference_lib DEPENDS paddle_fluid_shared
)
set(module "platform")
copy(platform_lib
copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
......
......@@ -9,7 +9,7 @@
为了编译PaddlePaddle,我们需要
1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统
1. Docker
2. Docker
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。
......@@ -189,7 +189,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "OFF"
"WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
BLAS
......
......@@ -9,7 +9,7 @@ Requirements
To build PaddlePaddle, you need
1. A computer -- Linux, Windows, MacOS.
1. Docker.
2. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
We run all the tools by running this image.
......@@ -191,7 +191,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "OFF"
"WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
......
Install Using pip
Install using pip
================================
You can use current widely used Python package management
......@@ -8,7 +8,7 @@ most of current Linux systems or MacOS.
.. _pip_install:
Install Using pip
Install using pip
------------------------------
Run the following command to install PaddlePaddle on the current
......
......@@ -39,15 +39,16 @@ In the backward pass
This implementation allows writing mixed-device programs like this
```python
# get embedding feature on CPU
feature = some_cpu_only_op(data)
W1 = fluid.tensor(size=[100,20], parameter=true)
W2 = fluid.tensor(size=[20,15], parameter=true)
gpu_places = get_place(use_gpu=True)
data = layers.data()
gpu_places = layers.get_place(use_gpu=True)
# parallel processing on multiple GPUs
pd = ParallelDo(gpu_places)
with pd.do():
read_input(feature)
prediction = my_net(feature)
with pd.do(input=data):
prediction = softmax(fc(fc(data, W1), W2))
write_output(prediction)
prediction = pd()
loss = cross_entropy(prediction, label)
......@@ -66,20 +67,20 @@ start_program
main_program
{
block0 {
vars: data, places, w1, w2
vars: data, places, w1, w2, w1_grad, w2_grad,
ops: data, get_place, parallel_do(block1),
parallel_do_grad(block2),
sgd(w2, w2_grad),
sgd(w1, w1_grad)
}
block1 {
block1 { # the forward pass
parent_block: 0
vars: data, h1, h2, loss
ops: fc, fc, softmax
}
block2 {
block2 { # the backward pass
parent_block: 1
vars: data_grad, h1_grad, h2_grad, loss_grad, w1_grad, w2_grad
vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
ops: softmax_grad,
fc_grad
fc_grad
......
......@@ -222,6 +222,7 @@ upstream
## 提交代码的一些约定
为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。
2. 提交Pull Request前:
- 请注意commit的数量:
......@@ -231,6 +232,7 @@ upstream
3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
此外,在回复评审人意见时,请您遵守以下约定:
1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢):
- 对评审意见同意且按其修改完的,给个简单的`Done`即可;
- 对评审意见不同意的,请给出您自己的反驳理由。
......
......@@ -4,6 +4,5 @@ Development
.. toctree::
:maxdepth: 1
new_layer_en.rst
contribute_to_paddle_en.md
write_docs_en.rst
......@@ -16,7 +16,7 @@
$ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
$ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_cn.html>`_ 。
2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
......
############################
Install, Build and Unit test
############################
TBD
###############################
Cluster Training and Prediction
###############################
TBD
FAQ
====
本文档对PaddlePaddle的一些常见问题提供了解答。如果您的问题未在此处列出,请您到 `PaddlePaddle社区 <https://github.com/PaddlePaddle/Paddle/issues>`_ 查找答案或直接提 `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_ ,我们会及时进行回复。
.. toctree::
:maxdepth: 1
......
FAQ
====
.. toctree::
:maxdepth: 1
build_and_install/index_en.rst
model/index_en.rst
parameter/index_en.rst
local/index_en.rst
cluster/index_en.rst
......@@ -148,10 +148,10 @@ Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异
.. code-block:: python
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
具体可以参考 `nmt_without_attention <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
......@@ -159,13 +159,13 @@ optimizer = paddle.optimizer.RMSProp(
.. code-block:: python
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
decoder_inputs = paddle.layer.fc(
act=paddle.activation.Linear(),
size=decoder_size * 3,
bias_attr=False,
input=[context, current_word],
layer_attr=paddle.attr.ExtraLayerAttribute(
error_clipping_threshold=100.0))
完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
......
#############################
Local Training and Prediction
#############################
TBD
###################
Model Configuration
###################
TBD
......@@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
obj="process",
args={"src_dict_path": src_dict_path})
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ 示例。
#################
Parameter Setting
#################
TBD
......@@ -5,3 +5,4 @@ GET STARTED
:maxdepth: 1
quickstart_en.rst
concepts/use_concepts_en.rst
C-API Prediction Library
========================
.. toctree::
:maxdepth: 1
compile_paddle_lib_en.md
organization_of_the_inputs_en.md
workflow_of_capi_en.md
......@@ -4,12 +4,15 @@
### 输入/输出数据类型
在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为:
1. 一维整型数组
1. 二维浮点型矩阵
- 稠密矩阵
- 稀疏矩阵
说明:
1. 一维数组**仅支持整型值**
- 常用于自然语言处理任务,例如:表示词语在词典中的序号;
- 分类任务中类别标签;
......@@ -274,6 +277,7 @@ PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织
如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。
于是,在组织神经网络输入时,需要思考完成以下工作:
1. 为每一个输入/输出创建`argument`
- C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)
1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
......
## Input/Output Data Organization
TBD
......@@ -11,6 +11,7 @@
</p>
- 准备预测模型
1. 只将神经网络结构进行序列化。
- 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。
1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。
......@@ -18,6 +19,7 @@
- 预测时只需加载一个文件便于发布。
- **注意**:以上两种方式只需选择其一即可。
- 调用 C-API 开发预测程序
1. 初始化PaddlePaddle运行环境。
1. 加载预测模型。
1. 创建神经网络输入,组织输入数据。
......@@ -90,6 +92,7 @@
1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
- 注意事项
1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。
1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
1. 通过灵活使用以上两个接口,还可以用其它多种方式加载模型,例如在程序运行过程中再加载另外一个模型。
......@@ -106,6 +109,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输
这篇文档之后的部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
在组织神经网络输入,获取输出时,需要思考完成以下工作:
1. 为每一个输入/输出创建`argument`
1. 为每一个`argument`创建`paddle_matrix`来存储数据;
......
## 启动参数说明
# 启动参数说明
下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
### 启动参数服务器
## 启动参数服务器
执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
```bash
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
```
如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行:
```bash
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
```
......@@ -20,8 +23,10 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
### 启动计算节点
## 启动计算节点
执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py)
```bash
$ python train.py
```
......@@ -67,7 +72,7 @@ paddle.init(
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
### 准备数据集
## 准备数据集
参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
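下面是一个示意性的切分片段(并非 `prepare.py` 原文,文件名、变量名仅作举例),按行轮流把样本写入 `SPLIT_COUNT` 个分片文件:

```python
SPLIT_COUNT = 3  # 示例值,通常与 trainer 节点个数保持一致

def split_file(src_path):
    # 为每个 trainer 生成一个分片文件,例如 train.txt-00000
    outs = [open("%s-%05d" % (src_path, i), "w") for i in range(SPLIT_COUNT)]
    with open(src_path) as f:
        for line_id, line in enumerate(f):
            # 按行轮流写入各个分片
            outs[line_id % SPLIT_COUNT].write(line)
    for out in outs:
        out.close()

split_file("train.txt")
split_file("test.txt")
```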
......@@ -84,7 +89,8 @@ for f in flist:
```
示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀分别为`-00000`、`-00001`、`-00002`):
```
```bash
train.txt
train.txt-00000
train.txt-00001
......@@ -99,12 +105,13 @@ test.txt-00002
对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
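作为参考,下面给出一个读取分片文本文件的 `reader()` 示意(数据格式、字段划分均为假设,需按实际任务调整):

```python
def cluster_reader(file_list):
    # file_list:分配给当前 trainer 的分片文件列表(假设已按 trainer_id 选好)
    def reader():
        for path in file_list:
            with open(path) as f:
                for line in f:
                    # 假设每行为空格分隔的整数词 id,最后一列为标签
                    ids = [int(x) for x in line.split()]
                    yield ids[:-1], ids[-1]
    return reader
```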
### 准备训练程序
## 准备训练程序
我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
最后,工作空间应如下所示:
```
```bash
.
|-- my_lib.py
|-- word_dict.pickle
......@@ -133,3 +140,21 @@ test.txt-00002
- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
- `test_data_dir`:包含测试数据集的目录。
## 异步 SGD 更新
我们可以通过设置 optimizer 的参数使之支持异步SGD更新。
例如,设置 `AdaGrad` optimizer 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数:
```python
adagrad = paddle.optimizer.AdaGrad(
is_async=True,
async_lagged_grad_discard_ratio=1.6,
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
```
- `is_async`: 是否为异步SGD更新模式。
- `async_lagged_grad_discard_ratio`: 异步SGD更新中滞后梯度的丢弃控制:当接收到的gradient数量超过
  `async_lagged_grad_discard_ratio * num_gradient_servers` 之后,后面滞后的gradient
  将会被抛弃。
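下面是把上述 `adagrad` 传给 v2 API trainer 的一个示意用法(`cost`、`parameters` 假设已由网络配置得到,仅作说明):

```python
import paddle.v2 as paddle

# cost、parameters 假设已由网络配置和 paddle.parameters.create 得到
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=adagrad)
```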
## Command-line arguments
# Command-line arguments
We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
### Starting parameter server
## Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
```bash
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
```
If you wish to run parameter servers in the background and save a log file, you can type:
```bash
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
```
Parameter Description
......@@ -21,8 +22,10 @@ Parameter Description
- ports_num: **required, default 1**, total number of ports the parameter server will listen on.
- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
- num_gradient_servers: **required, default 1**, total number of gradient servers.
- nics: **optional, default xgbe0,xgbe1**, network device name which the parameter server will listen on.
## Starting trainer
### Starting trainer
Type the command below to start the trainer (name the file whatever you want, like "train.py")
```bash
......@@ -70,7 +73,7 @@ Parameter Description
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
### Prepare Training Dataset
## Prepare Training Dataset
Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py): it will download the public `imikolov` dataset and split it into multiple files according to the job parallelism (trainer count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the number of output files.
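As a rough sketch (not the actual `prepare.py`; file names and variable names are illustrative), a round-robin split into `SPLIT_COUNT` shards could look like this:

```python
SPLIT_COUNT = 3  # example value, usually equal to the number of trainers

def split_file(src_path):
    # create one shard file per trainer, e.g. train.txt-00000
    outs = [open("%s-%05d" % (src_path, i), "w") for i in range(SPLIT_COUNT)]
    with open(src_path) as f:
        for line_id, line in enumerate(f):
            # distribute lines round-robin across the shards
            outs[line_id % SPLIT_COUNT].write(line)
    for out in outs:
        out.close()

split_file("train.txt")
split_file("test.txt")
```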
......@@ -88,7 +91,7 @@ for f in flist:
Example code `prepare.py` will split training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001` and `-00002`:
```
```bash
train.txt
train.txt-00000
train.txt-00001
......@@ -103,13 +106,13 @@ When job started, every trainer needs to get it's own part of data. In some dist
Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their jobs.
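For reference, a minimal `reader()` sketch over text shards could look like the following (the line format and field layout are assumptions and must be adapted to the actual task):

```python
def cluster_reader(file_list):
    # file_list: the shard files assigned to this trainer (assumed chosen by trainer_id)
    def reader():
        for path in file_list:
            with open(path) as f:
                for line in f:
                    # assume each line holds space-separated integer word ids,
                    # with the label in the last column
                    ids = [int(x) for x in line.split()]
                    yield ids[:-1], ids[-1]
    return reader
```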
### Prepare Training program
## Prepare Training program
We'll create a *workspace* directory on each node, storing your training program, dependencies, and the mounted or downloaded dataset directory.
Your workspace may look like:
```
```bash
.
|-- my_lib.py
|-- word_dict.pickle
......@@ -138,3 +141,21 @@ Your workspace may looks like:
- `train_data_dir`: containing training data. Mount from a storage service or copy training data here.
- `test_data_dir`: containing testing data.
## Async SGD Update
We can set some parameters of the optimizer to make it support async SGD update.
For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
```python
adagrad = paddle.optimizer.AdaGrad(
is_async=True,
async_lagged_grad_discard_ratio=1.6,
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
```
- `is_async`: Is Async-SGD or not.
- `async_lagged_grad_discard_ratio`: controls when lagged gradients are discarded in async SGD.
  Once `async_lagged_grad_discard_ratio * num_gradient_servers` gradient commits have been received,
  any later (lagged) gradient will be discarded silently.
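To actually train with this optimizer, the `adagrad` object above is passed to the v2 trainer as `update_equation`; a minimal sketch (assuming `cost` and `parameters` are already defined by the network configuration) might look like:

```python
import paddle.v2 as paddle

# cost and parameters are assumed to come from the network configuration
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=adagrad)
```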
# Cluster Training Using Fabric
# Fabric
## Prepare a Linux cluster
......
在不同集群中运行
================
用户的集群环境不尽相同,为了方便大家的部署,我们提供了多种的集群部署方式,方便提交集群训练任务,以下将一一介绍:
PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
`Kubernetes <http://kubernetes.io>`_ 是Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。以下指南展示了PaddlePaddle对Kubernetes的支持:
对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
.. toctree::
:maxdepth: 1
k8s_cn.md
k8s_distributed_cn.md
在使用分布式计算平台进行训练时,任务被调度到集群中后,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
`OpenMPI <https://www.open-mpi.org>`_ 是成熟的高性能并行计算框架,在HPC领域使用非常的广泛。以下指南介绍了如何使用OpenMPI来搭建PaddlePaddle的集群训练任务:
.. toctree::
:maxdepth: 1
fabric_cn.md
openmpi_cn.md
k8s_cn.md
k8s_distributed_cn.md
`Fabric <http://www.fabfile.org>`_ 是一个方便的程序部署和管理工具。我们提供了使用Fabric 进行部署、管理的方法,如果想详细了解,请阅读以下指南:
.. toctree::
:maxdepth: 1
fabric_cn.md
我们也支持在AWS上部署PaddlePaddle,详细请了解:
.. toctree::
:maxdepth: 1
k8s_aws_cn.md
您可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到以上相关的例子。
# Distributed PaddlePaddle Training on AWS with Kubernetes
# Kubernetes on AWS
We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
......
# PaddlePaddle On Kubernetes
# Kubernetes
In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.
......
# 在OpenMPI集群中提交训练作业
# 在OpenMPI集群中启动训练
## 准备OpenMPI集群
......
# Cluster Training Using OpenMPI
# OpenMPI
## Prepare an OpenMPI cluster
......
Layers supporting hierarchical sequence as input
================================================
TBD
API comparision between RNN and hierarchical RNN
================================================
TBD
......@@ -5,3 +5,6 @@ RNN Models
:maxdepth: 1
rnn_config_en.rst
recurrent_group_en.md
hierarchical_layer_en.rst
hrnn_rnn_api_compare_en.rst
# Recurrent Group Tutorial
TBD
......@@ -8,3 +8,4 @@ PaddlePaddle Documentation
build_and_install/index_en.rst
howto/index_en.rst
dev/index_en.rst
faq/index_en.rst
......@@ -23,6 +23,12 @@ $ docker build -t username/paddle-android:dev . -f Dockerfile.android
$ docker pull paddlepaddle/paddle:latest-dev-android
```
对于国内用户,我们提供了加速访问的镜像源:
```bash
$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
```
### 编译PaddlePaddle C-API库
构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数:
......@@ -56,15 +62,15 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
- 编译`armeabi-v7a`、`Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- 编译`arm64-v8a`、`Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21,因此当`ANDROID_ABI=arm64-v8a`且`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
......@@ -155,7 +161,11 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
用户还可根据自己的需求设置其他编译参数。
- 设置`CMAKE_BUILD_TYPE`为`MinSizeRel`,最小化生成的库的大小。
- 设置`CMAKE_BUILD_TYPE`为`Release`,获得最快的执行速度。
- 用户亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议:
......
......@@ -25,6 +25,12 @@ Users can directly use the published Docker image.
$ docker pull paddlepaddle/paddle:latest-dev-android
```
For users in China, we provide a faster mirror.
```bash
$ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
......@@ -86,19 +92,19 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
```
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
```
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
```
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
......
......@@ -56,7 +56,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
shape_inference data_transform lod_tensor profiler)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
......@@ -80,7 +80,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler feed_fetch_method)
framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......@@ -96,3 +96,6 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
......@@ -91,6 +91,11 @@ class ChannelHolder {
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
PADDLE_ENFORCE_EQ(IsInitialized(), true);
return holder_->Type();
}
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
......
......@@ -542,3 +542,341 @@ TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
ChannelHolderSendReceive(ch);
delete ch;
}
TEST(ChannelHolder, ChannelUninitializedTest) {
ChannelHolder *ch = new ChannelHolder();
EXPECT_EQ(ch->IsInitialized(), false);
int i = 10;
EXPECT_EQ(ch->Send(&i), false);
EXPECT_EQ(ch->Receive(&i), false);
bool is_exception = false;
try {
ch->Type();
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, ChannelInitializedTest) {
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(2);
EXPECT_EQ(ch->IsInitialized(), true);
// Channel should remain initialized even after close
ch->close();
EXPECT_EQ(ch->IsInitialized(), true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchSendTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool boolean_data = true;
try {
ch->Send(&boolean_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Send(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
TEST(ChannelHolder, TypeMismatchReceiveTest) {
// Test with unbuffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(0);
bool is_exception = false;
bool float_data;
try {
ch->Receive(&float_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
// Test with Buffered Channel
ch = new ChannelHolder();
ch->Reset<float>(10);
is_exception = false;
int int_data = 23;
try {
ch->Receive(&int_data);
} catch (paddle::platform::EnforceNotMet e) {
is_exception = true;
}
EXPECT_EQ(is_exception, true);
delete ch;
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// Explicitly close the channel
// This should unblock all receivers
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly close the channel
// This should unblock all senders
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
// any receivers waiting on the channel
TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksReceiversTest(ch);
delete ch;
}
// This tests that closing a channelholder unblocks
// any senders waiting for channel to have write space
TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// Check for buffered channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderCloseUnblocksSendersTest(ch, true);
delete ch;
// Check for unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderCloseUnblocksSendersTest(ch, false);
delete ch;
}
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
[&](bool *ended, bool *success) {
int data = 10;
*success = ch->Send(&data);
*ended = true;
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (thread_ended[i] == false) ct++;
}
// At least 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
// Explicitly destroy the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
// In unbuffered channel, no send should be successful
EXPECT_EQ(ct, 0);
}
// Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
int data;
// All reads should return false
EXPECT_EQ(ch->Receive(&data), false);
*p = true;
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockReceivers(ch);
// ch is already deleted in
// ChannelHolderDestroyUnblockReceivers
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockReceivers(ch);
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// Check for Buffered Channel
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(1);
ChannelHolderDestroyUnblockSenders(ch, true);
// ch is already deleted in
// ChannelHolderDestroyUnblockSenders
// Check for Unbuffered channel
ch = new ChannelHolder();
ch->Reset<int>(0);
ChannelHolderDestroyUnblockSenders(ch, false);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
USE_NO_KERNEL_OP(go);
USE_NO_KERNEL_OP(channel_close);
USE_NO_KERNEL_OP(channel_create);
USE_NO_KERNEL_OP(channel_recv);
USE_NO_KERNEL_OP(channel_send);
USE_NO_KERNEL_OP(elementwise_add);
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace paddle {
namespace framework {
template <typename T>
void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
T value) {
// Create LoDTensor<int> of dim [1,1]
auto var = scope.Var(name);
auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize({1, 1});
T *expect = tensor->mutable_data<T>(place);
expect[0] = value;
}
void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
// Create channel variable
scope.Var("Channel");
// Create Variables, x0 will be put into channel,
// result will be pulled from channel
CreateIntVariable(scope, place, "Status", false);
CreateIntVariable(scope, place, "x0", 99);
CreateIntVariable(scope, place, "result", 0);
}
void AddOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, AttributeMap attrs,
BlockDesc *block) {
// insert op
auto op = block->AppendOp();
op->SetType(type);
for (auto &kv : inputs) {
op->SetInput(kv.first, kv.second);
}
for (auto &kv : outputs) {
op->SetOutput(kv.first, kv.second);
}
op->SetAttrMap(attrs);
}
TEST(Concurrency, Go_Op) {
Scope scope;
p::CPUPlace place;
// Initialize scope variables
InitTensorsInScope(scope, place);
framework::Executor executor(place);
ProgramDesc program;
BlockDesc *block = program.MutableBlock(0);
// Create channel OP
AddOp("channel_create", {}, {{"Out", {"Channel"}}},
{{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
block);
// Create Go Op routine
BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
{{"Status", {"Status"}}}, {}, goOpBlock);
// Create Go Op
AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
block);
// Create Channel Receive Op
AddOp("channel_recv", {{"Channel", {"Channel"}}},
{{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
// Create Channel Close Op
AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
// Check the result tensor to make sure it is set to 0
const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
auto *initialData = tensor.data<int>();
EXPECT_EQ(initialData[0], 0);
executor.Run(program, &scope, 0, true, true);
// After we call executor.Run, the Go operator should do a channel_send to
// set the "result" variable to 99
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99);
}
} // namespace framework
} // namespace paddle
......@@ -26,12 +26,15 @@ Dim<i> make_dim(const int64_t* d) {
}
template <>
Dim<1> make_dim<1>(const int64_t* d) {
return Dim<1>(*d);
Dim<0> make_dim<0>(const int64_t* d) {
return Dim<0>(*d);
}
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 0:
ddim = make_dim<0>(dims);
break;
case 1:
ddim = make_dim<1>(dims);
break;
......@@ -190,7 +193,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
this->operator()(t.tail);
}
void operator()(const Dim<1>& t) { vector.push_back(t.head); }
void operator()(const Dim<0>& t) {}
};
/// @endcond
......@@ -247,9 +250,8 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
}
}
void operator()(const Dim<1>& dim) {
PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
vector.push_back(dim.head);
void operator()(const Dim<0>& dim) {
PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
}
};
......
......@@ -30,8 +30,8 @@ namespace framework {
* The number of dimensions must be between [0, 9].
*/
struct DDim {
typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
Dim<8>, Dim<9>>
typedef boost::variant<Dim<0>, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
Dim<7>, Dim<8>, Dim<9>>
DDimVar;
DDimVar var;
......
......@@ -72,38 +72,36 @@ struct Dim {
// Base case specialization
template <>
struct Dim<1> {
static constexpr int dimensions = 1;
struct Dim<0> {
static constexpr int dimensions = 0;
HOSTDEVICE
Dim(int64_t _head) : head(_head) {}
Dim(int64_t _head) {}
HOSTDEVICE
Dim() : head(0) {}
Dim() {}
HOSTDEVICE
Dim(int idx, const Dim<1>& size) : head(idx) {
Dim(int idx, const Dim<0>& size) {
#ifndef __CUDA_ARCH__
if (idx >= size.head) {
if (idx > 0) {
throw std::invalid_argument("Index out of range.");
}
#else
PADDLE_ASSERT(idx < size.head);
PADDLE_ASSERT(idx == 0);
#endif
}
HOSTDEVICE
bool operator==(const Dim<1>& o) const { return (head == o.head); }
bool operator==(const Dim<0>& o) const { return true; }
HOSTDEVICE
bool operator!=(const Dim<1>& o) const { return !(*this == o); }
bool operator!=(const Dim<0>& o) const { return false; }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
int64_t head;
};
namespace {
......@@ -154,15 +152,14 @@ HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
}
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(idx == 0);
PADDLE_ASSERT(false);
#endif
return dim.head;
static int64_t head = 0;
return head;
}
template <int D>
......@@ -181,15 +178,14 @@ HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
}
throw std::invalid_argument("Invalid index");
#else
PADDLE_ASSERT(idx == 0);
PADDLE_ASSERT(false);
#endif
return dim.head;
static int64_t head = 0;
return head;
}
} // namespace
......@@ -218,12 +214,12 @@ HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
}
// Dynamic access to constant Dim
inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
......@@ -251,8 +247,8 @@ HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
return a.head * b.head;
HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
return 0;
}
// Product of a Dim
......@@ -264,8 +260,8 @@ HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
return prod * a.head;
HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
return prod;
}
// Is 0 <= idx_i < size_i for all i?
......@@ -278,8 +274,8 @@ HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) {
// Base case of is 0 <= idx_i < size_i ?
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
return ((0 <= idx.head) && (idx.head < size.head));
HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
return true;
}
/**
......@@ -294,8 +290,8 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) {
// Base case of ex_prefix_mul
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
return Dim<1>(mul);
HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
return Dim<0>();
}
///\endcond
......@@ -309,8 +305,8 @@ HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) {
// Base case
template <>
HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) {
return Dim<1>(a.head + b.head);
HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
}
template <int i>
......@@ -328,8 +324,8 @@ HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) {
// Base case
template <>
HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) {
return Dim<1>(a.head * b.head);
HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
return Dim<0>();
}
template <int i>
......@@ -356,10 +352,9 @@ HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) {
///\cond HIDDEN
template <>
HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
const Dim<1>& stride) {
int norm_stride = size.head == 1 ? 0 : stride.head;
return Dim<1>(norm_stride);
HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
const Dim<0>& stride) {
return Dim<0>();
}
///\endcond
......@@ -394,6 +389,10 @@ typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
return os;
}
inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
return os;
}
template <int i>
HOST std::string Dim<i>::to_string() const {
std::stringstream stream;
......
......@@ -25,7 +25,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
......@@ -126,11 +125,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
// TODO(panyx0718): Need a program id to distinguish programs.
platform::RecordEvent record_event(op->Type(), pool.Get(place_),
op_desc->Block()->ID());
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
......
......@@ -117,6 +117,7 @@ message VarType {
// raw variables should manage their own allocations
// in operators like nccl_op
RAW = 17;
TUPLE = 18;
}
required Type type = 1;
......@@ -148,6 +149,9 @@ message VarType {
required int64 capacity = 2;
}
optional ChannelDesc channel = 6;
message Tuple { repeated Type element_type = 1; }
optional Tuple tuple = 7;
}
message VarDesc {
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
......@@ -497,7 +498,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(place);
// profile
platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
typedef boost::variant<int, int64_t, float, double, std::string, Tensor,
LoDTensor /*, ChannelHolder*/>
ElementVar;
class Tuple {
public:
using ElementVars = std::vector<ElementVar>;
Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {}
Tuple(std::vector<ElementVar>& var) : var_(var) {}
ElementVar get(int idx) const { return var_[idx]; };
ElementVar& get(int idx) { return var_[idx]; };
bool isSameType(Tuple& t) const;
size_t getSize() const { return var_.size(); };
private:
ElementVars var_;
std::vector<VarDesc> var_desc_;
};
bool Tuple::isSameType(Tuple& t) const {
size_t tuple_size = getSize();
if (tuple_size != t.getSize()) {
return false;
}
for (size_t j = 0; j < tuple_size; ++j) {
auto type1 = get(j).which();
auto type2 = t.get(j).which();
if (type1 != type2) return false;
}
return true;
}
Tuple* make_tuple(std::vector<ElementVar> tuple) { return new Tuple(tuple); }
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tuple.h"
TEST(Tuple, Make) {
std::vector<paddle::framework::ElementVar> element_type;
element_type.push_back(12);
element_type.push_back(12.0f);
element_type.push_back("ElementVar");
paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
EXPECT_EQ(boost::get<int>(tuple->get(0)), 12);
EXPECT_EQ(boost::get<float>(tuple->get(1)), 12.0f);
EXPECT_EQ(boost::get<std::string>(tuple->get(2)), "ElementVar");
delete tuple;
}
TEST(Tuple, IsTheSameType) {
std::vector<paddle::framework::ElementVar> element_type1;
std::vector<paddle::framework::ElementVar> element_type2;
std::vector<paddle::framework::ElementVar> element_type3;
element_type1.push_back(12);
element_type1.push_back(12.0f);
element_type1.push_back("Tuple1");
element_type2.push_back(13);
element_type2.push_back(13.0f);
element_type2.push_back("Tuple2");
element_type3.push_back(14.0f);
element_type3.push_back(14);
element_type3.push_back("Tuple3");
paddle::framework::Tuple* tuple1 =
paddle::framework::make_tuple(element_type1);
paddle::framework::Tuple* tuple2 =
paddle::framework::make_tuple(element_type2);
paddle::framework::Tuple* tuple3 =
paddle::framework::make_tuple(element_type3);
EXPECT_TRUE(tuple1->isSameType(*tuple2));
EXPECT_FALSE(tuple1->isSameType(*tuple3));
delete tuple1;
delete tuple2;
delete tuple3;
}
......@@ -130,7 +130,7 @@ endif()
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......
......@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDist", dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
platform::CPUPlace());
}
};
template <typename T>
......
......@@ -37,12 +37,19 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
if (code_type == BoxCodeType::kEncodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
"The shape of TargetBox is [M, 4]");
} else if (code_type == BoxCodeType::kDecodeCenterSize) {
PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
"The rank of Input of TargetBox must be 3");
PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
}
ctx->SetOutputDim(
"OutputBox",
......@@ -70,25 +77,28 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
"of variance.");
AddInput(
"TargetBox",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the box if the input "
"is image feature map, they are close to the origin of the coordinate "
"system. [xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
"(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
"[N, 4] when code_type is 'encode_center_size'. This input also can "
"be a 3-D Tensor with shape [N, M, 4] when code_type is "
"'decode_center_size'. [N, 4], each box is represented as "
"[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
"of the box if the input is image feature map, they are close to "
"the origin of the coordinate system. [xmax, ymax] is the right "
"bottom coordinate of the box. This tensor can contain LoD "
"information to represent a batch of inputs. One instance of this "
"batch can contain different numbers of entities.");
AddAttr<std::string>("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
AddOutput(
"OutputBox",
"(LoDTensor or Tensor) "
"(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
"representing the result of N target boxes encoded/decoded with "
"M Prior boxes and variances.");
AddOutput("OutputBox",
"(LoDTensor or Tensor) "
"When code_type is 'encode_center_size', the output tensor of "
"box_coder_op with shape [N, M, 4] representing the result of N "
"target boxes encoded with M Prior boxes and variances. When "
"code_type is 'decode_center_size', N represents the batch size "
"and M represents the number of deocded boxes.");
AddComment(R"DOC(
Bounding Box Coder Operator.
......
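For reference, below is a minimal sketch of the center-size decoding that the CPU and GPU kernels in this change implement for a single (prior box, target box) pair. The helper name, the use of float, and the flat 4-element layout are illustrative assumptions, not part of the operator's API:

#include <cmath>
// Decode one encoded target box t against one prior box p (both laid out as
// [xmin, ymin, xmax, ymax]) with per-coordinate variances v, writing the
// decoded corners to out. Mirrors the arithmetic of DecodeCenterSizeKernel.
inline void DecodeCenterSizeRef(const float* p, const float* v, const float* t,
                                float* out) {
  float pw = p[2] - p[0], ph = p[3] - p[1];
  float pcx = (p[0] + p[2]) / 2, pcy = (p[1] + p[3]) / 2;
  float w = std::exp(v[2] * t[2]) * pw;
  float h = std::exp(v[3] * t[3]) * ph;
  float cx = v[0] * t[0] * pw + pcx;
  float cy = v[1] * t[1] * ph + pcy;
  out[0] = cx - w / 2;
  out[1] = cy - h / 2;
  out[2] = cx + w / 2;
  out[3] = cy + h / 2;
}
Encoding is the inverse transform: centers become offsets normalized by the prior box extent and variance, and widths/heights become logarithms of the extent ratio.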
......@@ -66,7 +66,6 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
......@@ -79,17 +78,16 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
target_box_data[row_idx * len + 2]) *
target_box_data[idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
target_box_data[row_idx * len + 3]) *
target_box_data[idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
target_box_data[row_idx * len] *
prior_box_width +
target_box_data[idx * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
target_box_data[row_idx * len + 1] *
target_box_data[idx * len + 1] *
prior_box_height +
prior_box_center_y;
......
......@@ -89,6 +89,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
......@@ -99,20 +100,19 @@ class BoxCoderKernel : public framework::OpKernel<T> {
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
target_box_data[i * len] * prior_box_width +
target_box_data[offset] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
target_box_data[i * len + 1] *
target_box_data[offset + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
target_box_data[i * len + 2]) *
target_box_data[offset + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
target_box_data[i * len + 3]) *
target_box_data[offset + 3]) *
prior_box_height;
size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/op_registry.h"
namespace pf = paddle::framework;
static constexpr char kChannel[] = "Channel";
namespace paddle {
namespace operators {
class ChannelCloseOp : public framework::OperatorBase {
public:
ChannelCloseOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &inp = *scope.FindVar(Input(kChannel));
// Get the mutable version of the channel variable and close it.
pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
ch->close();
}
};
class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("Channel"),
"The input of ChannelClose op must be set");
}
};
class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kChannel,
"The Channel Variable that should be closed by"
" the ChannelClose Op.");
AddComment(R"DOC(
Channel Close Operator.
This operator closes an open channel.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCloseOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace pf = paddle::framework;
static constexpr char kOutput[] = "Out";
namespace paddle {
namespace operators {
class ChannelCreateOp : public framework::OperatorBase {
public:
ChannelCreateOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto &out = *scope.FindVar(Output(kOutput));
// Determine the datatype and capacity of the channel to be created
// from the attributes provided.
auto dtype =
static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
auto capacity = Attr<int>("capacity");
// Based on the datatype, create a new channel holder initialized with
// the given capacity. When capacity is 0, an unbuffered channel is
// created.
pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
if (dtype == framework::proto::VarType::LOD_TENSOR) {
ch->Reset<pf::LoDTensor>(capacity);
} else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
ch->Reset<pf::SelectedRows>(capacity);
} else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
ch->Reset<pf::LoDRankTable>(capacity);
} else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
ch->Reset<pf::LoDTensorArray>(capacity);
} else if (dtype == framework::proto::VarType::READER) {
ch->Reset<pf::ReaderHolder>(capacity);
} else if (dtype == framework::proto::VarType::CHANNEL) {
ch->Reset<pf::ChannelHolder>(capacity);
} else if (dtype == framework::proto::VarType::BOOL) {
ch->Reset<bool>(capacity);
} else if (dtype == framework::proto::VarType::INT32) {
ch->Reset<int>(capacity);
} else if (dtype == framework::proto::VarType::INT64) {
ch->Reset<int64_t>(capacity);
} else if (dtype == framework::proto::VarType::FP32) {
ch->Reset<float>(capacity);
} else if (dtype == framework::proto::VarType::FP64) {
ch->Reset<double>(capacity);
} else {
PADDLE_THROW(
"Data type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
"READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
dtype);
}
}
};
class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasOutput(kOutput),
"The output of ChannelCreate op must be set");
context->SetOutputDim(kOutput, {1});
}
};
class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput(kOutput,
"The object of a Channel type created by ChannelCreate Op.");
AddAttr<int>("capacity", "The size of the buffer of Channel.")
.SetDefault(0);
AddAttr<int>("data_type", "The data type of elements inside the Channel.");
AddComment(R"DOC(
Channel Create Operator.
This operator creates an object of the VarType Channel and returns it.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelCreateOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char Status[] = "Status";
static constexpr char Out[] = "Out";
namespace paddle {
namespace operators {
void SetReceiveStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
// Get type of channel and use that to call mutable data for Variable
auto type = framework::ToVarType(ch->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Receive(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Receive(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Receive(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Receive(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Receive(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelReceive:Unsupported type");
}
class ChannelRecvOp : public framework::OperatorBase {
public:
ChannelRecvOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Out),
"Input(Channel) of ChannelRecvOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelRecvOp should not be null.");
ctx->SetOutputDim("Status", {1});
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto output_var = scope.FindVar(Output(Out));
// Receive the data from the channel.
bool ok = ChannelReceive(ch, output_var);
// Set the status output of the `ChannelReceive` call.
SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
}
};
class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(Channel,
"(Channel) A variable which \"receives\" the a value sent"
"to it by a channel_send op.")
.AsDuplicable();
AddOutput(Out,
"(Variable) Output Variable that will hold the data received"
" from the Channel")
.AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the receive operation.")
.AsDuplicable();
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelRecvOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <paddle/fluid/framework/lod_rank_table.h>
#include <paddle/fluid/framework/lod_tensor_array.h>
#include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle {
namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelSend:Unsupported type");
}
class ChannelSendOp : public framework::OperatorBase {
public:
ChannelSendOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(ctx->HasInput(Channel),
"Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
}
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
// Get the channel holder created by channel_create op, passed as input.
framework::ChannelHolder *ch =
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel.
bool ok = ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
}
};
class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(Channel,
"(Channel) A variable which \"sends\" the passed in value to "
"a listening receiver.")
.AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC(
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::ChannelSendOpMaker);
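Putting the three ops above together, the intended flow is channel_create → channel_send / channel_recv → channel_close on a shared ChannelHolder variable. A minimal host-side sketch using only the ChannelHolder calls that appear in these kernels (Reset, Send, Receive, close); constructing the holder directly, outside a Scope, is an assumption made for brevity:

#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace pf = paddle::framework;
void ChannelRoundTrip() {
  pf::ChannelHolder ch;
  ch.Reset<pf::LoDTensor>(2);        // buffered channel of LoDTensors, capacity 2
  pf::LoDTensor in, out;
  bool sent = ch.Send(&in);          // what channel_send does with its X input
  bool received = ch.Receive(&out);  // what channel_recv does into its Out variable
  ch.close();                        // what channel_close does
  (void)sent;
  (void)received;
}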
......@@ -54,12 +54,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
(dilations[i] * (filter_dims[i + 2] - 1) + 1) >
0,
"Due to the settings of paddings, filter_dims and "
"dilations, the output size is less than 0, please check "
"again.");
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
......
......@@ -31,7 +31,14 @@ using Tensor = framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
PADDLE_ENFORCE(
output_size > 0,
"Due to the settings of padding(%d), filter_size(%d), dilation(%d) and "
"stride(%d), the output size is less than 0, please check "
"again. Input_size:%d",
padding, filter_size, dilation, stride, input_size);
return output_size;
}
inline bool IsExpand(std::vector<int64_t>& filter_dim,
......
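As a worked example (hypothetical numbers): with input_size = 7, filter_size = 3, dilation = 2, padding = 0 and stride = 1, the dilated kernel extent is dkernel = 2 * (3 - 1) + 1 = 5, so output_size = (7 + 0 - 5) / 1 + 1 = 3; shrinking the input to 3 would give (3 - 5) / 1 + 1 = -1 and trigger the new PADDLE_ENFORCE instead of silently producing a negative dimension.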
......@@ -24,6 +24,29 @@ namespace detail {
template <typename T, int Rank>
struct StridedMemcpyFunctor;
template <typename T>
struct StridedMemcpyFunctor<T, 0> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
framework::Dim<0> dst_stride, T* dst) const {
auto place = dev_ctx.GetPlace();
if (platform::is_cpu_place(place)) {
auto& cpu_place = boost::get<platform::CPUPlace>(place);
memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
auto& gpu_place = boost::get<platform::CUDAPlace>(place);
auto& cuda_ctx =
reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
cuda_ctx.stream());
#else
PADDLE_THROW("Paddle is not compiled with GPU");
#endif
}
}
};
template <typename T>
struct StridedMemcpyFunctor<T, 1> {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
......
......@@ -65,12 +65,17 @@ smaller than or equal to the dimensions of $X$.
There are two cases for this operator:
1. The shape of $Y$ is same with $X$;
2. The shape of $Y$ is a subset of $X$.
2. The shape of $Y$ is a contiguous subsequence of $X$. Trailing dimensions
of size 1 in $Y$ are ignored when matching the subsequence.
For case 2:
$Y$ will be broadcasted to match the shape of $X$ and axis should be
set to index of the start dimension to broadcast $Y$ onto $X$.
If axis is -1, it is treated as axis=rank(X)-rank(Y).
For example
.. code-block:: python
......@@ -79,6 +84,7 @@ For example
shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
Either input ($X$ or $Y$), or neither, may carry the LoD (Level of Details)
information. However, the output only shares the LoD information with input $X$.
......
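As a concrete illustration of case 2 (using the pre/n/post decomposition that the elementwise kernels compute via get_mid_dims, shown later in this diff): for shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4) and axis = 1, X is viewed as pre = 2, n = 3 * 4 = 12, post = 5, and each of the 12 elements of Y is broadcast across the pre and post extents. For shape(Y) = (2, 1) with axis = 0, the trailing dimension of size 1 is trimmed first, leaving Y with shape (2), so pre = 1, n = 2, post = 3 * 4 * 5 = 60.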
......@@ -20,6 +20,7 @@ limitations under the License. */
#ifdef __NVCC__
#include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_helper.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif
......@@ -61,6 +62,19 @@ inline void get_mid_dims(const framework::DDim& x_dims,
}
}
inline void trim_trailing_singular_dims(framework::DDim& dims) {
// Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size();
for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break;
}
if (actual_dims_size != dims.size()) {
auto actual_dims = framework::vectorize(dims);
actual_dims.resize(actual_dims_size);
dims = framework::make_ddim(actual_dims);
}
}
template <typename T, typename DeviceContext>
class RowwiseTransformIterator;
template <typename T, typename DeviceContext>
......@@ -263,44 +277,6 @@ class TransformFunctor {
} \
}
template <class functor, typename DeviceContext, typename T>
void ElementwiseCompute(const framework::ExecutionContext& ctx) {
using Tensor = framework::Tensor;
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
auto x_dims = x->dims();
auto y_dims = y->dims();
PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
"Rank of first input must >= rank of second input.");
if (x_dims == y_dims) {
functor f;
f.template Run<DeviceContext, T>(x, y, z, ctx);
return;
}
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
if (post == 1) {
functor f;
f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
return;
} else {
functor f;
f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
return;
}
}
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD);
......@@ -361,13 +337,10 @@ template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w,
DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
extern __shared__ char shm_buffer[];
T* shm = reinterpret_cast<T*>(shm_buffer);
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
shm[tid] = 0;
T val = 0;
do {
int x_offset = i * w + j;
......@@ -375,22 +348,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy) {
shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
i += ELEMWISE_MAX_BLOCK_DIM;
} while (i < h);
if (dy) {
__syncthreads();
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
// Sum, could be optimized
val = platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) {
for (int k = 1; k < h; ++k) {
shm[0] += shm[k];
}
dy[j] = shm[0];
dy[j] = val;
}
}
}
......@@ -402,10 +369,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
int gird_size = w;
int shared_mem_size = block_size * sizeof(T);
ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, shared_mem_size,
stream>>>(x, y, out, dout, h, w, dx_op,
dy_op, dx, dy);
ElemwiseGradBroadcast1CUDAKernel<<<gird_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
}
#endif
......@@ -436,7 +401,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
}
#ifdef __NVCC__
template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast2CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int pre, int n,
......@@ -444,9 +408,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
extern __shared__ char shm_buffer[];
T* shm = reinterpret_cast<T*>(shm_buffer);
shm[tid] = 0;
T val = 0;
int ttid = tid;
while (true) {
......@@ -461,23 +423,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
}
if (dy != nullptr) {
shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
ttid += ELEMWISE_MAX_BLOCK_DIM;
}
if (dy) {
__syncthreads();
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
// Sum, could be optimized
if (tid == 0) {
for (int i = 1; i < h; ++i) {
shm[0] += shm[i];
}
dy[j] = shm[0];
val = platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) {
dy[j] = val;
}
}
}
......@@ -489,10 +446,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
DY_OP dy_op, T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
int gird_size = n;
int shared_mem_size = block_size * sizeof(T);
ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, shared_mem_size,
stream>>>(x, y, out, dout, pre, n, post,
dx_op, dy_op, dx, dy);
ElemwiseGradBroadcast2CUDAKernel<<<gird_size, block_size, 0, stream>>>(
x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
}
#endif
......@@ -516,14 +471,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
auto x_dim = x.dims();
auto y_dim = y.dims();
if (y_dim.size() == 1 && y_dim[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dim);
extended_dims.push_back(1);
x_dim = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
trim_trailing_singular_dims(y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, pre, n, post);
if (post == 1) {
......@@ -591,14 +542,9 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
return;
}
if (y_dims.size() == 1 && y_dims[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dims);
extended_dims.push_back(1);
x_dims = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
......@@ -633,16 +579,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
return;
}
if (y_dims.size() == 1 && y_dims[0] == 1) {
// y is a scalar
auto extended_dims = framework::vectorize(x_dims);
extended_dims.push_back(1);
x_dims = framework::make_ddim(extended_dims);
}
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using StepScopeVar = std::vector<framework::Scope *>;
static constexpr char kBlock[] = "sub_block";
static constexpr char kX[] = "X";
class GoOp : public framework::OperatorBase {
public:
GoOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
void ExecuteOnThread(framework::Executor *executor,
framework::BlockDesc *block,
framework::Scope *scope) const {
framework::ProgramDesc *program = block->Program();
executor->Run(*program, scope, block->ID(), false /*create_local_scope*/);
}
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
/*
* Determine the global scope. Create a new child scope.
* Within the child scope, add all the local variables relevant
* to that scope.
*
* Now go through all the inputs to the op to ensure that
* all of them are in the newly created scope. This is important
* to ensure that they don't get destroyed when the parent scope
* is deleted.
* */
// TODO(varunarora): Consider moving this root scope lookup to scope.h.
const framework::Scope *root_scope = &scope;
const framework::Scope *parent_scope = &(root_scope->parent());
while (parent_scope != nullptr) {
root_scope = parent_scope;
parent_scope = &(parent_scope->parent());
}
framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
framework::Executor executor(dev_place);
framework::Scope &new_scope = root_scope->NewScope();
for (auto &var : block->AllVars()) {
new_scope.Var(var->Name());
}
auto &inputs = Inputs(kX);
for (size_t i = 0; i < inputs.size(); i++) {
PADDLE_ENFORCE_NOT_NULL(new_scope.FindVar(inputs.at(i)),
"All variables used in the go block "
"should be created in the global scope");
}
// Now execute the go op with the newly created scope.
std::thread go_thread([dev_place, block, &new_scope, this]() {
framework::Executor executor(dev_place);
ExecuteOnThread(&executor, block, &new_scope);
});
go_thread.detach();
}
};
class GoOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kX,
"A set of variables, which are required by operators inside the "
"block of Go Op.")
.AsDuplicable();
AddAttr<framework::BlockDesc *>(kBlock, "The block inside GoOp");
AddComment(R"DOC(
)DOC");
}
};
// TODO(thuan): Look into Gradient Operator for GO_OP
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(go, paddle::operators::GoOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::GoOpMaker);
......@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::LoDTensor>("Scores")->type()),
ctx.device_context());
platform::CPUPlace());
}
};
......
......@@ -19,6 +19,11 @@ namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
PADDLE_ENFORCE(output_size > 0,
"Due to the settings of padding(%d), filter_size(%d) and "
"stride(%d), the output size is less than 0, please check "
"again. Input_size:%d",
padding, filter_size, stride, input_size);
return output_size;
}
......
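For instance (hypothetical numbers): input_size = 5, filter_size = 7, padding = 0 and stride = 1 give (5 - 7 + 0) / 1 + 1 = -1, which now fails the check rather than producing a negative output dimension downstream.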
......@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace());
}
};
class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -121,10 +121,15 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
ops::ReshapeGradOp);
REGISTER_OP_CPU_KERNEL(reshape,
ops::ReshapeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
reshape_grad, ops::ReshapeGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
ops::ReshapeKernel<CPU, double>,
ops::ReshapeKernel<CPU, int>,
ops::ReshapeKernel<CPU, int64_t>);
REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel<CPU, float>,
ops::ReshapeGradKernel<CPU, double>,
ops::ReshapeGradKernel<CPU, int>,
ops::ReshapeGradKernel<CPU, int64_t>);
......@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reshape_op.h"
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(
reshape,
paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
REGISTER_OP_CUDA_KERNEL(
reshape_grad,
paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel<CUDA, float>,
paddle::operators::ReshapeKernel<CUDA, double>,
paddle::operators::ReshapeKernel<CUDA, int>,
paddle::operators::ReshapeKernel<CUDA, int64_t>);
REGISTER_OP_CUDA_KERNEL(reshape_grad,
paddle::operators::ReshapeGradKernel<CUDA, float>,
paddle::operators::ReshapeGradKernel<CUDA, double>,
paddle::operators::ReshapeGradKernel<CUDA, int>,
paddle::operators::ReshapeGradKernel<CUDA, int64_t>);
......@@ -59,6 +59,16 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
}
};
class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
}
}
};
class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
......@@ -80,7 +90,8 @@ class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
ops::SplitSelectedRowsOpMaker,
ops::SplitSelectedRowsGradMaker);
ops::SplitSelectedRowsGradMaker,
ops::SplitSelectedRowsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_selected_rows,
ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
proto_library(profiler_proto SRCS profiler.proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(profiler_py_proto profiler_py_proto_init)
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if(WITH_GPU)
cc_library(enforce SRCS enforce.cc DEPS)
......
......@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
}
#endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// TODO(zcd): The warp size should be taken from the
// device properties of the GPU rather than hard-coded as 32.
// To make reduceSum more efficient, warp-level parallelism is
// used here under the assumption that the warp size is 32;
// this can differ between GPUs, but most cards use 32.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
} // namespace platform
} // namespace paddle
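The elementwise gradient kernels above use this helper in place of their former hand-rolled shared-memory loop: each thread folds its assigned elements into a private partial sum and the whole block then calls reduceSum once. A schematic kernel showing just the calling pattern (the kernel name and float element type are illustrative):

#include "paddle/fluid/platform/cuda_helper.h"
// Schematic: sum `len` floats of `in` into out[0] with one block.
// Launched with blockDim.x = min(1024, len), mirroring how the callers
// in elementwise_op_function.h choose their block size.
__global__ void BlockSumKernel(const float* in, int len, float* out) {
  float val = 0;
  for (int i = threadIdx.x; i < len; i += blockDim.x) val += in[i];
  val = paddle::platform::reduceSum(val, threadIdx.x, blockDim.x);
  if (threadIdx.x == 0) out[0] = val;
}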
......@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <google/protobuf/text_format.h>
#include <fstream>
#include <map>
#include <mutex>
#include <numeric>
#include <thread>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
......@@ -51,6 +55,36 @@ uint64_t kAlignSize = 8;
} \
} while (0)
std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
switch (kind) {
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
return "MEMCPY_HtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
return "MEMCPY_DtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
return "MEMCPY_HtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
return "MEMCPY_AtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOA:
return "MEMCPY_AtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
return "MEMCPY_AtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
return "MEMCPY_DtoA";
case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
return "MEMCPY_DtoD";
case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
return "MEMCPY_HtoH";
case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
return "MEMCPY_PtoP";
case CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT:
return "MEMCPY_FORCE_INT";
default:
break;
}
return "MEMCPY";
}
void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
......@@ -107,6 +141,26 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
kernel->correlationId);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_MEMCPY2: {
auto *memcpy =
reinterpret_cast<const CUpti_ActivityMemcpy2 *>(record);
tracer->AddMemRecords(
MemcpyKind(
static_cast<CUpti_ActivityMemcpyKind>(memcpy->copyKind)),
memcpy->start, memcpy->end, memcpy->deviceId, memcpy->streamId,
memcpy->correlationId, memcpy->bytes);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
......@@ -137,6 +191,20 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno;
}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
......@@ -172,17 +240,15 @@ class DeviceTracerImpl : public DeviceTracer {
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
proto::Profile GenProfile() {
proto::Profile GenProfile(const std::string &profile_path) {
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
......@@ -194,15 +260,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}
for (const auto &et : event_times) {
fprintf(
stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
et.first.c_str(),
std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0,
et.second.size());
for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id);
event->set_device_id(-1);
}
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_name(r.name);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc);
profile_f << profile_str;
profile_f.close();
return profile_pb;
}
......@@ -240,6 +322,8 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
......@@ -252,6 +336,12 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
......@@ -259,7 +349,9 @@ class DeviceTracerDummy : public DeviceTracer {
void Enable() {}
proto::Profile GenProfile() { return proto::Profile(); }
proto::Profile GenProfile(const std::string &profile_path) {
return proto::Profile();
}
void Disable() {}
};
......@@ -281,5 +373,7 @@ void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; }
const char *CurAnnotation() { return cur_annotation; }
} // namespace platform
} // namespace paddle
......@@ -36,6 +36,21 @@ class DeviceTracer {
uint32_t stream_id;
uint32_t correlation_id;
};
struct CPURecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint64_t thread_id;
};
struct MemRecord {
std::string name;
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
uint64_t bytes;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
......@@ -48,6 +63,14 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns,
uint64_t end_ns) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
......@@ -55,7 +78,7 @@ class DeviceTracer {
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled).
virtual proto::Profile GenProfile() = 0;
virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
virtual bool IsEnabled() = 0;
};
......@@ -67,6 +90,7 @@ DeviceTracer* GetDeviceTracer();
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
const char* CurAnnotation();
} // namespace platform
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
......
......@@ -74,7 +74,8 @@ extern void *cupti_dso_handle;
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
__macro(cuptiEnableCallback); \
__macro(cuptiEnableDomain);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@"
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <string>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(cudnn_dir, "",
......@@ -45,8 +46,7 @@ DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
namespace paddle {
namespace platform {
namespace dynload {
static const char* cupti_lib_path = CUPTI_LIB_PATH;
static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
static inline std::string join(const std::string& part1,
const std::string& part2) {
......
......@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <map>
#ifdef PADDLE_WITH_CUDA
......@@ -52,6 +54,12 @@ inline uint64_t GetTimeInNsec() {
.count();
}
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
......@@ -132,21 +140,23 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
}
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
int32_t block_id) {
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
: start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
PushEvent(name_, dev_ctx_);
full_name_ = string::Sprintf("%s_b%d", name, block_id);
// Maybe need the same push/pop behavior.
SetCurAnnotation(full_name_.c_str());
SetCurAnnotation(name_.c_str());
}
RecordEvent::~RecordEvent() {
ClearCurAnnotation();
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
}
......@@ -201,22 +211,22 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result;
}
void DisableProfiler(EventSortingKey sorted_key) {
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile();
tracer->GenProfile(profile_path);
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events,
......
......@@ -104,11 +104,11 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
int32_t block_id);
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent();
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name
......@@ -141,7 +141,8 @@ void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which is total event lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key);
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
......
......@@ -15,12 +15,17 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; }
message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
// When positive, it represents gpu id. When -1, it represents CPU.
optional int64 device_id = 5;
optional uint32 stream_id = 6;
optional MemCopy memcopy = 7;
}
message Profile {
......
......@@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) {
*/
for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx, 0);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
}
......@@ -125,5 +125,5 @@ TEST(RecordEvent, RecordEvent) {
EXPECT_EQ(start_profiler_count, 1);
// Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal);
DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
}
......@@ -58,7 +58,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
| `WITH_C_API` | OFF | Build capi libraries for inference. |
| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
......
......@@ -40,7 +40,7 @@ function cmake_gen() {
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-ON}
-DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-DWITH_SWIG_PY=ON
-DWITH_C_API=${WITH_C_API:-OFF}
......@@ -65,7 +65,7 @@ EOF
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-ON} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-DWITH_C_API=${WITH_C_API:-OFF} \
......
......@@ -31,7 +31,6 @@ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
BUILD_ROOT=/paddle/build_android
DEST_ROOT=/paddle/install_android
rm -rf $BUILD_ROOT 2>/dev/null || true
mkdir -p $BUILD_ROOT
cd $BUILD_ROOT
......
#!/bin/bash
set -e
ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
TMP_DIR=$HOME/$JOB/tmp
mkdir -p $TMP_DIR
cd $TMP_DIR
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
unzip -q android-ndk-r14b-linux-x86_64.zip
chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
cd $HOME
rm -rf $TMP_DIR
# Create the build directory for CMake.
mkdir -p $TRAVIS_BUILD_DIR/build_android
cd $TRAVIS_BUILD_DIR/build_android
# Compile paddle binaries
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=armeabi-v7a \
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
make -j `nproc`
......@@ -57,7 +57,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
if(WITH_SWIG_PY)
......
......@@ -11,27 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
class TestCSPFramework(unittest.TestCase):
def daisy_chain(self):
n = 10000
leftmost = fluid.make_channel(dtype=int)
right = leftmost
left = leftmost
with fluid.While(steps=n):
right = fluid.make_channel(dtype=int)
with fluid.go():
fluid.send(left, 1 + fluid.recv(right))
left = right
with fluid.go():
fluid.send(right, 1)
fluid.Print(fluid.recv(leftmost))
if __name__ == '__main__':
unittest.main()
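This test mirrors the classic Go daisy-chain demo: the rightmost goroutine sends 1, each of the n intermediate goroutines receives from its right-hand channel, adds 1, and forwards the value left, so the final fluid.Print on the leftmost channel should output n + 1 (10001 here) — assuming the CSP primitives behave like their Go counterparts.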
......@@ -220,7 +220,10 @@ def _callback_lookup_(op):
:return: callback function
"""
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = filter(lambda name: all_vars[name].stop_gradient is False,
param_names)
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
......