diff --git a/.travis.yml b/.travis.yml
index e2d49daa1981396628efa5d16459eb70e9e76884..ed566016ec6e0d30a0761356580114020ada66a0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,13 +4,17 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
+ - $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required
dist: trusty
+services:
+ - docker
os:
- linux
env:
- JOB=build_doc
- JOB=check_style
+ - JOB=build_android
addons:
apt:
packages:
@@ -41,8 +45,10 @@ before_install:
function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
script:
- |
- timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
- RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
+ # 43min timeout
+ if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
+ else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
+ RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
- |
if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
diff --git a/AUTHORS.md b/AUTHORS.md
index 389a7ad8bcfbb1d294bdd1f16905a7a0d2fbb932..9c6821d9f8681c5907c2fc9938fdb62ba64b9a92 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,5 +1,6 @@
| Github account | name |
|---|---|
+| abhinavarora | Abhinav Arora |
| backyes | Yan-Fei Wang |
| beckett1124 | Bin Qi |
| JiayiFeng | Jia-Yi Feng |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2f440c2d089b5d596ab59d5099c0066ef325614..469af0f7859b9ea79d1fc4c53e19cc29bfe28ce8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,11 +166,11 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
set(EXTERNAL_LIBS
- ${GFLAGS_LIBRARIES}
- ${GLOG_LIBRARIES}
+ gflags
+ glog
${CBLAS_LIBRARIES}
- ${PROTOBUF_LIBRARY}
- ${ZLIB_LIBRARIES}
+ protobuf
+ zlib
${PYTHON_LIBRARIES}
)
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 80eee112ddfb97bea57e302d76e0fef8fc9df9eb..786f224608f7d41c438411de0e09fedbcf2264b8 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -106,10 +106,10 @@ def vgg16_bn_drop(input):
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
- fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+ fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
- fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+ fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
return fc2
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 7730453fc9292015465713232abda155a18a1aad..0f76f55270592c5625a9624b33f4c0f82efdc627 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -59,7 +59,6 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC)
- add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
@@ -77,9 +76,7 @@ else()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
- add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
- add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index e568880632c885ca77e8e609918ff643e7ed6e47..20b8506e678af4db6ccb65bef99d28e085a67bf2 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -28,7 +28,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
ExternalProject_Add(
- zlib
+ extern_zlib
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/madler/zlib.git"
GIT_TAG "v1.2.8"
@@ -49,9 +49,11 @@ ExternalProject_Add(
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
+ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+ADD_DEPENDENCIES(zlib extern_zlib)
+
LIST(APPEND external_project_dependencies zlib)
-ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
IF(WITH_C_API)
INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 12e07bd5f883d2cf100ece391bdf7d2bbcafc8f7..356da582d1f8b6a8858af90ccdf5af2100e5db87 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,7 +104,9 @@ function(merge_static_libs TARGET_NAME)
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
- list(REMOVE_DUPLICATES libs_deps)
+ if(libs_deps)
+ list(REMOVE_DUPLICATES libs_deps)
+ endif()
# To produce a library we need at least one source file.
# It is created by add_custom_command below and will helps
@@ -191,10 +193,13 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
- # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
- target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+ # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
+ # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
+ target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
+ else()
+ target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index df186637726f60ee1b69cec7291477f3efcd059c..4471df36b0717171da4dff92ca0ec98b4f981028 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -72,7 +72,7 @@ copy(inference_lib DEPENDS paddle_fluid_shared
)
set(module "platform")
-copy(platform_lib
+copy(platform_lib DEPS profiler_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
diff --git a/doc/api/overview.rst b/doc/api/overview.rst
index 16b6cf42660c51feee09c689c671d5ef06663efb..a6f21428de1e4906e4af9433bc1c994f2b2c8b8e 100644
--- a/doc/api/overview.rst
+++ b/doc/api/overview.rst
@@ -7,10 +7,6 @@ it proposes some high-level concepts such as `Layers `_ to help control the training and inference phrase,
-it has several easy to use methods
+it has several easy to use methods. To better expose the internal running details, different `events `_ are available to users by writing some callbacks.
-- `paddle.train`
-- `paddle.test`
-- `paddle.infer`
-
-to better expose the internal running details, different `events `_ are available to users by writing some callbacks.
+All in all, the V2 API gives a higher abstraction and makes PaddlePaddle programs require fewer lines of code.
diff --git a/doc/build_and_install/build_from_source_cn.rst b/doc/build_and_install/build_from_source_cn.rst
index cb766c3838133740892928b587edcf3843b7abce..115b92a33888abf1e1be400e1abbb58b632a2976 100644
--- a/doc/build_and_install/build_from_source_cn.rst
+++ b/doc/build_and_install/build_from_source_cn.rst
@@ -9,7 +9,7 @@
为了编译PaddlePaddle,我们需要
1. 一台电脑,可以装的是 Linux, Windows 或者 MacOS 操作系统
-1. Docker
+2. Docker
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker 镜像里。
diff --git a/doc/build_and_install/build_from_source_en.rst b/doc/build_and_install/build_from_source_en.rst
index 556cbfdf087c340a7f7a1760f92325ab87eeea89..8fef9e7347e8d924026999bfda985381750c6b51 100644
--- a/doc/build_and_install/build_from_source_en.rst
+++ b/doc/build_and_install/build_from_source_en.rst
@@ -9,7 +9,7 @@ Requirements
To build PaddlePaddle, you need
1. A computer -- Linux, Windows, MacOS.
-1. Docker.
+2. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image.
We run all the tools by running this image.
diff --git a/doc/build_and_install/pip_install_en.rst b/doc/build_and_install/pip_install_en.rst
index c1e806c0fe5f03139c0dff985f9ae0856eaa2e98..0d4c925b6e2731bdfd76582309aa7e8adbafa6ae 100644
--- a/doc/build_and_install/pip_install_en.rst
+++ b/doc/build_and_install/pip_install_en.rst
@@ -1,4 +1,4 @@
-Install Using pip
+Install using pip
================================
You can use current widely used Python package management
@@ -8,7 +8,7 @@ most of current Linux systems or MacOS.
.. _pip_install:
-Install Using pip
+Install using pip
------------------------------
Run the following command to install PaddlePaddle on the current
diff --git a/doc/design/parallel_do.md b/doc/design/parallel_do.md
index 45f7731996519b87fc591ee8ce831341d7ab2965..42bd136f825986d94fafaeaa5f58edb02848a74c 100644
--- a/doc/design/parallel_do.md
+++ b/doc/design/parallel_do.md
@@ -39,15 +39,16 @@ In the backward pass
This implementation allows to write mixed device program like this
```python
-# get embedding feature on CPU
-feature = some_cpu_only_op(data)
+W1 = fluid.tensor(size=[100,20], parameter=true)
+W2 = fluid.tensor(size=[20,15], parameter=true)
-gpu_places = get_place(use_gpu=True)
+data = layers.data()
+
+gpu_places = layers.get_place(use_gpu=True)
# parallel processing on multiple GPUs
pd = ParallelDo(gpu_places)
-with pd.do():
- read_input(feature)
- prediction = my_net(feature)
+with pd.do(input=data):
+ prediction = softmax(fc(fc(data, W1), W2))
write_output(prediction)
prediction = pd()
loss = cross_entropy(prediction, label)
@@ -66,20 +67,20 @@ start_program
main_program
{
block0 {
- vars: data, places, w1, w2
+ vars: data, places, w1, w2, w1_grad, w2_grad,
ops: data, get_place, parallel_do(block1),
parallel_do_grad(block2),
sgd(w2, w2_grad),
sgd(w1, w1_grad)
}
-block1 {
+block1 { # the forward pass
parent_block: 0
vars: data, h1, h2, loss
ops: fc, fc, softmax
}
-block2 {
+block2 { # the backward pass
parent_block: 1
- vars: data_grad, h1_grad, h2_grad, loss_gard, w1_grad, w2_grad
+ vars: data_grad, h1_grad, h2_grad, loss_grad, local_w1_grad, local_w2_grad
ops: softmax_grad,
fc_grad
fc_grad
diff --git a/doc/dev/contribute_to_paddle_cn.md b/doc/dev/contribute_to_paddle_cn.md
index 3e0bf7b3973079a2063d33b6be4fe8a9dc5c07bb..d8bf093e09b53b302225739fa67146adc7976e4b 100644
--- a/doc/dev/contribute_to_paddle_cn.md
+++ b/doc/dev/contribute_to_paddle_cn.md
@@ -222,6 +222,7 @@ upstream
## 提交代码的一些约定
为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定:
+
1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。
2. 提交PUll Request前:
- 请注意commit的数量:
@@ -231,6 +232,7 @@ upstream
3. 如果解决了某个Issue的问题,请在该PUll Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
此外,在回复评审人意见时,请您遵守以下约定:
+
1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢):
- 对评审意见同意且按其修改完的,给个简单的`Done`即可;
- 对评审意见不同意的,请给出您自己的反驳理由。
diff --git a/doc/dev/index_en.rst b/doc/dev/index_en.rst
index 5dd12d2233cff20e021b90beb94571a2817bd1ad..5fdc30a2d688a6a269b4972c2591af60893e7dfb 100644
--- a/doc/dev/index_en.rst
+++ b/doc/dev/index_en.rst
@@ -4,6 +4,5 @@ Development
.. toctree::
:maxdepth: 1
- new_layer_en.rst
contribute_to_paddle_en.md
write_docs_en.rst
diff --git a/doc/faq/build_and_install/index_en.rst b/doc/faq/build_and_install/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..614db457d715665073cec1a495d4d7df6887532f
--- /dev/null
+++ b/doc/faq/build_and_install/index_en.rst
@@ -0,0 +1,5 @@
+############################
+Install, Build and Unit test
+############################
+
+TBD
diff --git a/doc/faq/cluster/index_en.rst b/doc/faq/cluster/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..855b7e8e53307b82a72c156be4ef509e27edf822
--- /dev/null
+++ b/doc/faq/cluster/index_en.rst
@@ -0,0 +1,5 @@
+###############################
+Cluster Training and Prediction
+###############################
+
+TBD
diff --git a/doc/faq/index_en.rst b/doc/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..57df868f760038b25fae30df7ab20a68875ad36a
--- /dev/null
+++ b/doc/faq/index_en.rst
@@ -0,0 +1,12 @@
+FAQ
+====
+
+
+.. toctree::
+ :maxdepth: 1
+
+ build_and_install/index_en.rst
+ model/index_en.rst
+ parameter/index_en.rst
+ local/index_en.rst
+ cluster/index_en.rst
diff --git a/doc/faq/local/index_en.rst b/doc/faq/local/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4cb43031933a8bbe9aebae04bc3e9c74a6d21b95
--- /dev/null
+++ b/doc/faq/local/index_en.rst
@@ -0,0 +1,5 @@
+#############################
+Local Training and Prediction
+#############################
+
+TBD
diff --git a/doc/faq/model/index_en.rst b/doc/faq/model/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..cb26f59655f97dc28a2047994643ae16b8857964
--- /dev/null
+++ b/doc/faq/model/index_en.rst
@@ -0,0 +1,5 @@
+###################
+Model Configuration
+###################
+
+TBD
diff --git a/doc/faq/parameter/index_en.rst b/doc/faq/parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..61c7845af7e531013a06125f7c35b59081dafb42
--- /dev/null
+++ b/doc/faq/parameter/index_en.rst
@@ -0,0 +1,5 @@
+#################
+Parameter Setting
+#################
+
+TBD
diff --git a/doc/getstarted/concepts/use_concepts_en.rst b/doc/getstarted/concepts/use_concepts_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..406b0cbb913894dc333d8e4561c207793c33e475
--- /dev/null
+++ b/doc/getstarted/concepts/use_concepts_en.rst
@@ -0,0 +1,3 @@
+Basic Concept
+=============
+TBD
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index c680e1903750117073bee64cb4d4f4ccfff5ac3d..33f299be5680e0aa4a3f36638f51135503193d94 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -5,3 +5,4 @@ GET STARTED
:maxdepth: 1
quickstart_en.rst
+ concepts/use_concepts_en.rst
diff --git a/doc/howto/capi/compile_paddle_lib_en.md b/doc/howto/capi/compile_paddle_lib_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..11d69b9b79c1a41898d3060d3fe25a31330334a3
--- /dev/null
+++ b/doc/howto/capi/compile_paddle_lib_en.md
@@ -0,0 +1,3 @@
+## Install and Build
+
+TBD
diff --git a/doc/howto/capi/index_en.rst b/doc/howto/capi/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2cbbe362fd8e06abe9866d998f60fbb3458a80b5
--- /dev/null
+++ b/doc/howto/capi/index_en.rst
@@ -0,0 +1,9 @@
+C-API Prediction Library
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ compile_paddle_lib_en.md
+ organization_of_the_inputs_en.md
+ workflow_of_capi_en.md
diff --git a/doc/howto/capi/organization_of_the_inputs_cn.md b/doc/howto/capi/organization_of_the_inputs_cn.md
index a889ae4ffab7be02468b4a5ac5a18e3cc77803c9..343526c213110cb9c6abaf9a12b3d634ad3fabe9 100644
--- a/doc/howto/capi/organization_of_the_inputs_cn.md
+++ b/doc/howto/capi/organization_of_the_inputs_cn.md
@@ -4,12 +4,15 @@
### 输入/输出数据类型
在C-API中,按照基本数据类型在PaddlePaddle内部的定义和实现,输入数据可分为:
+
1. 一维整型数组
1. 二维浮点型矩阵
+
- 稠密矩阵
- 稀疏矩阵
说明:
+
1. 一维数组**仅支持整型值**;
- 常用于自然语言处理任务,例如:表示词语在词典中的序号;
- 分类任务中类别标签;
@@ -274,6 +277,7 @@ PaddlePaddle中一个计算层的输出数据组织方式和输入数据组织
如果是一个序列输入/输出由 `sequence start positions` 来记录输入/输出的序列信息。
于是,在组织神经网络输入时,需要思考完成以下工作:
+
1. 为每一个输入/输出创建`argument`。
- C-API 中操作`argument`的接口请查看[argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
1. 为每一个`argument`创建`paddle_matrix`或者`paddle_ivector`来存储数据。
diff --git a/doc/howto/capi/organization_of_the_inputs_en.md b/doc/howto/capi/organization_of_the_inputs_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..250d3b2f749aed018e63527e817899c843dff996
--- /dev/null
+++ b/doc/howto/capi/organization_of_the_inputs_en.md
@@ -0,0 +1,3 @@
+## Input/Output Data Organization
+
+TBD
diff --git a/doc/howto/capi/workflow_of_capi_cn.md b/doc/howto/capi/workflow_of_capi_cn.md
index 1ccc72eefbc730b2eab2d51f5b04e50728b735d7..1968c1099ac5734cd68b437f2f7aa428d7b5265e 100644
--- a/doc/howto/capi/workflow_of_capi_cn.md
+++ b/doc/howto/capi/workflow_of_capi_cn.md
@@ -11,6 +11,7 @@
- 准备预测模型
+
1. 只将神经网络结构进行序列化。
- 只对神经网络结构进行序列化,加载模型需同时指定:网络结构的序列化结果和模型参数存储目录。
1. 将网络结构定义和训练结束存储下来的模型参数文件(多个)合并入一个文件。
@@ -18,6 +19,7 @@
- 预测时只需加载一个文件便于发布。
- **注意**:以上两种方式只需选择其一即可。
- 调用 C-API 开发预测序
+
1. 初始化PaddlePaddle运行环境。
1. 加载预测模型。
1. 创建神经网络输入,组织输入数据。
@@ -90,6 +92,7 @@
1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口,与其它`gradient machine`的共享已经加载的预测模型。这种情况多出现在使用多线程预测时,通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
- 注意事项
+
1. 使用PaddlePaddle V2 API训练,模型中所有可学习参数会被存为一个压缩文件,需要手动进行解压,将它们放在同一目录中,C-API不会直接加载 V2 API 存储的压缩文件。
1. 如果使用`merge model`方式将神经网络结构和训练好的参数序列化到一个文件,请参考此[示例](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)。
1. 通过灵活使用以上两个接口,加载模型可其它多种方式,例如也可在程序运行过程中再加载另外一个模型。
@@ -106,6 +109,7 @@ C-API支持的所有输入数据类型和他们的组织方式,请参考“输
这篇文档的之后部分会使用`argument`来特指PaddlePaddle C-API中神经网络的一个输入/输出,使用`paddle_matrix`**特指**`argument`中用于存储数据的`Matrix`类的对象。
在组织神经网络输入,获取输出时,需要思考完成以下工作:
+
1. 为每一个输入/输出创建`argument`;
1. 为每一个`argument`创建`paddle_matrix`来存储数据;
diff --git a/doc/howto/capi/workflow_of_capi_en.md b/doc/howto/capi/workflow_of_capi_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..1692ecd56520675f02ad25ef73761330ebd0e740
--- /dev/null
+++ b/doc/howto/capi/workflow_of_capi_en.md
@@ -0,0 +1,3 @@
+## C-API Workflow
+
+TBD
diff --git a/doc/howto/cluster/cmd_argument_cn.md b/doc/howto/cluster/cmd_argument_cn.md
index 5c575dd5b53f6e4ea025a8fbaebdb2d1a1f1c9ed..40e1dde4858b802c2e703bcca4b71730facde5ef 100644
--- a/doc/howto/cluster/cmd_argument_cn.md
+++ b/doc/howto/cluster/cmd_argument_cn.md
@@ -1,14 +1,17 @@
-## 启动参数说明
+# 启动参数说明
下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
-### 启动参数服务器
+## 启动参数服务器
+
执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+
```bash
$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
```
如果希望可以在后台运行pserver程序,并保存输出到一个日志文件,可以运行:
+
```bash
$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
```
@@ -20,8 +23,10 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
-### 启动计算节点
+## 启动计算节点
+
执行以下命令启动使用python编写的trainer程序(文件名为任意文件名,如train.py)
+
```bash
$ python train.py
```
@@ -67,7 +72,7 @@ paddle.init(
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
-### 准备数据集
+## 准备数据集
参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py),准备训练数据和验证数据集,我们使用paddle.dataset.imikolov数据集,并根据分布式训练并发数(trainer节点个数),在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
@@ -84,7 +89,8 @@ for f in flist:
```
示例程序`prepare.py`会把训练集和测试集分别分割成多个文件(例子中为3个,后缀为`-00000`、`-00001`和`-00002`):
-```
+
+```bash
train.txt
train.txt-00000
train.txt-00001
@@ -99,12 +105,13 @@ test.txt-00002
对于不同的训练任务,训练数据格式和训练程序的`reader()`会大不相同,所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
-### 准备训练程序
+## 准备训练程序
我们会对每个训练任务都会在每个节点上创建一个工作空间(workspace),其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
最后,工作空间应如下所示:
-```
+
+```bash
.
|-- my_lib.py
|-- word_dict.pickle
@@ -133,3 +140,21 @@ test.txt-00002
- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
- `test_data_dir`:包含测试数据集的目录。
+
+## 异步 SGD 更新
+
+我们可以通过设置 `optimize` 的参数使之支持异步SGD更新。
+例如,设置 `AdaGrad` optimize 的 `is_async` 和 `async_lagged_grad_discard_ratio` 参数:
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+ is_async=True,
+ async_lagged_grad_discard_ratio=1.6,
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: 是否为异步SGD更新模式。
+- `async_lagged_grad_discard_ratio`: 异步SGD更新的步长控制,接收到足够的gradient(
+ `async_lagged_grad_discard_ratio * num_gradient_servers`)之后,后面的gradient
+ 将会被抛弃。
diff --git a/doc/howto/cluster/cmd_argument_en.md b/doc/howto/cluster/cmd_argument_en.md
index 06fd5717564c99e3bb46835a2bd5071dff665f23..40179c28f83800c1c74a6045f8fac6841bdafeaa 100644
--- a/doc/howto/cluster/cmd_argument_en.md
+++ b/doc/howto/cluster/cmd_argument_en.md
@@ -1,18 +1,19 @@
-## Command-line arguments
+# Command-line arguments
We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
-### Starting parameter server
+## Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
```bash
-$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0
```
If you wish to run parameter servers in background, and save a log file, you can type:
+
```bash
-$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 --nics=eth0 &> pserver.log &
```
Parameter Description
@@ -21,8 +22,10 @@ Parameter Description
- ports_num: **required, default 1**, total number of ports will listen on.
- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
- num_gradient_servers: **required, default 1**, total number of gradient servers.
+- nics: **optional, default xgbe0,xgbe1**, network device name which the parameter server will listen on.
+
+## Starting trainer
-### Starting trainer
Type the command below to start the trainer(name the file whatever you want, like "train.py")
```bash
@@ -70,7 +73,7 @@ Parameter Description
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
-### Prepare Training Dataset
+## Prepare Training Dataset
Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.
@@ -88,7 +91,7 @@ for f in flist:
Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and`-00002`:
-```
+```bash
train.txt
train.txt-00000
train.txt-00001
@@ -103,13 +106,13 @@ When job started, every trainer needs to get it's own part of data. In some dist
Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.
-### Prepare Training program
+## Prepare Training program
We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
-
Your workspace may looks like:
-```
+
+```bash
.
|-- my_lib.py
|-- word_dict.pickle
@@ -138,3 +141,21 @@ Your workspace may looks like:
- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
- `test_data_dir`: containing testing data.
+
+## Async SGD Update
+
+We can set some parameters of the optimizer to make it support async SGD update.
+For example, we can set the `is_async` and `async_lagged_grad_discard_ratio` of the `AdaGrad` optimizer:
+
+```python
+adagrad = paddle.optimizer.AdaGrad(
+ is_async=True,
+ async_lagged_grad_discard_ratio=1.6,
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+```
+
+- `is_async`: Is Async-SGD or not.
+- `async_lagged_grad_discard_ratio`: For async SGD gradient commit control.
+ when more than `async_lagged_grad_discard_ratio * num_gradient_servers` commits have passed,
+ the current async gradient will be discarded silently.
diff --git a/doc/howto/cluster/multi_cluster/fabric_en.md b/doc/howto/cluster/multi_cluster/fabric_en.md
index bf270d89ab8514801ca4629cf412f73257429df9..bac9ffe1526a06a3a23b1d8acf33a5fb74b7e50d 100644
--- a/doc/howto/cluster/multi_cluster/fabric_en.md
+++ b/doc/howto/cluster/multi_cluster/fabric_en.md
@@ -1,4 +1,4 @@
-# Cluster Training Using Fabric
+# Fabric
## Prepare a Linux cluster
diff --git a/doc/howto/cluster/multi_cluster/k8s_aws_en.md b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
index 0dfa8237a3fa2c9c3ee11e873c9fbbed3cd6018f..8e8e87be711bd45177ed77c81c531606e801d1f0 100644
--- a/doc/howto/cluster/multi_cluster/k8s_aws_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_aws_en.md
@@ -1,5 +1,4 @@
-
-# Distributed PaddlePaddle Training on AWS with Kubernetes
+# Kubernetes on AWS
We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
diff --git a/doc/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc3d50b3ffd3b703a3a656caa1f96bdcf683f68b
--- /dev/null
+++ b/doc/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -0,0 +1,3 @@
+# Kubernetes Distributed
+
+TBD
diff --git a/doc/howto/cluster/multi_cluster/k8s_en.md b/doc/howto/cluster/multi_cluster/k8s_en.md
index c374f00a495d705ceddf8d3d930768ceeb93282b..96ff652705726fc56fa0078593cd2a695fcdb5e2 100644
--- a/doc/howto/cluster/multi_cluster/k8s_en.md
+++ b/doc/howto/cluster/multi_cluster/k8s_en.md
@@ -1,4 +1,4 @@
-# PaddlePaddle On Kubernetes
+# Kubernetes
In this article, we will introduce how to run PaddlePaddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run PaddlePaddle training job on distributed cluster.
diff --git a/doc/howto/cluster/multi_cluster/openmpi_cn.md b/doc/howto/cluster/multi_cluster/openmpi_cn.md
index 831cafdc03c6a908f31769d0467de022df42dab5..954b2215cc3136ec5b3e1cdc2f6d3f508f814516 100644
--- a/doc/howto/cluster/multi_cluster/openmpi_cn.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_cn.md
@@ -1,4 +1,4 @@
-# 在OpenMPI集群中提交训练作业
+# 在OpenMPI集群中启动训练
## 准备OpenMPI集群
diff --git a/doc/howto/cluster/multi_cluster/openmpi_en.md b/doc/howto/cluster/multi_cluster/openmpi_en.md
index 09af46e25ebe1f843dc7c7be0997dc706413b65c..a5c02b336b8a974f546499acae32edac24219be9 100644
--- a/doc/howto/cluster/multi_cluster/openmpi_en.md
+++ b/doc/howto/cluster/multi_cluster/openmpi_en.md
@@ -1,4 +1,4 @@
-# Cluster Training Using OpenMPI
+# OpenMPI
## Prepare an OpenMPI cluster
diff --git a/doc/howto/cmd_parameter/index_cn.rst b/doc/howto/cmd_parameter/index_cn.rst
index 17b379f6295d66d864e2b53108012eff5895d96b..6900bb1443e611d326e8d5640e794ac2b9079beb 100644
--- a/doc/howto/cmd_parameter/index_cn.rst
+++ b/doc/howto/cmd_parameter/index_cn.rst
@@ -2,10 +2,25 @@
命令行参数设置
===============
+深度学习算法的实现有着多样化的特点,运行环境、运行阶段、模型结构、训练策略等等这些都是常见的变化因素。PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。
+
+在这一部分,首先以几个实际场景为例,展示了部分命令行参数的使用:
.. toctree::
:maxdepth: 1
use_case_cn.md
+
+接着对所有参数的使用场合进行概述和分类:
+
+.. toctree::
+ :maxdepth: 1
+
arguments_cn.md
+
+最后给出细节描述,详细解释这些参数的属性和意义:
+
+.. toctree::
+ :maxdepth: 1
+
detail_introduction_cn.md
diff --git a/doc/howto/rnn/hierarchical_layer_en.rst b/doc/howto/rnn/hierarchical_layer_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..236f58a160c7f77c28e4b1216b83b3d3cdaaa459
--- /dev/null
+++ b/doc/howto/rnn/hierarchical_layer_en.rst
@@ -0,0 +1,4 @@
+Layers supporting hierarchical sequence as input
+================================================
+
+TBD
diff --git a/doc/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e5aa05c117393e81c557ba67609f787b38587efd
--- /dev/null
+++ b/doc/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -0,0 +1,4 @@
+API comparision between RNN and hierarchical RNN
+================================================
+
+TBD
diff --git a/doc/howto/rnn/index_en.rst b/doc/howto/rnn/index_en.rst
index 7adc79873d699fdfd5a85034bcef964dd1f19132..e1b20ef2e7bf4c521b613e54577ff6a3feaa8936 100644
--- a/doc/howto/rnn/index_en.rst
+++ b/doc/howto/rnn/index_en.rst
@@ -5,3 +5,6 @@ RNN Models
:maxdepth: 1
rnn_config_en.rst
+ recurrent_group_en.md
+ hierarchical_layer_en.rst
+ hrnn_rnn_api_compare_en.rst
diff --git a/doc/howto/rnn/recurrent_group_en.md b/doc/howto/rnn/recurrent_group_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..d264b0a9f85faffd49c1982117cb5a3ac6ffc015
--- /dev/null
+++ b/doc/howto/rnn/recurrent_group_en.md
@@ -0,0 +1,3 @@
+# Recurrent Group Tutorial
+
+TBD
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 166f56c28f464563a0b36007f58cebb58c286916..909f035cca3db2a02fd38462acc451375eceff40 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -8,3 +8,4 @@ PaddlePaddle Documentation
build_and_install/index_en.rst
howto/index_en.rst
dev/index_en.rst
+ faq/index_en.rst
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 0b4c6db6f98d8d73b362d3c98f52a3914a031c68..82c7d4a2ec648449fc65ca2ae0de397b2f6fa120 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -96,3 +96,6 @@ cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_contex
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc)
+cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
+ channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index be578059388e627778d485365e5a735158603fe5..bda1bfb23b18f8c6b9f1c3eded461a9322a154af 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -91,6 +91,11 @@ class ChannelHolder {
inline bool IsInitialized() const { return holder_ != nullptr; }
+ inline const std::type_index Type() {
+ PADDLE_ENFORCE_EQ(IsInitialized(), true);
+ return holder_->Type();
+ }
+
private:
/**
* @note Placeholder hides type T, so it doesn't appear as a template
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 2c4e622bd789c691c38a8810fe5e09e464a8cf1f..695169fcb9e93b5e69d3d4ae6f63f8e4c2d1605f 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -542,3 +542,341 @@ TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
ChannelHolderSendReceive(ch);
delete ch;
}
+
+TEST(ChannelHolder, ChannelUninitializedTest) {
+ ChannelHolder *ch = new ChannelHolder();
+ EXPECT_EQ(ch->IsInitialized(), false);
+ int i = 10;
+ EXPECT_EQ(ch->Send(&i), false);
+ EXPECT_EQ(ch->Receive(&i), false);
+ bool is_exception = false;
+ try {
+ ch->Type();
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ delete ch;
+}
+
+TEST(ChannelHolder, ChannelInitializedTest) {
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(2);
+ EXPECT_EQ(ch->IsInitialized(), true);
+  // Channel should remain initialized even after close
+ ch->close();
+ EXPECT_EQ(ch->IsInitialized(), true);
+ delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchSendTest) {
+ // Test with unbuffered channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(0);
+ bool is_exception = false;
+ bool boolean_data = true;
+ try {
+ ch->Send(&boolean_data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ delete ch;
+
+ // Test with Buffered Channel
+ ch = new ChannelHolder();
+ ch->Reset(10);
+ is_exception = false;
+ int int_data = 23;
+ try {
+ ch->Send(&int_data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ delete ch;
+}
+
+TEST(ChannelHolder, TypeMismatchReceiveTest) {
+ // Test with unbuffered channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(0);
+ bool is_exception = false;
+ bool float_data;
+ try {
+ ch->Receive(&float_data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ delete ch;
+
+ // Test with Buffered Channel
+ ch = new ChannelHolder();
+ ch->Reset(10);
+ is_exception = false;
+ int int_data = 23;
+ try {
+ ch->Receive(&int_data);
+ } catch (paddle::platform::EnforceNotMet e) {
+ is_exception = true;
+ }
+ EXPECT_EQ(is_exception, true);
+ delete ch;
+}
+
+void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked because of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
+
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+
+ // Explicitly close the channel
+ // This should unblock all receivers
+ ch->close();
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
+ using paddle::framework::details::Buffered;
+ using paddle::framework::details::UnBuffered;
+
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+ bool send_success[num_threads];
+
+ // Launches threads that try to write and are blocked because of no readers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ send_success[i] = false;
+ t[i] = std::thread(
+ [&](bool *ended, bool *success) {
+ int data = 10;
+ *success = ch->Send(&data);
+ *ended = true;
+ },
+ &thread_ended[i], &send_success[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ if (isBuffered) {
+    // If ch is Buffered, at least 4 threads must be blocked.
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (!thread_ended[i]) ct++;
+ }
+ EXPECT_GE(ct, 4);
+ } else {
+ // If ch is UnBuffered, all the threads should be blocked.
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+ }
+  // Explicitly close the channel
+ // This should unblock all senders
+ ch->close();
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ if (isBuffered) {
+ // Verify that only 1 send was successful
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (send_success[i]) ct++;
+ }
+ // Only 1 send must be successful
+ EXPECT_EQ(ct, 1);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that closing a channelholder unblocks
+// any receivers waiting on the channel
+TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
+ // Check for buffered channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(1);
+ ChannelHolderCloseUnblocksReceiversTest(ch);
+ delete ch;
+
+ // Check for unbuffered channel
+ ch = new ChannelHolder();
+ ch->Reset(0);
+ ChannelHolderCloseUnblocksReceiversTest(ch);
+ delete ch;
+}
+
+// This tests that closing a channelholder unblocks
+// any senders waiting for channel to have write space
+TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
+ // Check for buffered channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(1);
+ ChannelHolderCloseUnblocksSendersTest(ch, true);
+ delete ch;
+
+ // Check for unbuffered channel
+ ch = new ChannelHolder();
+ ch->Reset(0);
+ ChannelHolderCloseUnblocksSendersTest(ch, false);
+ delete ch;
+}
+
+// This tests that destroying a channelholder unblocks
+// any senders waiting for channel
+void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+ bool send_success[num_threads];
+
+ // Launches threads that try to write and are blocked because of no readers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ send_success[i] = false;
+ t[i] = std::thread(
+ [&](bool *ended, bool *success) {
+ int data = 10;
+ *success = ch->Send(&data);
+ *ended = true;
+ },
+ &thread_ended[i], &send_success[i]);
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
+ if (isBuffered) {
+    // If channel is buffered, verify that at least 4 threads are blocked
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (thread_ended[i] == false) ct++;
+ }
+    // At least 4 threads must be blocked
+ EXPECT_GE(ct, 4);
+ } else {
+ // Verify that all the threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+ }
+ // Explicitly destroy the channel
+ delete ch;
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+  // Count number of successful sends
+ int ct = 0;
+ for (size_t i = 0; i < num_threads; i++) {
+ if (send_success[i]) ct++;
+ }
+
+ if (isBuffered) {
+ // Only 1 send must be successful
+ EXPECT_EQ(ct, 1);
+ } else {
+ // In unbuffered channel, no send should be successful
+ EXPECT_EQ(ct, 0);
+ }
+
+ // Join all threads
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+// This tests that destroying a channelholder also unblocks
+// any receivers waiting on the channel
+void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
+ size_t num_threads = 5;
+ std::thread t[num_threads];
+ bool thread_ended[num_threads];
+
+ // Launches threads that try to read and are blocked because of no writers
+ for (size_t i = 0; i < num_threads; i++) {
+ thread_ended[i] = false;
+ t[i] = std::thread(
+ [&](bool *p) {
+ int data;
+ // All reads should return false
+ EXPECT_EQ(ch->Receive(&data), false);
+ *p = true;
+ },
+ &thread_ended[i]);
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
+
+ // Verify that all threads are blocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], false);
+ }
+ // delete the channel
+ delete ch;
+ std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
+ // Verify that all threads got unblocked
+ for (size_t i = 0; i < num_threads; i++) {
+ EXPECT_EQ(thread_ended[i], true);
+ }
+
+ for (size_t i = 0; i < num_threads; i++) t[i].join();
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
+ // Check for Buffered Channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(1);
+ ChannelHolderDestroyUnblockReceivers(ch);
+  // ch is already deleted in
+  // ChannelHolderDestroyUnblockReceivers
+
+ // Check for Unbuffered channel
+ ch = new ChannelHolder();
+ ch->Reset(0);
+ ChannelHolderDestroyUnblockReceivers(ch);
+}
+
+TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
+ // Check for Buffered Channel
+ ChannelHolder *ch = new ChannelHolder();
+ ch->Reset(1);
+ ChannelHolderDestroyUnblockSenders(ch, true);
+  // ch is already deleted in
+  // ChannelHolderDestroyUnblockSenders
+
+ // Check for Unbuffered channel
+ ch = new ChannelHolder();
+ ch->Reset(0);
+ ChannelHolderDestroyUnblockSenders(ch, false);
+}
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5770b0a5a18659e615e80a7c48113d8b543b69ec
--- /dev/null
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -0,0 +1,122 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+USE_NO_KERNEL_OP(go);
+USE_NO_KERNEL_OP(channel_close);
+USE_NO_KERNEL_OP(channel_create);
+USE_NO_KERNEL_OP(channel_recv);
+USE_NO_KERNEL_OP(channel_send);
+USE_NO_KERNEL_OP(elementwise_add);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+namespace paddle {
+namespace framework {
+
+template
+void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
+ T value) {
+ // Create LoDTensor of dim [1,1]
+ auto var = scope.Var(name);
+ auto tensor = var->GetMutable();
+ tensor->Resize({1, 1});
+ T *expect = tensor->mutable_data(place);
+ expect[0] = value;
+}
+
+void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
+ p::CPUDeviceContext ctx(place);
+
+ // Create channel variable
+ scope.Var("Channel");
+
+ // Create Variables, x0 will be put into channel,
+ // result will be pulled from channel
+ CreateIntVariable(scope, place, "Status", false);
+ CreateIntVariable(scope, place, "x0", 99);
+ CreateIntVariable(scope, place, "result", 0);
+}
+
+void AddOp(const std::string &type, const VariableNameMap &inputs,
+ const VariableNameMap &outputs, AttributeMap attrs,
+ BlockDesc *block) {
+ // insert op
+ auto op = block->AppendOp();
+ op->SetType(type);
+ for (auto &kv : inputs) {
+ op->SetInput(kv.first, kv.second);
+ }
+ for (auto &kv : outputs) {
+ op->SetOutput(kv.first, kv.second);
+ }
+ op->SetAttrMap(attrs);
+}
+
+TEST(Concurrency, Go_Op) {
+ Scope scope;
+ p::CPUPlace place;
+
+ // Initialize scope variables
+ InitTensorsInScope(scope, place);
+
+ framework::Executor executor(place);
+ ProgramDesc program;
+ BlockDesc *block = program.MutableBlock(0);
+
+ // Create channel OP
+ AddOp("channel_create", {}, {{"Out", {"Channel"}}},
+ {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
+ block);
+
+ // Create Go Op routine
+ BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
+ AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
+ {{"Status", {"Status"}}}, {}, goOpBlock);
+
+ // Create Go Op
+ AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
+ block);
+
+ // Create Channel Receive Op
+ AddOp("channel_recv", {{"Channel", {"Channel"}}},
+ {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
+
+ // Create Channel Close Op
+ AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
+
+ // Check the result tensor to make sure it is set to 0
+ const LoDTensor &tensor = (scope.FindVar("result"))->Get();
+ auto *initialData = tensor.data();
+ EXPECT_EQ(initialData[0], 0);
+
+ executor.Run(program, &scope, 0, true, true);
+
+  // After we call executor.run, the Go operator should do a
+  // channel_send that sets
+  // the "result" variable to 99
+ auto *finalData = tensor.data();
+ EXPECT_EQ(finalData[0], 99);
+}
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
index 97afd366387e9ba6476be59a4d73d53a38834d0e..05e423b8a52962d47a6615d48243444374b470e3 100644
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -26,12 +26,15 @@ Dim make_dim(const int64_t* d) {
}
template <>
-Dim<1> make_dim<1>(const int64_t* d) {
- return Dim<1>(*d);
+Dim<0> make_dim<0>(const int64_t* d) {
+ return Dim<0>(*d);
}
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
+ case 0:
+ ddim = make_dim<0>(dims);
+ break;
case 1:
ddim = make_dim<1>(dims);
break;
@@ -190,7 +193,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
this->operator()(t.tail);
}
- void operator()(const Dim<1>& t) { vector.push_back(t.head); }
+ void operator()(const Dim<0>& t) {}
};
/// @endcond
@@ -247,9 +250,8 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
}
}
- void operator()(const Dim<1>& dim) {
- PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
- vector.push_back(dim.head);
+ void operator()(const Dim<0>& dim) {
+ PADDLE_ENFORCE(end == 0, "End index in ddim slice is out of bound.");
}
};
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index 5aff10d3b95902fdb9fe432d9f31830304dd3d07..f05b5ee3faee856a41f1376e5952710b550e7c42 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -30,8 +30,8 @@ namespace framework {
* The number of dimensions must be between [1, 9].
*/
struct DDim {
- typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
- Dim<8>, Dim<9>>
+ typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>,
+ Dim<7>, Dim<8>, Dim<9>>
DDimVar;
DDimVar var;
diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h
index 08b708006aadc4769bde7b37347ac1adfeca2bf7..58b75ba4b5a1973d97aedcea0f76681b625afa65 100644
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -72,38 +72,36 @@ struct Dim {
// Base case specialization
template <>
-struct Dim<1> {
- static constexpr int dimensions = 1;
+struct Dim<0> {
+ static constexpr int dimensions = 0;
HOSTDEVICE
- Dim(int64_t _head) : head(_head) {}
+ Dim(int64_t _head) {}
HOSTDEVICE
- Dim() : head(0) {}
+ Dim() {}
HOSTDEVICE
- Dim(int idx, const Dim<1>& size) : head(idx) {
+ Dim(int idx, const Dim<0>& size) {
#ifndef __CUDA_ARCH__
- if (idx >= size.head) {
+ if (idx > 0) {
throw std::invalid_argument("Index out of range.");
}
#else
- PADDLE_ASSERT(idx < size.head);
+ PADDLE_ASSERT(idx == 0);
#endif
}
HOSTDEVICE
- bool operator==(const Dim<1>& o) const { return (head == o.head); }
+ bool operator==(const Dim<0>& o) const { return true; }
HOSTDEVICE
- bool operator!=(const Dim<1>& o) const { return !(*this == o); }
+ bool operator!=(const Dim<0>& o) const { return false; }
HOSTDEVICE
int64_t& operator[](int idx);
HOSTDEVICE
int64_t operator[](int idx) const;
-
- int64_t head;
};
namespace {
@@ -154,15 +152,20 @@ HOSTDEVICE int64_t& indexer(Dim& dim, int idx) {
}
template <>
-HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
- if (idx != 0) {
- throw std::invalid_argument("Invalid index");
- }
+ throw std::invalid_argument("Invalid index");
#else
- PADDLE_ASSERT(idx == 0);
+ PADDLE_ASSERT(false);
#endif
- return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+ // On CUDA versions previous to 8.0, only __shared__ variables
+ // could be declared as static in the device code.
+ int64_t head = 0;
+#else
+ static int64_t head = 0;
+#endif
+ return head;
}
template
@@ -181,15 +184,20 @@ HOSTDEVICE int64_t indexer(const Dim& dim, int idx) {
}
template <>
-HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) {
#ifndef __CUDA_ARCH__
- if (idx != 0) {
- throw std::invalid_argument("Invalid index");
- }
+ throw std::invalid_argument("Invalid index");
#else
- PADDLE_ASSERT(idx == 0);
+ PADDLE_ASSERT(false);
#endif
- return dim.head;
+#if (defined __CUDA_ARCH__) && (CUDA_VERSION < 8000)
+ // On CUDA versions previous to 8.0, only __shared__ variables
+ // could be declared as static in the device code.
+ int64_t head = 0;
+#else
+ static int64_t head = 0;
+#endif
+ return head;
}
} // namespace
@@ -218,12 +226,12 @@ HOSTDEVICE int64_t& Dim::operator[](int i) {
}
// Dynamic access to constant Dim
-inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
-inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
+inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) {
return indexer(*this, i);
}
@@ -251,8 +259,8 @@ HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) {
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
-HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
- return a.head * b.head;
+HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) {
+ return 0;
}
// Product of a Dim
@@ -264,8 +272,8 @@ HOSTDEVICE int64_t product(const Dim& a, int prod = 1) {
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
-HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
- return prod * a.head;
+HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) {
+ return prod;
}
// Is 0 <= idx_i < size_i for all i?
@@ -278,8 +286,8 @@ HOSTDEVICE bool contained(const Dim& idx, const Dim& size) {
// Base case of is 0 <= idx_i < size_i ?
// Notice it is inline because it is no longer a template
template <>
-HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
- return ((0 <= idx.head) && (idx.head < size.head));
+HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) {
+ return true;
}
/**
@@ -294,8 +302,8 @@ HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) {
// Base case of ex_prefix_mul
// Notice it is inline because it is no longer a template
template <>
-HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
- return Dim<1>(mul);
+HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) {
+ return Dim<0>();
}
///\endcond
@@ -309,8 +317,8 @@ HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) {
// Base case
template <>
-HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) {
- return Dim<1>(a.head + b.head);
+HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) {
+ return Dim<0>();
}
template
@@ -328,8 +336,8 @@ HOSTDEVICE Dim dim_mult(const Dim& a, const Dim& b) {
// Base case
template <>
-HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) {
- return Dim<1>(a.head * b.head);
+HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) {
+ return Dim<0>();
}
template
@@ -356,10 +364,9 @@ HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) {
///\cond HIDDEN
template <>
-HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size,
- const Dim<1>& stride) {
- int norm_stride = size.head == 1 ? 0 : stride.head;
- return Dim<1>(norm_stride);
+HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size,
+ const Dim<0>& stride) {
+ return Dim<0>();
}
///\endcond
@@ -394,6 +401,10 @@ typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
return os;
}
+inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
+ return os;
+}
+
template
HOST std::string Dim::to_string() const {
std::stringstream stream;
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 38f22b89143c3e23c8368b9281ccc757a892a373..96f53dc1bc8747e1b8ea84166614f98ff363ae5e 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -117,6 +117,7 @@ message VarType {
// raw variables should manage their own allocations
// in operators like nccl_op
RAW = 17;
+ TUPLE = 18;
}
required Type type = 1;
@@ -148,6 +149,9 @@ message VarType {
required int64 capacity = 2;
}
optional ChannelDesc channel = 6;
+
+ message Tuple { repeated Type element_type = 1; }
+ optional Tuple tuple = 7;
}
message VarDesc {
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
new file mode 100644
index 0000000000000000000000000000000000000000..78996908b18a5a0935d8de9920e8ccef9069e74b
--- /dev/null
+++ b/paddle/fluid/framework/tuple.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+typedef boost::variant
+ ElementVar;
+
+class Tuple {
+ public:
+ using ElementVars = std::vector;
+
+ Tuple(std::vector& var, std::vector& var_desc)
+ : var_(var), var_desc_(var_desc) {}
+ Tuple(std::vector& var) : var_(var) {}
+
+ ElementVar get(int idx) const { return var_[idx]; };
+
+ ElementVar& get(int idx) { return var_[idx]; };
+
+ bool isSameType(Tuple& t) const;
+
+ size_t getSize() const { return var_.size(); };
+
+ private:
+ ElementVars var_;
+ std::vector var_desc_;
+};
+
+bool Tuple::isSameType(Tuple& t) const {
+ size_t tuple_size = getSize();
+ if (tuple_size != t.getSize()) {
+ return false;
+ }
+ for (size_t j = 0; j < tuple_size; ++j) {
+ auto type1 = get(j).which();
+ auto type2 = t.get(j).which();
+ if (type1 != type2) return false;
+ }
+ return true;
+}
+
+Tuple* make_tuple(std::vector tuple) { return new Tuple(tuple); }
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..810900f161ccc08234e28b982bdd962e4cded9ae
--- /dev/null
+++ b/paddle/fluid/framework/tuple_test.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tuple.h"
+
+TEST(Tuple, Make) {
+ std::vector element_type;
+ element_type.push_back(12);
+ element_type.push_back(12.0f);
+ element_type.push_back("ElementVar");
+
+ paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type);
+
+ EXPECT_EQ(boost::get(tuple->get(0)), 12);
+ EXPECT_EQ(boost::get(tuple->get(1)), 12.0f);
+ EXPECT_EQ(boost::get(tuple->get(2)), "ElementVar");
+
+ delete tuple;
+}
+
+TEST(Tuple, IsTheSameType) {
+ std::vector element_type1;
+ std::vector element_type2;
+ std::vector element_type3;
+
+ element_type1.push_back(12);
+ element_type1.push_back(12.0f);
+ element_type1.push_back("Tuple1");
+
+ element_type2.push_back(13);
+ element_type2.push_back(13.0f);
+ element_type2.push_back("Tuple2");
+
+ element_type3.push_back(14.0f);
+ element_type3.push_back(14);
+ element_type3.push_back("Tuple3");
+
+ paddle::framework::Tuple* tuple1 =
+ paddle::framework::make_tuple(element_type1);
+ paddle::framework::Tuple* tuple2 =
+ paddle::framework::make_tuple(element_type2);
+ paddle::framework::Tuple* tuple3 =
+ paddle::framework::make_tuple(element_type3);
+
+ EXPECT_TRUE(tuple1->isSameType(*tuple2));
+ EXPECT_FALSE(tuple1->isSameType(*tuple3));
+
+ delete tuple1;
+ delete tuple2;
+ delete tuple3;
+}
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4da46e94c5cd979507fed80b35ebedf0cc6791d0..e1c02ec1613a7c31258f2d305b040ea627cebcdd 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -130,7 +130,7 @@ endif()
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
- set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+ set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc
index 2b3f26c0a890c33f9b4f4c8a5a271123d7ff0b31..1218d9fdc1e6101d17bc09a4ae769f5fbf8e7b15 100644
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ b/paddle/fluid/operators/bipartite_match_op.cc
@@ -41,6 +41,14 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDist", dims);
}
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("DistMat")->type()),
+ platform::CPUPlace());
+ }
};
template
diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc
index 1fc201286f335f9b3e1ef87c85c0b02fc0b29e4b..eccdd408a17a07a541480705242b137f8207c139 100644
--- a/paddle/fluid/operators/box_coder_op.cc
+++ b/paddle/fluid/operators/box_coder_op.cc
@@ -37,12 +37,19 @@ class BoxCoderOp : public framework::OperatorWithKernel {
"The rank of Input of PriorBoxVar must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
- PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
- "The rank of Input of TargetBox must be 2");
- PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
- "The shape of TargetBox is [M, 4]");
- GetBoxCodeType(ctx->Attrs().Get("code_type"));
+ auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type"));
+ if (code_type == BoxCodeType::kEncodeCenterSize) {
+ PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+ "The rank of Input of TargetBox must be 2");
+ PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+ "The shape of TargetBox is [M, 4]");
+ } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+ PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+ "The rank of Input of TargetBox must be 3");
+ PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+ PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+ }
ctx->SetOutputDim(
"OutputBox",
@@ -70,25 +77,28 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
"of variance.");
AddInput(
"TargetBox",
- "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
- "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
- "[xmin, ymin] is the left top coordinate of the box if the input "
- "is image feature map, they are close to the origin of the coordinate "
- "system. [xmax, ymax] is the right bottom coordinate of the box. "
- "This tensor can contain LoD information to represent a batch "
- "of inputs. One instance of this batch can contain different "
- "numbers of entities.");
+ "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
+ "[N, 4] when code_type is 'encode_center_size'. This input also can "
+ "be a 3-D Tensor with shape [N, M, 4] when code_type is "
+ "'decode_center_size'. [N, 4], each box is represented as "
+ "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
+ "of the box if the input is image feature map, they are close to "
+ "the origin of the coordinate system. [xmax, ymax] is the right "
+ "bottom coordinate of the box. This tensor can contain LoD "
+ "information to represent a batch of inputs. One instance of this "
+ "batch can contain different numbers of entities.");
AddAttr("code_type",
"(string, default encode_center_size) "
"the code type used with the target box")
.SetDefault("encode_center_size")
.InEnum({"encode_center_size", "decode_center_size"});
- AddOutput(
- "OutputBox",
- "(LoDTensor or Tensor) "
- "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
- "representing the result of N target boxes encoded/decoded with "
- "M Prior boxes and variances.");
+ AddOutput("OutputBox",
+ "(LoDTensor or Tensor) "
+ "When code_type is 'encode_center_size', the output tensor of "
+ "box_coder_op with shape [N, M, 4] representing the result of N "
+ "target boxes encoded with M Prior boxes and variances. When "
+ "code_type is 'decode_center_size', N represents the batch size "
+            "and M represents the number of decoded boxes.");
AddComment(R"DOC(
Bounding Box Coder Operator.
diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu
index 7ab242edfa3bae0c394d2ca8c0b32e2c5761fa70..0944e9c95d4a66cc4a51751a8c70cd7a3fefaf1a 100644
--- a/paddle/fluid/operators/box_coder_op.cu
+++ b/paddle/fluid/operators/box_coder_op.cu
@@ -66,7 +66,6 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
T* output) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < row * col) {
- const int row_idx = idx / col;
const int col_idx = idx % col;
T prior_box_width =
prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
@@ -79,17 +78,16 @@ __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
2;
T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
- target_box_data[row_idx * len + 2]) *
+ target_box_data[idx * len + 2]) *
prior_box_width;
T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
- target_box_data[row_idx * len + 3]) *
+ target_box_data[idx * len + 3]) *
prior_box_height;
T target_box_center_x = prior_box_var_data[col_idx * len] *
- target_box_data[row_idx * len] *
- prior_box_width +
+ target_box_data[idx * len] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
- target_box_data[row_idx * len + 1] *
+ target_box_data[idx * len + 1] *
prior_box_height +
prior_box_center_y;
diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h
index 5e105aff52673af54970b5d7f05dc1bc39bbc901..3c7cac1cd17042994287effc31a918ebd4353c4c 100644
--- a/paddle/fluid/operators/box_coder_op.h
+++ b/paddle/fluid/operators/box_coder_op.h
@@ -89,6 +89,7 @@ class BoxCoderKernel : public framework::OpKernel {
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
+ size_t offset = i * col * len + j * len;
T prior_box_width =
prior_box_data[j * len + 2] - prior_box_data[j * len];
T prior_box_height =
@@ -99,20 +100,19 @@ class BoxCoderKernel : public framework::OpKernel {
(prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
T target_box_center_x = prior_box_var_data[j * len] *
- target_box_data[i * len] * prior_box_width +
+ target_box_data[offset] * prior_box_width +
prior_box_center_x;
T target_box_center_y = prior_box_var_data[j * len + 1] *
- target_box_data[i * len + 1] *
+ target_box_data[offset + 1] *
prior_box_height +
prior_box_center_y;
T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
- target_box_data[i * len + 2]) *
+ target_box_data[offset + 2]) *
prior_box_width;
T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
- target_box_data[i * len + 3]) *
+ target_box_data[offset + 3]) *
prior_box_height;
- size_t offset = i * col * len + j * len;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] = target_box_center_x + target_box_width / 2;
diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5892650c49e2e9d7345fb94465d124cff57f0a6f
--- /dev/null
+++ b/paddle/fluid/operators/channel_close_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace pf = paddle::framework;
+static constexpr char kChannel[] = "Channel";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCloseOp : public framework::OperatorBase {
+ public:
+ ChannelCloseOp(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ auto &inp = *scope.FindVar(Input(kChannel));
+
+ // Get the mutable version of the channel variable and closes it.
+ pf::ChannelHolder *ch = inp.GetMutable();
+ ch->close();
+ }
+};
+
+class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
+ public:
+ void operator()(framework::InferShapeContext *context) const override {
+ PADDLE_ENFORCE(context->HasInput("Channel"),
+ "The input of ChannelClose op must be set");
+ }
+};
+
+class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput(kChannel,
+ "The Channel Variable that should be closed by"
+ " the ChannelClose Op.");
+ AddComment(R"DOC(
+Channel Close Operator.
+
+This operator closes an open channel.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
+ paddle::framework::EmptyGradOpMaker,
+ paddle::operators::ChannelCloseOpMaker);
diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2fdfd0e1f24ed071bb57b7de8f99b2d5e1d3196
--- /dev/null
+++ b/paddle/fluid/operators/channel_create_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace pf = paddle::framework;
+
+static constexpr char kOutput[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+class ChannelCreateOp : public framework::OperatorBase {
+ public:
+ ChannelCreateOp(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ auto &out = *scope.FindVar(Output(kOutput));
+
+ // Determine the datatype and capacity of the channel to be created
+ // from the attributes provided.
+ auto dtype =
+ static_cast(Attr("data_type"));
+ auto capacity = Attr("capacity");
+
+ // Based on the datatype, create a new channel holder initialized with
+ // the given capacity. When capacity is 0, an unbuffered channel is
+ // created.
+ pf::ChannelHolder *ch = out.GetMutable();
+ if (dtype == framework::proto::VarType::LOD_TENSOR) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::READER) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::CHANNEL) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::BOOL) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::INT32) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::INT64) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::FP32) {
+ ch->Reset(capacity);
+ } else if (dtype == framework::proto::VarType::FP64) {
+ ch->Reset(capacity);
+ } else {
+ PADDLE_THROW(
+ "Data type %d is not in "
+ "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
+ "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
+ dtype);
+ }
+ }
+};
+
+class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
+ public:
+ void operator()(framework::InferShapeContext *context) const override {
+ PADDLE_ENFORCE(context->HasOutput(kOutput),
+ "The output of ChannelCreate op must be set");
+ context->SetOutputDim(kOutput, {1});
+ }
+};
+
+class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddOutput(kOutput,
+ "The object of a Channel type created by ChannelCreate Op.");
+ AddAttr("capacity", "The size of the buffer of Channel.")
+ .SetDefault(0);
+ AddAttr("data_type", "The data type of elements inside the Channel.");
+ AddComment(R"DOC(
+Channel Create Operator.
+
+This operator creates an object of the VarType Channel and returns it.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
+ paddle::framework::EmptyGradOpMaker,
+ paddle::operators::ChannelCreateOpMaker);
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c12b88e7a91c4ea7044223464a2f902db494d1a8
--- /dev/null
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char Status[] = "Status";
+static constexpr char Out[] = "Out";
+
+namespace paddle {
+namespace operators {
+
+void SetReceiveStatus(const platform::Place &dev_place,
+ framework::Variable &status_var, bool status) {
+ auto cpu = platform::CPUPlace();
+ auto status_tensor =
+ status_var.GetMutable()->mutable_data({1},
+ cpu);
+ status_tensor[0] = status;
+}
+
+bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
+ // Get type of channel and use that to call mutable data for Variable
+ auto type = framework::ToVarType(ch->Type());
+ if (type == framework::proto::VarType_Type_LOD_TENSOR)
+ return ch->Receive(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+ return ch->Receive(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+ return ch->Receive(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+ return ch->Receive(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_READER)
+ return ch->Receive(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_CHANNEL)
+ return ch->Receive(var->GetMutable());
+ else
+ PADDLE_THROW("ChannelReceive:Unsupported type");
+}
+
+class ChannelRecvOp : public framework::OperatorBase {
+ public:
+ ChannelRecvOp(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ void InferShape(framework::InferShapeContext *ctx) const {
+ PADDLE_ENFORCE(ctx->HasInput(Channel),
+ "Input(Channel) of ChannelRecvOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput(Out),
+ "Output(Out) of ChannelRecvOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput(Status),
+ "Output(Status) of ChannelRecvOp should not be null.");
+ ctx->SetOutputDim("Status", {1});
+ }
+
+ private:
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ // Get the channel holder created by channel_create op, passed as input.
+ framework::ChannelHolder *ch =
+ scope.FindVar(Input(Channel))->GetMutable();
+ auto output_var = scope.FindVar(Output(Out));
+ // Receive the data from the channel.
+ bool ok = ChannelReceive(ch, output_var);
+
+ // Set the status output of the `ChannelReceive` call.
+ SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+ }
+};
+
+class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput(Channel,
+ "(Channel) A variable which \"receives\" a value sent"
+ "to it by a channel_send op.")
+ .AsDuplicable();
+ AddOutput(Out,
+ "(Variable) Output Variable that will hold the data received"
+ " from the Channel")
+ .AsDuplicable();
+ AddOutput(Status,
+ "(Tensor) An LoD Tensor that returns a boolean status of the"
+ "result of the receive operation.")
+ .AsDuplicable();
+ AddComment(R"DOC(
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
+ paddle::framework::EmptyGradOpMaker,
+ paddle::operators::ChannelRecvOpMaker);
diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d7715ad229e821f02437246e3326063cb1ee757
--- /dev/null
+++ b/paddle/fluid/operators/channel_send_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/channel.h"
+#include
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+static constexpr char Channel[] = "Channel";
+static constexpr char X[] = "X";
+static constexpr char Status[] = "Status";
+static constexpr char copy[] = "copy";
+
+namespace paddle {
+namespace operators {
+
+void SetSendStatus(const platform::Place &dev_place,
+ framework::Variable &status_var, bool status) {
+ auto cpu = platform::CPUPlace();
+ auto status_tensor =
+ status_var.GetMutable()->mutable_data({1},
+ cpu);
+ status_tensor[0] = status;
+}
+
+bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
+ auto type = framework::ToVarType(var->Type());
+ if (type == framework::proto::VarType_Type_LOD_TENSOR)
+ return ch->Send(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+ return ch->Send(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+ return ch->Send(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+ return ch->Send(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_READER)
+ return ch->Send(var->GetMutable());
+ else if (type == framework::proto::VarType_Type_CHANNEL)
+ return ch->Send(var->GetMutable());
+ else
+ PADDLE_THROW("ChannelSend:Unsupported type");
+}
+
+class ChannelSendOp : public framework::OperatorBase {
+ public:
+ ChannelSendOp(const std::string &type,
+ const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ void InferShape(framework::InferShapeContext *ctx) const {
+ PADDLE_ENFORCE(ctx->HasInput(Channel),
+ "Input(Channel) of ChannelSendOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasInput(X),
+ "Input(X) of ChannelSendOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput(Status),
+ "Output(Status) of ChannelSendOp should not be null.");
+ ctx->SetOutputDim("Status", {1});
+ }
+
+ private:
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ // Get the channel holder created by channel_create op, passed as input.
+ framework::ChannelHolder *ch =
+ scope.FindVar(Input(Channel))->GetMutable();
+ auto input_var = scope.FindVar(Input(X));
+
+ // Send the input data through the channel.
+ bool ok = ChannelSend(ch, input_var);
+
+ // Set the status output of the `ChannelSend` call.
+ SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+ }
+};
+
+class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput(Channel,
+ "(Channel) A variable which \"sends\" the passed in value to "
+ "a listening receiver.")
+ .AsDuplicable();
+ AddInput(X, "(Variable) The value which gets sent by the channel.")
+ .AsDuplicable();
+ AddOutput(Status,
+ "(Tensor) An LoD Tensor that returns a boolean status of the"
+ "result of the send operation.")
+ .AsDuplicable();
+ AddAttr(copy, "(bool, default false) Should copy before send")
+ .SetDefault(false);
+ AddComment(R"DOC(
+)DOC");
+ }
+};
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
+ paddle::framework::EmptyGradOpMaker,
+ paddle::operators::ChannelSendOpMaker);
diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h
index bac5cdc99c0133b1e6da3f6a23bc0512ca4177f5..0b7c470fe72eb4270b8d5b2d227642d85683c16d 100644
--- a/paddle/fluid/operators/detail/strided_memcpy.h
+++ b/paddle/fluid/operators/detail/strided_memcpy.h
@@ -24,6 +24,29 @@ namespace detail {
template
struct StridedMemcpyFunctor;
+template
+struct StridedMemcpyFunctor {
+ void operator()(const platform::DeviceContext& dev_ctx, const T* src,
+ framework::Dim<0> src_stride, framework::Dim<0> dst_dim,
+ framework::Dim<0> dst_stride, T* dst) const {
+ auto place = dev_ctx.GetPlace();
+ if (platform::is_cpu_place(place)) {
+ auto& cpu_place = boost::get(place);
+ memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
+ } else {
+#ifdef PADDLE_WITH_CUDA
+ auto& gpu_place = boost::get(place);
+ auto& cuda_ctx =
+ reinterpret_cast(dev_ctx);
+ memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T),
+ cuda_ctx.stream());
+#else
+ PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+ }
+ }
+};
+
template
struct StridedMemcpyFunctor {
void operator()(const platform::DeviceContext& dev_ctx, const T* src,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 06bcd0be646e7dff72b46b1c9031464de21b3c6a..fe31bbaed44fced68b7b51dd2c2031950ec4247d 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -65,12 +65,17 @@ smaller than or equal to the dimensions of $X$.
There are two cases for this operator:
1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a subset of $X$.
+2. The shape of $Y$ is a contiguous subsequence of $X$. The trailing dimensions
+ of size 1 for $Y$ will be ignored for the consideration of subsequence.
+
For case 2:
+
$Y$ will be broadcasted to match the shape of $X$ and axis should be
set to index of the start dimension to broadcast $Y$ onto $X$.
+If axis is -1, it is treated as axis=rank(X)-rank(Y).
+
For example
.. code-block:: python
@@ -79,6 +84,7 @@ For example
shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+ shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
information. However, the output only shares the LoD information with input $X$.
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 5c7830353093ebbf6a5216be497ea6f1f1c21909..ffda53a383ced411415e528886a23f28f6a62648 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -20,6 +20,7 @@ limitations under the License. */
#ifdef __NVCC__
#include
+#include "paddle/fluid/platform/cuda_helper.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif
@@ -61,6 +62,19 @@ inline void get_mid_dims(const framework::DDim& x_dims,
}
}
+inline void trim_trailing_singular_dims(framework::DDim& dims) {
+ // Remove trailing dimensions of size 1 for y
+ auto actual_dims_size = dims.size();
+ for (; actual_dims_size != 0; --actual_dims_size) {
+ if (dims[actual_dims_size - 1] != 1) break;
+ }
+ if (actual_dims_size != dims.size()) {
+ auto actual_dims = framework::vectorize(dims);
+ actual_dims.resize(actual_dims_size);
+ dims = framework::make_ddim(actual_dims);
+ }
+}
+
template
class RowwiseTransformIterator;
template
@@ -263,44 +277,6 @@ class TransformFunctor {
} \
}
-template
-void ElementwiseCompute(const framework::ExecutionContext& ctx) {
- using Tensor = framework::Tensor;
-
- auto* x = ctx.Input("X");
- auto* y = ctx.Input("Y");
- auto* z = ctx.Output("Out");
- z->mutable_data(ctx.GetPlace());
-
- auto x_dims = x->dims();
- auto y_dims = y->dims();
- PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
- "Rank of first input must >= rank of second input.");
-
- if (x_dims == y_dims) {
- functor f;
- f.template Run(x, y, z, ctx);
- return;
- }
-
- int axis = ctx.Attr("axis");
- axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
- PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
- "Axis should be in range [0, x_dims)");
-
- int pre, n, post;
- get_mid_dims(x_dims, y_dims, axis, pre, n, post);
- if (post == 1) {
- functor f;
- f.template RunBroadCast(x, y, z, ctx, pre, n);
- return;
- } else {
- functor f;
- f.template RunBroadCast2(x, y, z, ctx, pre, n, post);
- return;
- }
-}
-
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD);
@@ -361,13 +337,10 @@ template
static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w,
DX_OP dx_op, DY_OP dy_op, T* dx, T* dy) {
- extern __shared__ char shm_buffer[];
- T* shm = reinterpret_cast(shm_buffer);
-
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
- shm[tid] = 0;
+ T val = 0;
do {
int x_offset = i * w + j;
@@ -375,22 +348,16 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
if (dy) {
- shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+ val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
i += ELEMWISE_MAX_BLOCK_DIM;
} while (i < h);
if (dy) {
- __syncthreads();
-
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
- // Sum, could be optimized
+ val = platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) {
- for (int k = 1; k < h; ++k) {
- shm[0] += shm[k];
- }
- dy[j] = shm[0];
+ dy[j] = val;
}
}
}
@@ -402,10 +369,8 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T* x,
T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
int gird_size = w;
- int shared_mem_size = block_size * sizeof(T);
- ElemwiseGradBroadcast1CUDAKernel<<>>(x, y, out, dout, h, w, dx_op,
- dy_op, dx, dy);
+ ElemwiseGradBroadcast1CUDAKernel<<>>(
+ x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
}
#endif
@@ -436,7 +401,6 @@ static void ElemwiseGradBroadcast2CPU(const T* x, const T* y, const T* out,
}
#ifdef __NVCC__
-
template
static __global__ void ElemwiseGradBroadcast2CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int pre, int n,
@@ -444,9 +408,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
- extern __shared__ char shm_buffer[];
- T* shm = reinterpret_cast(shm_buffer);
- shm[tid] = 0;
+ T val = 0;
int ttid = tid;
while (true) {
@@ -461,23 +423,18 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
}
if (dy != nullptr) {
- shm[tid] += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
+ val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
}
ttid += ELEMWISE_MAX_BLOCK_DIM;
}
if (dy) {
- __syncthreads();
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-
- // Sum, could be optimized
- if (tid == 0) {
- for (int i = 1; i < h; ++i) {
- shm[0] += shm[i];
- }
- dy[j] = shm[0];
+ val = platform::reduceSum(val, tid, h);
+ if (threadIdx.x == 0) {
+ dy[j] = val;
}
}
}
@@ -489,10 +446,8 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T* x,
DY_OP dy_op, T* dx, T* dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
int gird_size = n;
- int shared_mem_size = block_size * sizeof(T);
- ElemwiseGradBroadcast2CUDAKernel<<>>(x, y, out, dout, pre, n, post,
- dx_op, dy_op, dx, dy);
+ ElemwiseGradBroadcast2CUDAKernel<<>>(
+ x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
}
#endif
@@ -516,14 +471,10 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
auto x_dim = x.dims();
auto y_dim = y.dims();
- if (y_dim.size() == 1 && y_dim[0] == 1) {
- // y is a scalar
- auto extended_dims = framework::vectorize(x_dim);
- extended_dims.push_back(1);
- x_dim = framework::make_ddim(extended_dims);
- }
-
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
+ trim_trailing_singular_dims(y_dim);
+ axis = (y_dim.size() == 0) ? x_dim.size() : axis;
+
int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, pre, n, post);
if (post == 1) {
@@ -591,14 +542,9 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
return;
}
- if (y_dims.size() == 1 && y_dims[0] == 1) {
- // y is a scalar
- auto extended_dims = framework::vectorize(x_dims);
- extended_dims.push_back(1);
- x_dims = framework::make_ddim(extended_dims);
- }
-
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+ trim_trailing_singular_dims(y_dims);
+ axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
@@ -633,16 +579,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
return;
}
- if (y_dims.size() == 1 && y_dims[0] == 1) {
- // y is a scalar
- auto extended_dims = framework::vectorize(x_dims);
- extended_dims.push_back(1);
- x_dims = framework::make_ddim(extended_dims);
- }
-
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)");
+ trim_trailing_singular_dims(y_dims);
+ axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post);
diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfa797717d78aa72e1b931b6db6e153270b3424e
--- /dev/null
+++ b/paddle/fluid/operators/go_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector;
+
+static constexpr char kBlock[] = "sub_block";
+static constexpr char kX[] = "X";
+
+class GoOp : public framework::OperatorBase {
+ public:
+ GoOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+ void ExecuteOnThread(framework::Executor *executor,
+ framework::BlockDesc *block,
+ framework::Scope *scope) const {
+ framework::ProgramDesc *program = block->Program();
+ executor->Run(*program, scope, block->ID(), false /*create_local_scope*/);
+ }
+
+ void RunImpl(const framework::Scope &scope,
+ const platform::Place &dev_place) const override {
+ /*
+ * Determine the global scope. Create a new child scope.
+ * Within the child scope, add all the local variables relevant
+ * to that scope.
+ *
+ * Now go through all the inputs to the op to ensure that
+ * all of them are in the newly created scope. This is important
+ * to ensure that they don't get destroyed when the parent scope
+ * is deleted.
+ * */
+
+ // TODO(varunarora): Consider moving this root scope lookup to scope.h.
+ const framework::Scope *root_scope = &scope;
+ const framework::Scope *parent_scope = &(root_scope->parent());
+
+ while (parent_scope != nullptr) {
+ root_scope = parent_scope;
+ parent_scope = &(parent_scope->parent());
+ }
+
+ framework::BlockDesc *block = Attr(kBlock);
+ framework::Executor executor(dev_place);
+ framework::Scope &new_scope = root_scope->NewScope();
+
+ for (auto &var : block->AllVars()) {
+ new_scope.Var(var->Name());
+ }
+
+ auto &inputs = Inputs(kX);
+ for (size_t i = 0; i < inputs.size(); i++) {
+ PADDLE_ENFORCE_NOT_NULL(new_scope.FindVar(inputs.at(i)),
+ "All variables used in the go block "
+ "should be created in the global scope");
+ }
+
+ // Now execute the go op with the newly created scope.
+ std::thread go_thread([dev_place, block, &new_scope, this]() {
+ framework::Executor executor(dev_place);
+ ExecuteOnThread(&executor, block, &new_scope);
+ });
+ go_thread.detach();
+ }
+};
+
+class GoOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput(kX,
+ "A set of variables, which are required by operators inside the "
+ "block of Go Op.")
+ .AsDuplicable();
+ AddAttr(kBlock, "The block inside GoOp");
+ AddComment(R"DOC(
+)DOC");
+ }
+};
+
+// TODO(thuan): Look into Gradient Operator for GO_OP
+
+} // namespace operators
+} // namespace paddle
+
+REGISTER_OPERATOR(go, paddle::operators::GoOp,
+ paddle::framework::EmptyGradOpMaker,
+ paddle::operators::GoOpMaker);
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
index b7e9f4e2248882103a3926c309d6fc455442c917..0e81d60878dce747b047abbe4641b71462373b2b 100644
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ b/paddle/fluid/operators/mine_hard_examples_op.cc
@@ -247,7 +247,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input("ClsLoss")->type()),
- ctx.device_context());
+ platform::CPUPlace());
}
};
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
index 2565e7e9efad415c5e4db2489afa9553683b7b0a..c4e70cde6f8c6bdf1f28b010b0b90091772fdffb 100644
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/multiclass_nms_op.cc
@@ -62,7 +62,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
return framework::OpKernelType(
framework::ToDataType(
 ctx.Input<framework::LoDTensor>("Scores")->type()),
- ctx.device_context());
+ platform::CPUPlace());
}
};
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index 922b2bd237a1ec54aea895a05ccd78cd624e88ae..be7898c22190339e0717317807b91e038f4949f6 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -67,6 +67,14 @@ class PriorBoxOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
}
+
+ protected:
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+ platform::CPUPlace());
+ }
};
class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index b0e21e01ecc55d666f43aee621e9b5a05b347e1d..e1ce3d0c1bf11e9a623e4e9adc8f08f5069f4d94 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -59,6 +59,16 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
}
};
+class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
+ public:
+ void operator()(const framework::OpDesc &op_desc,
+ framework::BlockDesc *block) const override {
+ for (auto &out_var : op_desc.Output("Out")) {
+ block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
+ }
+ }
+};
+
class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
@@ -80,7 +90,8 @@ class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
ops::SplitSelectedRowsOpMaker,
- ops::SplitSelectedRowsGradMaker);
+ ops::SplitSelectedRowsGradMaker,
+ ops::SplitSelectedRowsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_selected_rows,
 ops::SplitSelectedRowsOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 28a668c86aa322803a65b916b4273181f5652e21..a1a743d94e204ca506c994f8fabb6bbf8c22cea5 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,15 @@
proto_library(profiler_proto SRCS profiler.proto)
+py_proto_compile(profiler_py_proto SRCS profiler.proto)
+
+add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+
+add_dependencies(profiler_py_proto profiler_py_proto_init)
+
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+ COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+ COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if(WITH_GPU)
cc_library(enforce SRCS enforce.cc DEPS)
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 881d611d4ac26f992036f639097815aff625227b..a4ea4f21e3c16c9292cf67863616924e9d9f8aba 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -62,5 +62,53 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
}
#endif
+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+ return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+ mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+ // TODO(zcd): The warp size should be taken from the
+ // parameters of the GPU but not specified as 32 simply.
+ // To make the reduceSum more efficiently,
+ // I use Warp-Level Parallelism and assume the Warp size
+ // is 32 which may be different for different GPU,
+ // but most card's warp size is 32.
+ __shared__ T shm[32];
+ const int warpSize = 32;
+ unsigned mask = 0u;
+ CREATE_SHFL_MASK(mask, tid < len);
+
+ for (int offset = warpSize / 2; offset > 0; offset /= 2)
+ val += __shfl_down_sync(mask, val, offset);
+
+ if (tid < warpSize) shm[tid] = 0;
+
+ __syncthreads();
+
+ if (tid % warpSize == 0) {
+ shm[tid / warpSize] = val;
+ }
+
+ CREATE_SHFL_MASK(mask, tid < warpSize);
+
+ if (tid < warpSize) {
+ val = shm[tid];
+ for (int offset = warpSize / 2; offset > 0; offset /= 2)
+ val += __shfl_down_sync(mask, val, offset);
+ }
+
+ return val;
+}
+
} // namespace platform
} // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 87bbdfa5fd5d9781d5f2b310d2142b1b4decbf9b..265343573b3c85335f0b1c9a55e60cf9cf13f3c3 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -13,8 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
+#include
+#include
#include