Commit a9855e4a authored by Liu Yiqun

Merge branch 'develop' into core_inference_fix_run

@@ -25,12 +25,3 @@ third_party/
 # clion workspace.
 cmake-build-*
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT "extern_mklml")
 SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR "mklml")
...
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
 "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
 add_dependencies(snappystream extern_snappystream)
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
...
@@ -25,7 +25,8 @@ ELSE(WIN32)
 SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 ExternalProject_Add(
 extern_zlib
...
@@ -244,14 +244,14 @@ function(cc_test TARGET_NAME)
 cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 add_executable(${TARGET_NAME} ${cc_test_SRCS})
 # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
 if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
 list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
 endif()
-add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
 add_test(NAME ${TARGET_NAME}
 COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 endfunction(cc_test)
@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
 set(multiValueArgs SRCS DEPS)
 cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
 add_test(${TARGET_NAME} ${TARGET_NAME})
 endif()
 endfunction(nv_test)
@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
 endif()
 add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
 set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
 add_test(${TARGET_NAME} ${TARGET_NAME})
 endif()
 endfunction(hip_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
 set(multiValueArgs SRCS DEPS ARGS ENVS)
 cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 add_test(NAME ${TARGET_NAME}
-COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
 ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
 endfunction()
...
# FileManager Design Document
## Goals
This document describes the design of a system named FileManager, which lets users upload their own training data for distributed training.
Its main features are:
- common command-line commands for managing files and directories
- resumable upload and download of large files
## Terminology
- PFS: short for `Paddlepaddle cloud File System`, an abstraction of the user's file storage space, as opposed to the local filesystem. It is currently built on CephFS.
- [CephFS](http://docs.ceph.com/docs/master/cephfs/): a POSIX-compatible file system.
- Chunk: the unit into which a file is logically divided.
## Modules
### Architecture
<img src="./src/filemanager.png" width="900">
### PFSClient
- Features: detailed design in [link](./pfs/pfsclient.md)
- Provides commands for users to manage their files
- Must run on multiple platforms
- Mutual authentication
PFSClient and Ingress authenticate each other over mutual TLS<sup>[tls](#tls)</sup>, so users must first register at `cloud.paddlepaddle.org` to apply for a user space and download the generated CA (certificate authority), Key, and CRT (CA-signed certificate) before they can use PFSClient.
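The sketch below illustrates, in Go, how PFSClient might load these credentials to talk to Ingress over mutual TLS. The file names and the client construction are illustrative assumptions, not part of the actual implementation.
```
package pfsclient

import (
	"crypto/tls"
	"crypto/x509"
	"io/ioutil"
	"net/http"
)

// NewMutualTLSClient builds an HTTP client that presents the user's
// certificate to Ingress and verifies Ingress against the downloaded CA.
// The file names are placeholders for the credentials downloaded from
// cloud.paddlepaddle.org.
func NewMutualTLSClient(caFile, crtFile, keyFile string) (*http.Client, error) {
	caPEM, err := ioutil.ReadFile(caFile)
	if err != nil {
		return nil, err
	}
	pool := x509.NewCertPool()
	pool.AppendCertsFromPEM(caPEM)

	cert, err := tls.LoadX509KeyPair(crtFile, keyFile)
	if err != nil {
		return nil, err
	}
	cfg := &tls.Config{
		RootCAs:      pool,                    // verify the server (Ingress)
		Certificates: []tls.Certificate{cert}, // present the client certificate
	}
	return &http.Client{Transport: &http.Transport{TLSClientConfig: cfg}}, nil
}
```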
### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
- Features:
Layer-7 reverse proxying and sticky-session-based load balancing.
- Passing through the user's identity
Ingress must forward the PFSClient's identity to PFSServer; see [link](http://www.integralist.co.uk/posts/clientcertauth.html#3) for how to configure this.
### PFSServer
PFSServer exposes a RESTful API that receives and handles file management requests from PFSClient and returns the results. A usage sketch follows the endpoint list below.
RESTful API
- /api/v1/files
  - `GET /api/v1/files`: Get metadata of files or directories.
  - `POST /api/v1/files`: Create files or directories.
  - `PATCH /api/v1/files`: Update files or directories.
  - `DELETE /api/v1/files`: Delete files or directories.
- /api/v1/file/chunks
  - `GET /api/v1/file/chunks`: Get chunk metadata of a file.
- /api/v1/storage/files
  - `GET /api/v1/storage/files`: Download files or directories.
  - `POST /api/v1/storage/files`: Upload files or directories.
- /api/v1/storage/file/chunks
  - `GET /api/v1/storage/file/chunks`: Download chunk data.
  - `POST /api/v1/storage/file/chunks`: Upload chunk data.
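As a usage sketch of the metadata endpoints above, a client might fetch a file's chunk metadata as follows; the query parameter and the response fields are assumptions for illustration only, not the actual PFSServer schema.
```
package pfsclient

import (
	"encoding/json"
	"net/http"
	"net/url"
)

// ChunkMeta mirrors the per-chunk information a client needs in order to
// decide whether a chunk must be re-transferred. The field names are
// illustrative; the real schema is defined by the PFSServer API.
type ChunkMeta struct {
	Offset   int64  `json:"offset"`
	Len      uint32 `json:"len"`
	Checksum uint32 `json:"checksum"`
}

// GetChunkMeta fetches the chunk metadata of a PFS file, e.g.
// GET /api/v1/file/chunks?path=/pfs/$DATACENTER/home/$USER/file
func GetChunkMeta(client *http.Client, base, path string) ([]ChunkMeta, error) {
	resp, err := client.Get(base + "/api/v1/file/chunks?path=" + url.QueryEscape(path))
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	var chunks []ChunkMeta
	if err := json.NewDecoder(resp.Body).Decode(&chunks); err != nil {
		return nil, err
	}
	return chunks, nil
}
```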
## File Transfer Optimization
### Chunked file transfer
User files can be large, so uploading them to the cloud or downloading them locally may take a long time, and the network may be unstable during the transfer. To cope with this, we introduce the concept of a Chunk: a Chunk consists of its file offset, the data, the data length, and a checksum. Uploads and downloads are implemented as operations on Chunks. Because a Chunk is small (256KB by default), a single transfer finishes quickly and is less likely to fail. After the last Chunk has been transferred, PFSClient checks that the MD5 of the destination file matches that of the source file.
A typical Chunk looks like this:
```
type Chunk struct {
fileOffset int64
checksum uint32
len uint32
data []byte
}
```
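A minimal sketch of how a Chunk could be created and verified on the receiving side, assuming CRC32 is used as the checksum (the design above does not mandate a particular checksum algorithm):
```
package main

import (
	"fmt"
	"hash/crc32"
)

type Chunk struct {
	fileOffset int64
	checksum   uint32
	len        uint32
	data       []byte
}

// newChunk wraps a piece of file data starting at offset into a Chunk
// and records its checksum so the receiver can verify it.
func newChunk(offset int64, data []byte) Chunk {
	return Chunk{
		fileOffset: offset,
		checksum:   crc32.ChecksumIEEE(data),
		len:        uint32(len(data)),
		data:       data,
	}
}

// verify recomputes the checksum on the receiving side.
func (c Chunk) verify() bool {
	return crc32.ChecksumIEEE(c.data) == c.checksum
}

func main() {
	c := newChunk(0, []byte("hello chunk"))
	fmt.Println("chunk ok:", c.verify())
}
```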
### Generating sparse files
When the destination file does not exist or its size differs from the source file, [Fallocate](https://golang.org/pkg/syscall/#Fallocate) can be used to create a sparse file, after which multiple Chunks can be written concurrently.
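A hedged sketch of this step, assuming a Linux host: the destination is pre-allocated with `syscall.Fallocate` and each Chunk is then written at its own offset, so several Chunks can be written concurrently. The path and size are illustrative only.
```
package main

import (
	"log"
	"os"
	"syscall"
)

// prepareSparseFile creates (or opens) the destination file and reserves
// `size` bytes with Fallocate, so that chunks can later be written at
// arbitrary offsets by concurrent workers. Linux only.
func prepareSparseFile(path string, size int64) (*os.File, error) {
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644)
	if err != nil {
		return nil, err
	}
	if err := syscall.Fallocate(int(f.Fd()), 0, 0, size); err != nil {
		f.Close()
		return nil, err
	}
	return f, nil
}

// writeChunkAt writes one chunk's data at its file offset; concurrent
// goroutines can call this for different chunks of the same file.
func writeChunkAt(f *os.File, offset int64, data []byte) error {
	_, err := f.WriteAt(data, offset)
	return err
}

func main() {
	f, err := prepareSparseFile("/tmp/dest.bin", 1<<20) // illustrative path and size
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	if err := writeChunkAt(f, 0, []byte("first chunk")); err != nil {
		log.Fatal(err)
	}
}
```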
### Overwriting only the inconsistent parts
The key to the transfer is that PFSClient compares the checksums of the source and destination Chunks and downloads or uploads only the Chunks whose checksums differ, so parts that have already been transferred successfully do not need to be transferred again.
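The resume logic can be sketched as follows: compare the per-chunk checksums reported for the source and destination and keep only the chunks that differ. The `ChunkMeta` type and the example values are assumptions for illustration.
```
package main

import "fmt"

// ChunkMeta is the per-chunk information compared on both sides;
// the field names are illustrative, not the actual PFS schema.
type ChunkMeta struct {
	Offset   int64
	Checksum uint32
}

// chunksToTransfer returns the source chunks whose checksum does not
// match the destination chunk at the same offset; only these need to
// be (re-)transferred after an interrupted upload or download.
func chunksToTransfer(src, dst []ChunkMeta) []ChunkMeta {
	dstByOffset := make(map[int64]uint32, len(dst))
	for _, c := range dst {
		dstByOffset[c.Offset] = c.Checksum
	}
	var todo []ChunkMeta
	for _, c := range src {
		if sum, ok := dstByOffset[c.Offset]; !ok || sum != c.Checksum {
			todo = append(todo, c)
		}
	}
	return todo
}

func main() {
	src := []ChunkMeta{{0, 111}, {262144, 222}, {524288, 333}}
	dst := []ChunkMeta{{0, 111}, {262144, 999}} // second chunk corrupted, third missing
	fmt.Println(chunksToTransfer(src, dst))     // chunks at offsets 262144 and 524288
}
```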
## User Workflow
See [link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
## Scaffolding
[swagger](https://github.com/swagger-api/swagger-codegen) is used to generate the skeletons of PFSClient and PFSServer, so that we can focus on the logic itself.
## References
- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
- [linux man document](https://linux.die.net/man/)
# PFSClient
## Description
The `pfs` command is a command-line interface for managing your files on PaddlePaddle Cloud.
## Synopsis
```
paddle [options] pfs <subcommand> [parameters]
```
## Options
```
--profile (string)
Use a specific profile from your credential file.
--help
Display more information about the command.
--version
Output version information and exit.
--debug
Show detailed debugging logs.
--only-show-errors (boolean)
Only errors and warnings are displayed. All other output is suppressed.
```
## Path Arguments
When using a command, we need to specify path arguments. There are two types of path arguments: `localpath` and `pfspath`.
A `pfspath` begins with `/pfs`, e.g. `/pfs/$DATACENTER/home/$USER/folder`.
[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters.
## Order of Path Arguments
Commonly, if there are two path arguments, the first is the source, and the second is the destination.
## Subcommands
- rm - remove files or directories
```
Synopsis:
rm [-r] [-v] <PFSPath> ...
Options:
-r
Remove directories and their contents recursively
-v
Cause rm to be verbose, showing files after they are removed.
Examples:
paddle pfs rm /pfs/$DATACENTER/home/$USER/file
paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
```
- mv - move (rename) files
```
Synopsis:
mv [-f | -n] [-v] <LocalPath> <PFSPath>
mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
mv [-f | -n] [-v] <PFSPath> <LocalPath>
mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
mv [-f | -n] [-v] <PFSPath> <PFSPath>
mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
Options:
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause mv to be verbose, showing files after they are moved.
Examples:
paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
```
- cp - copy files or directories
```
Synopsis:
cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve-links] <LocalPath> ... <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve-links] <PFSPath> ... <PFSPath>
Options:
-r
Copy directories recursively
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause cp to be verbose, showing files after they are copied.
--preserve-links
Preserve links when copying them
Examples:
paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
```
- ls - list files
```
Synopsis:
ls [-R] <PFSPath> ...
Options:
-R
List directory(ies) recursively
Examples:
paddle pfs ls /pfs/$DATACENTER/home/$USER/file
paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
```
- mkdir - make directory(ies)
Create intermediate directory(ies) as required.
```
Synopsis:
mkdir <PFSPath> ...
Examples:
paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
```
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
@@ -9,5 +9,5 @@
 use_eigen_cn.md
 name_convention.md
 support_new_device.md
-releasing_process.md
+releasing_process_cn.md
 op_markdown_format.md
@@ -9,5 +9,5 @@ Development
 use_eigen_en.md
 name_convention.md
 support_new_device.md
-releasing_process.md
+releasing_process_en.md
 op_markdown_format.md
@@ -10,19 +10,10 @@ Each time PaddlePaddle releases a new version, the following process is followed:
 * Use the Regression Test List as a checklist to verify that this release works correctly.
 * If it fails, record all failing cases, fix all bugs on the `release/[version]` branch, bump the patch number, and go back to step two.
 * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
-* Build this version's python wheel packages and publish them to pypi.
-* Since pypi.python.org follows the strict naming rules of [PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel must be renamed before uploading with twine, e.g. change `linux_x86_64` to `manylinux1_x86_64`.
-* The package names on pypi are paddlepaddle and paddlepaddle_gpu; to upload a GPU package, change `name: "paddlepaddle_gpu"` in build/python/setup.py and rebuild the wheel: `python setup.py bdist_wheel`.
-* Upload with:
-```
-cd build/python
-pip install twine
-twine upload dist/[package to upload]
-```
-* Build this version's Docker release images and publish them to dockerhub. If this fails, fix the Docker image build, bump the patch number, and go back to step two.
-1. After step three, merge the `release/[version]` branch into master and delete the `release/[version]` branch. Tag the merge commit on master with the version number, then merge `master` back into `develop`.
-1. Collaborate on writing the Release Note
+* Publish this version's python wheel packages to pypi.
+* Update the Docker images (see the detailed steps below).
+1. After step three, merge the `release/[version]` branch into master and tag the merge commit on master with the version number, then merge `master` back into `develop`.
+1. Collaborate on writing the Release Note.
 Note that:
@@ -31,13 +22,18 @@ Each time PaddlePaddle releases a new version, the following process is followed:
 ## Publishing wheel packages to pypi
-Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) to run the automated binary builds. As shown in the figure below, choose the versions to publish (usually one CPU version and one GPU version), click the "..." button to the right of "run", switch to the second tab (Changes) in the pop-up box, choose the branch to release (here 0.11.0), and click "Run Build". After the build finishes, the three generated binaries (CAPI, `cp27m`, and `cp27mu`) can be found in the "Artifacts" drop-down on that page and uploaded with `twine` as described above.
+1. Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview) to run the automated binary builds. As shown in the figure below, choose the versions to publish (usually one CPU version and one GPU version), click the "..." button to the right of "run", switch to the second tab (Changes) in the pop-up box, choose the branch to release (here 0.11.0), and click "Run Build".
 <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build finishes, the three generated binaries (CAPI, `cp27m`, and `cp27mu`) can be found in the "Artifacts" drop-down on that page.
+1. Since pypi.python.org follows the strict naming rules of [PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel must be renamed before uploading with twine, e.g. change `linux_x86_64` to `manylinux1_x86_64`.
+1. Upload:
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
 * Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools to support more Linux distributions; if you need to build manually, you can use these images as well. They can also be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/.
@@ -48,10 +44,20 @@ Each time PaddlePaddle releases a new version, the following process is followed:
 After the PaddlePaddle CI wheel build above finishes, the Docker images are pushed to DockerHub automatically, so publishing the Docker images only requires tagging the automatically pushed images with the version number:
-1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check whether the latest tag was updated after the wheel build above completed.
-1. Run `docker pull paddlepaddle/paddle:[latest tag]`; the latest tag can be latest, latest-gpu, etc.
-1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. Run `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+The image tags that need to be updated are:
+* `[version]`: CPU version
+* `[version]-openblas`: openblas version
+* `[version]-gpu`: GPU version (CUDA 8.0, cuDNN 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: images for other CUDA/cuDNN versions
+Afterwards, go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ to check that the release succeeded.
 ## PaddlePaddle branching conventions
@@ -76,7 +82,7 @@ PaddlePaddle development follows [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
 ### All chapters of the PaddlePaddle Book
-Before each release, the correctness of every chapter of the PaddlePaddle Book must be verified, including training with the current `paddle_trainer` and training purely in `Python`.
+Before each release, the correctness of every chapter of the PaddlePaddle Book must be verified, including training with the current `paddle_trainer` and training purely in `Python` (V2 and Fluid).
 <table>
 <thead>
...
# PaddlePaddle Releasing Process
PaddlePaddle manages its branches using the git-flow branching model and follows [Semantic Versioning](http://semver.org/) for its version numbers.
Each time we release a new PaddlePaddle version, we should follow the below steps:
1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
1. Push a new tag on the release branch. The tag name should be of the form `[version]rc.[patch]`: the
first tag should be `0.10.0rc1`, the second `0.10.0rc2`, and so on.
1. After that, we should do:
* Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
that this release has no major bugs.
* If the regression tests fail, we must fix those bugs and create a new `release/[version]`
branch from the previous release branch.
* Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
* Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
* Update the Docker images (see below instructions for detail).
1. After the above steps, merge the `release/[version]` branch into master, tag the merge commit on
master with the version number, then merge `master` back into `develop`.
1. Update the Release Note.
***NOTE:***
* Do ***NOT*** merge commits from the develop branch into release branches; a release branch should contain
only the features intended for that release, so that we can test exactly that version.
* If we need to fix bugs on a release branch, the fix must be merged into the master, develop, and release branches.
## Publish Wheel Packages to pypi
1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to build all the wheel packages that need to be published. As shown in the following picture, choose a build
version, click the "..." button to the right of the "Run" button, switch to the second tab in the
pop-up box, choose the current release branch, and click the "Run Build" button. You may repeat this
step to start builds for different versions.
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
1. After the build succeeds, download the outputs under "Artifacts", including the CAPI, `cp27m`, and `cp27mu` packages.
1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
upload a package using `twine`, we need to rename the platform suffix of the wheel from `linux_x86_64` to
`manylinux1_x86_64`.
1. Start the upload:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
* NOTE: We use a special Docker image to build our releases so that more Linux distributions are supported; you can
download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ or build it using the
scripts under `tools/manylinux1`.
* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
old version; you must bump the version number before uploading a new one.
## Publish Docker Images
Our CI tool pushes the latest images to DockerHub, so we only need to tag and push a versioned image:
```
docker pull [image]:latest
docker tag [image]:latest [image]:[version]
docker push [image]:[version]
```
Tags that need to be updated are:
* `[version]`: CPU only version image
* `[version]-openblas`: openblas version image
* `[version]-gpu`: GPU version (CUDA 8.0, cuDNN 5)
* `[version]-gpu-[cudaver]-[cudnnver]`: tags for other CUDA/cuDNN versions
You can then check the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
## Branching Model
We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
with some modifications:
* The `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
* The `develop` branch is for development. Each commit on the develop branch has passed the CI unit tests, but no
regression tests are run.
* The `release/[version]` branches are used to publish each release. The latest release branch receives
bug fixes only for that version, but no feature updates.
* Developer forks are not required to follow the
[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
branching model; every fork works like a feature branch.
* Advice: use the fork's develop branch to sync up with the main repo's develop branch.
* Advice: create a new branch from the fork's develop branch to start developing.
* Use that branch on the developer's fork to create pull requests and start reviews.
* Developers can push new commits to that branch while the pull request is open.
* Bug fixes also start from the developer's forked repo, and bug-fix branches can be merged into
`master`, `develop`, and the `release` branches.
## PaddlePaddle Regression Test List
### All Chapters of PaddlePaddle Book
We need to guarantee that all the chapters of the PaddlePaddle Book run correctly, including
V1 (`paddle_trainer`), V2, and Fluid training.
<table>
<thead>
<tr>
<th></th>
<th>Linear Regression</th>
<th>Recognize Digits</th>
<th>Image Classification</th>
<th>Word2Vec</th>
<th>Personalized Recommendation</th>
<th>Sentiment Analysis</th>
<th>Semantic Role Labeling</th>
<th>Machine Translation</th>
</tr>
</thead>
<tbody>
<tr>
<td>API.V2 + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>API.V2 + Ubuntu + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + CPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</tbody>
</table>
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
...
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
...
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
 ${START_END}
 )
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
-COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 DEPENDS _swig_paddle
 )
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
 if(WITH_TESTING)
 IF(NOT PY_PIP_FOUND)
...
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
 py_test(testTrain SRCS testTrain.py)
 py_test(testMatrix SRCS testMatrix.py)
 py_test(testVector SRCS testVector.py)
...
@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 if(WITH_GPU)
-nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
 else()
-cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
 endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -21,9 +21,9 @@ endif()
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
-COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
 COMMENT "Copy generated python proto into directory paddle/fluid/proto."
 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <deque>
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>
@@ -96,6 +97,8 @@ class BlockDesc {
 */
 void RemoveOp(size_t s, size_t e);
+void RemoveVar(const std::string &name) { vars_.erase(name); }
 std::vector<OpDesc *> AllOps() const;
 size_t OpSize() const { return ops_.size(); }
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h> // for size_t
-#include <condition_variable>
+#include <condition_variable> // NOLINT
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"
@@ -216,7 +216,8 @@ class ChannelHolder {
 template <typename T>
 struct PlaceholderImpl : public Placeholder {
-PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+explicit PlaceholderImpl(size_t buffer_size)
+: type_(std::type_index(typeid(T))) {
 channel_.reset(MakeChannel<T>(buffer_size));
 }
...
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h> // for size_t
 #include <atomic>
-#include <condition_variable>
+#include <condition_variable> // NOLINT
 #include <deque>
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
 virtual void Unlock();
 virtual bool IsClosed();
 virtual void Close();
-ChannelImpl(size_t);
+explicit ChannelImpl(size_t);
 virtual ~ChannelImpl();
 virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
 const void *referrer; // TODO(thuan): figure out better way to do this
 std::function<bool(ChannelAction)> callback;
-QueueMessage(T *item)
+explicit QueueMessage(T *item)
 : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
 QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel<T> {
 }
 std::shared_ptr<QueueMessage> get_first_message(
-std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
-while (!queue.empty()) {
+std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+while (!queue->empty()) {
 // Check whether this message was added by Select
 // If this was added by Select then execute the callback
 // to check if you can execute this message. The callback
 // can return false if some other case was executed in Select.
 // In that case just discard this QueueMessage and process next.
-std::shared_ptr<QueueMessage> m = queue.front();
-queue.pop_front();
+std::shared_ptr<QueueMessage> m = queue->front();
+queue->pop_front();
 if (m->callback == nullptr || m->callback(action)) return m;
 }
 return nullptr;
@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) {
 // to send to the receiver, bypassing the channel buffer if any
 if (!recvq.empty()) {
 std::shared_ptr<QueueMessage> m =
-get_first_message(recvq, ChannelAction::SEND);
+get_first_message(&recvq, ChannelAction::SEND);
 if (m != nullptr) {
 *(m->data) = std::move(*item);
@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) {
 // buffer and move front of send queue to the buffer
 if (!sendq.empty()) {
 std::shared_ptr<QueueMessage> m =
-get_first_message(sendq, ChannelAction::RECEIVE);
+get_first_message(&sendq, ChannelAction::RECEIVE);
 if (buf_.size() > 0) {
 // Case 1 : Channel is Buffered
 // Do Data transfer from front of buffer
@@ -219,9 +219,10 @@ bool ChannelImpl<T>::Receive(T *item) {
 if (m != nullptr) {
 *item = std::move(*(m->data));
 m->Notify();
-} else
+} else {
 return recv_return(Receive(item));
 }
+}
 return recv_return(true);
 }
...
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include <chrono> #include <chrono> // NOLINT
#include <thread> #include <thread> // NOLINT
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel; using paddle::framework::Channel;
...@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() { std::thread t([&]() {
// Try to write more than buffer size. // Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) { for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size) if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations ch->Send(&i); // should block after 10 iterations
else { } else {
bool is_exception = false; bool is_exception = false;
try { try {
ch->Send(&i); ch->Send(&i);
...@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { ...@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
} }
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { ...@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
...@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { ...@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked. // If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++; if (!thread_ended[i]) ct++;
} }
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// If ch is UnBuffered, all the threads should be blocked. // If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
if (isBuffered) { if (isBuffered) {
// Verify that only 1 send was successful // Verify that only 1 send was successful
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
// Only 1 send must be successful // Only 1 send must be successful
EXPECT_EQ(ct, 1); EXPECT_EQ(ct, 1);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that closing a buffered channel also unblocks // This tests that closing a buffered channel also unblocks
...@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { ...@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks // This tests that destroying a channel unblocks
// any senders waiting for channel to have write space // any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked // If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++; if (thread_ended[i] == false) ct++;
} }
// Atleast 4 threads must be blocked // Atleast 4 threads must be blocked
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
// Count number of successful sends // Count number of successful sends
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
...@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
} }
// Join all threads // Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that destroying a channel also unblocks // This tests that destroying a channel also unblocks
// any receivers waiting on the channel // any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<int> *ch) { void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) { ...@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked // Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
// delete the channel // delete the channel
delete ch; delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
...@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) { ...@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
} }
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { ...@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
...@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { ...@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked. // If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++; if (!thread_ended[i]) ct++;
} }
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// If ch is UnBuffered, all the threads should be blocked. // If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
if (isBuffered) { if (isBuffered) {
// Verify that only 1 send was successful // Verify that only 1 send was successful
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
// Only 1 send must be successful // Only 1 send must be successful
EXPECT_EQ(ct, 1); EXPECT_EQ(ct, 1);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that closing a channelholder unblocks // This tests that closing a channelholder unblocks
...@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) { ...@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks // This tests that destroying a channelholder unblocks
// any senders waiting for channel // any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked // If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++; if (thread_ended[i] == false) ct++;
} }
// Atleast 4 threads must be blocked // Atleast 4 threads must be blocked
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
// Count the number of successful sends // Count the number of successful sends
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
...@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
} }
// Join all threads // Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that destroying a channelholder also unblocks // This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel // any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because there are no writers // Launches threads that try to read and are blocked because there are no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { ...@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked // Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
// delete the channel // delete the channel
delete ch; delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) { TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
...@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) { ...@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests closing a channelholder many times. // This tests closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) { void ChannelHolderManyTimesClose(ChannelHolder *ch) {
const int num_threads = 15; const int kNumThreads = 15;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to send data to channel. // Launches threads that try to send data to channel.
for (size_t i = 0; i < num_threads / 3; i++) { for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended) { [&](bool *ended) {
...@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
} }
// Launches threads that try to receive data from the channel. // Launches threads that try to receive data from the channel.
for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) { for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
} }
// Launches threads that try to close the channel. // Launches threads that try to close the channel.
for (size_t i = 2 * num_threads / 3; i < num_threads; i++) { for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked // Verify that all threads are unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
EXPECT_TRUE(ch->IsClosed()); EXPECT_TRUE(ch->IsClosed());
// delete the channel // delete the channel
delete ch; delete ch;
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) { TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
......
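All of the channel tests above follow one unblock-and-check pattern: spawn several threads that block on a channel operation, sleep, assert the per-thread flags are still false, then close or destroy the channel and assert every flag flipped to true. A self-contained sketch of that pattern (plain C++11 plus gtest; a condition_variable stands in for the channel, so no Paddle types are assumed and this is not part of the diff itself):

#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "gtest/gtest.h"

void UnblockAndCheckPatternSketch() {
  const size_t kNumThreads = 5;
  std::mutex mu;
  std::condition_variable cv;
  bool closed = false;  // stands in for "the channel was closed"
  std::thread t[kNumThreads];
  bool thread_ended[kNumThreads];
  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *ended) {
          std::unique_lock<std::mutex> lock(mu);
          cv.wait(lock, [&] { return closed; });  // blocks like Send/Receive
          *ended = true;
        },
        &thread_ended[i]);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
  for (size_t i = 0; i < kNumThreads; i++) EXPECT_EQ(thread_ended[i], false);
  {
    std::lock_guard<std::mutex> lock(mu);
    closed = true;  // "close the channel"
  }
  cv.notify_all();  // every blocked thread must wake up
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
  for (size_t i = 0; i < kNumThreads; i++) EXPECT_EQ(thread_ended[i], true);
  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}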
...@@ -16,6 +16,6 @@ else() ...@@ -16,6 +16,6 @@ else()
endif() endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
...@@ -142,6 +142,7 @@ class LoDTensor : public Tensor { ...@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1; return (lod_)[level].size() - 1;
} }
// Split LoDTensor and copy to each place specified in places.
std::vector<LoDTensor> SplitLoDTensor( std::vector<LoDTensor> SplitLoDTensor(
const std::vector<platform::Place> places) const; const std::vector<platform::Place> places) const;
......
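The new comment documents the behavior that ParallelExecutor::SplitTensorToPlaces relies on further down in this diff. A hedged usage sketch of the declaration above; only the signature comes from this diff, and the per-place copy semantics follow the comment:

void SplitFeedSketch(const paddle::framework::LoDTensor &batch) {
  std::vector<paddle::platform::Place> places = {
      paddle::platform::CUDAPlace(0), paddle::platform::CUDAPlace(1)};
  // Each returned shard is a copy of a slice of `batch` placed on places[i].
  std::vector<paddle::framework::LoDTensor> shards = batch.SplitLoDTensor(places);
  // Feed shards[i] to the local scope of device i, as SplitTensorToPlaces does.
}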
...@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { ...@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
}; };
proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
if (var->IsType<framework::LoDTensor>()) {
return framework::ToDataType(var->Get<framework::LoDTensor>().type());
} else if (var->IsType<framework::SelectedRows>()) {
return framework::ToDataType(
var->Get<framework::SelectedRows>().value().type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
}
static DDim GetDims(const Scope& scope, const std::string& name) { static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
if (var == nullptr) { if (var == nullptr) {
......
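GetDataTypeOfVar gives callers a single switch point for the element type, whether the variable holds a dense LoDTensor or a sparse SelectedRows. A minimal usage sketch (the variable name "fc_0.w_0" is hypothetical and used only for illustration):

void DtypeDispatchSketch(const paddle::framework::Scope &scope) {
  const paddle::framework::Variable *var = scope.FindVar("fc_0.w_0");
  auto dtype = paddle::framework::GetDataTypeOfVar(var);
  if (dtype == paddle::framework::proto::VarType::FP32) {
    // ... dispatch the float32 code path ...
  }
}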
...@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) { ...@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix; return var_name + kGradVarSuffix;
} }
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
class OperatorBase; class OperatorBase;
class ExecutionContext; class ExecutionContext;
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/profiler.h"
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -24,6 +23,7 @@ limitations under the License. */ ...@@ -24,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate { ...@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
#endif #endif
}; };
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_;
}
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(
size_t num_threads, bool use_event, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::unordered_set<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope, bool allow_op_delay) const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
// Step 1. RunStartupProgram and Bcast the params to devs. // Step 1. Bcast the params to devs.
Executor exe(places[0]);
exe.Run(startup_program, scope, 0);
// Create local scopes // Create local scopes
if (local_scopes.empty()) {
for (size_t i = 0; i < member_->places_.size(); ++i) { for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(&scope->NewScope()); member_->local_scopes_.push_back(&scope->NewScope());
} }
} else {
PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
for (size_t i = 0; i < member_->places_.size(); ++i) {
member_->local_scopes_.push_back(local_scopes[i]);
}
}
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
#endif #endif
if (platform::is_gpu_place(places[0]) && if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
member_->local_scopes_.size() != 1) { // Is CUDA local_scopes.empty()) { // Is CUDA
BCastParamsToGPUs(startup_program); BCastParamsToGPUs(bcast_vars);
} }
// Startup Program has been run. All local scopes have correct parameters. // Startup Program has been run. All local scopes have correct parameters.
...@@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor( ...@@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor(
} }
void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BCastParamsToGPUs(
const ProgramDesc &startup_program) const { const std::unordered_set<std::string> &vars) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto *main_scope = member_->local_scopes_[0]; auto *main_scope = member_->local_scopes_[0];
for (auto *var_desc : startup_program.Block(0).AllVars()) { for (auto &var : vars) {
size_t idx = var_desc->Name().find("@GRAD"); auto *main_var = main_scope->FindVar(var);
if (idx != std::string::npos) continue; if (!main_var->IsType<LoDTensor>()) {
if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { continue;
auto &main_tensor = }
main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
...@@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs(
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type()); buffer = t->mutable_data(place, main_tensor.type());
} }
...@@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs(
platform::CPUPlace cpu; platform::CPUPlace cpu;
for (size_t i = 1; i < member_->places_.size(); ++i) { for (size_t i = 1; i < member_->places_.size(); ++i) {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>(); auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
t->Resize(dims); t->Resize(dims);
t->mutable_data(cpu, main_tensor.type()); t->mutable_data(cpu, main_tensor.type());
paddle::framework::TensorCopy(main_tensor, cpu, t); paddle::framework::TensorCopy(main_tensor, cpu, t);
} }
} }
}
member_->nccl_ctxs_->WaitAll(); member_->nccl_ctxs_->WaitAll();
} }
#else #else
...@@ -150,13 +159,30 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -150,13 +159,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif #endif
} }
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors, void ParallelExecutor::Run(
const std::string &fetched_var_name) { const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name,
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0); platform::RecordBlock b(0);
SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors); auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() = *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data; fetch_data;
} }
void ParallelExecutor::SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
for (auto it : feed_tensors) {
auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var?
member_->local_scopes_[j]
->Var(it.first)
->GetMutable<LoDTensor>()
->ShareDataWith(lod_tensors[j]);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -36,18 +36,25 @@ class ParallelExecutor { ...@@ -36,18 +36,25 @@ class ParallelExecutor {
explicit ParallelExecutor(size_t num_threads, bool use_event, explicit ParallelExecutor(size_t num_threads, bool use_event,
const std::vector<platform::Place>& places, const std::vector<platform::Place>& places,
const std::unordered_set<std::string>& params, const std::unordered_set<std::string>& params,
const ProgramDesc& startup_program, const std::unordered_set<std::string>& bcast_vars,
const ProgramDesc& main_program, const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope, const std::string& loss_var_name, Scope* scope,
const std::vector<Scope*>& local_scopes,
bool allow_op_delay); bool allow_op_delay);
std::vector<Scope*>& GetLocalScopes();
void Run(const std::vector<std::string>& fetch_tensors, void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name = "fetched_var"); const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
private: private:
void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const; void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
}; };
} // namespace framework } // namespace framework
......
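With the two hunks above, the caller (rather than ParallelExecutor) runs the startup program, may share local scopes between executors via GetLocalScopes(), and passes feed tensors directly to Run(). A hedged call-site sketch built only from the signatures in this diff; the thread count, the "fetched_var" name and the feed contents are placeholders:

void RunParallelSketch(const paddle::framework::ProgramDesc &startup_program,
                       const paddle::framework::ProgramDesc &main_program,
                       const std::vector<paddle::platform::Place> &places,
                       const std::unordered_set<std::string> &params,
                       const std::string &loss_var_name) {
  paddle::framework::Scope scope;
  // Step 0 (now the caller's job): run the startup program once on one device.
  paddle::framework::Executor exe(places[0]);
  exe.Run(startup_program, &scope, 0);
  // bcast_vars: the variables to broadcast to every device; here, the params.
  paddle::framework::ParallelExecutor pe(
      /*num_threads=*/8, /*use_event=*/true, places, params,
      /*bcast_vars=*/params, main_program, loss_var_name, &scope,
      /*local_scopes=*/{}, /*allow_op_delay=*/false);
  std::unordered_map<std::string, paddle::framework::LoDTensor> feeds;
  // ... fill feeds with this batch's input tensors (names are model-specific) ...
  pe.Run({loss_var_name}, /*fetched_var_name=*/"fetched_var", feeds);
}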
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <mutex> // for call_once
#include <set> #include <set>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
...@@ -39,6 +38,7 @@ Scope::~Scope() { ...@@ -39,6 +38,7 @@ Scope::~Scope() {
} }
Scope& Scope::NewScope() const { Scope& Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
kids_.push_back(new Scope(this)); kids_.push_back(new Scope(this));
return *kids_.back(); return *kids_.back();
} }
...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const { ...@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
} }
void Scope::DeleteScope(Scope* scope) { void Scope::DeleteScope(Scope* scope) {
std::unique_lock<std::mutex> lock(mutex_);
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
} }
} }
void Scope::EraseVars(std::vector<std::string>& var_names) { void Scope::EraseVars(const std::vector<std::string>& var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end()); std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) { for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) { if (var_set.find(it->first) != var_set.end()) {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <list> #include <list>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -51,7 +52,7 @@ class Scope { ...@@ -51,7 +52,7 @@ class Scope {
/// Create a variable with a scope-unique name. /// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr); Variable* Var(std::string* name = nullptr);
void EraseVars(std::vector<std::string>& var_names); void EraseVars(const std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns /// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if it cannot be found. /// nullptr if it cannot be found.
...@@ -88,6 +89,9 @@ class Scope { ...@@ -88,6 +89,9 @@ class Scope {
Scope const* parent_{nullptr}; Scope const* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope); DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable std::mutex mutex_;
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
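The mutex added here makes child-scope creation and deletion safe when several threads share one parent scope, as the per-device local scopes above now do. A minimal sketch of the usage this protects (assumes <thread> and <vector>; uses only the Scope members shown in this diff):

void ConcurrentScopeSketch(paddle::framework::Scope *global) {
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([global] {
      // NewScope()/DeleteScope() now lock mutex_, so concurrent calls are safe.
      paddle::framework::Scope &local = global->NewScope();
      local.Var();  // create a variable with a scope-unique name
      global->DeleteScope(&local);
    });
  }
  for (auto &w : workers) w.join();
}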
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,6 +16,7 @@ limitations under the License. */ ...@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) { const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version { // the 1st field, uint32_t version
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -47,6 +50,15 @@ class SelectedRows { ...@@ -47,6 +50,15 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; } void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/**
* Get the index of id in rows.
*/
int64_t index(int64_t id) const {
auto it = std::find(rows_.begin(), rows_.end(), id);
PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const { DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims()); std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_; dims[0] = height_;
......
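A small worked example for the new index() accessor: if a SelectedRows holds rows {7, 3, 11}, then index(7) == 0, index(3) == 1, index(11) == 2, and index(5) fails the PADDLE_ENFORCE. A minimal sketch of its intended use:

int64_t RowOffsetSketch(const paddle::framework::SelectedRows &table, int64_t id) {
  // Map a sparse row id onto its slot in the dense value() tensor.
  return table.index(id);
}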
...@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>( holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type)); boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
} }
#else #else
if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>( holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type)); boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
} }
#endif #endif
offset_ = 0; offset_ = 0;
...@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
inline void* Tensor::mutable_data(platform::Place place) { inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing"); "Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type()); return mutable_data(place, holder_->type());
} }
......
...@@ -35,24 +35,25 @@ class Tuple { ...@@ -35,24 +35,25 @@ class Tuple {
public: public:
using ElementVars = std::vector<ElementVar>; using ElementVars = std::vector<ElementVar>;
Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc) Tuple(const std::vector<ElementVar>& var,
const std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {} : var_(var), var_desc_(var_desc) {}
Tuple(std::vector<ElementVar>& var) : var_(var) {} explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
ElementVar get(int idx) const { return var_[idx]; }; ElementVar get(int idx) const { return var_[idx]; }
ElementVar& get(int idx) { return var_[idx]; }; ElementVar& get(int idx) { return var_[idx]; }
bool isSameType(Tuple& t) const; bool isSameType(const Tuple& t) const;
size_t getSize() const { return var_.size(); }; size_t getSize() const { return var_.size(); }
private: private:
ElementVars var_; ElementVars var_;
std::vector<VarDesc> var_desc_; std::vector<VarDesc> var_desc_;
}; };
bool Tuple::isSameType(Tuple& t) const { bool Tuple::isSameType(const Tuple& t) const {
size_t tuple_size = getSize(); size_t tuple_size = getSize();
if (tuple_size != t.getSize()) { if (tuple_size != t.getSize()) {
return false; return false;
......
set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init) set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS io.cc SRCS io.cc
......
...@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) { ...@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false; return false;
} }
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename) { const std::string& param_filename) {
...@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor, ...@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
} }
std::unique_ptr<framework::ProgramDesc> Load( std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor, framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope, const std::string& prog_filename, const std::string& param_filename) {
const std::string& prog_filename,
const std::string& param_filename) {
std::string model_filename = prog_filename; std::string model_filename = prog_filename;
std::string program_desc_str; std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str); ReadBinaryFile(model_filename, program_desc_str);
......
...@@ -24,8 +24,7 @@ limitations under the License. */ ...@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename); const std::string& param_filename);
......
...@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME) ...@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS) set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "") set(arg_list "")
if(inference_test_ARGS) if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS}) foreach(arg ${inference_test_ARGS})
......
...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) { ...@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
// The second dim of the input tensor should be 13 // The second dim of the input tensor should be 13
// The input data should be >= 0 // The input data should be >= 0
int64_t batch_size = 10; int64_t batch_size = 10;
SetupTensor<float>( SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10)); static_cast<float>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -35,10 +35,8 @@ TEST(inference, image_classification) { ...@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normalized image pixels as input data, // Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
{FLAGS_batch_size, 3, 32, 32}, static_cast<float>(0), static_cast<float>(1));
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -48,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -48,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -59,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -59,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) { ...@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
int64_t predicate_dict_len = 3162; int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2; int64_t mark_dict_len = 2;
SetupLoDTensor(word, SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(predicate, SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(predicate_dict_len - 1)); static_cast<int64_t>(predicate_dict_len - 1));
SetupLoDTensor(ctx_n2, SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_n1, SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_0, SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p1, SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p2, SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(mark, SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(mark_dict_len - 1)); static_cast<int64_t>(mark_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) { ...@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normalized image pixels as input data, // Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0]. // which should be in the range [-1.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
{FLAGS_batch_size, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) { ...@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) { ...@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -36,25 +36,25 @@ TEST(inference, recommender_system) { ...@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
// Use the first data from paddle.dataset.movielens.test() as input // Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1}; std::vector<int64_t> user_id_data = {1};
SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data); SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1}; std::vector<int64_t> gender_id_data = {1};
SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data); SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0}; std::vector<int64_t> age_id_data = {0};
SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data); SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10}; std::vector<int64_t> job_id_data = {10};
SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data); SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783}; std::vector<int64_t> movie_id_data = {783};
SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data); SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9}; std::vector<int64_t> category_id_data = {10, 8, 9};
SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data); SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988}; std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data); SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id); cpu_feeds.push_back(&user_id);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) { ...@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
paddle::framework::LoDTensor word_data, trg_word; paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}}; paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor( SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1)); static_cast<int64_t>(1));
SetupLoDTensor( SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1)); static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data); cpu_feeds.push_back(&word_data);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) { ...@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoD lod{{0, 4, 10}}; paddle::framework::LoD lod{{0, 4, 10}};
int64_t word_dict_len = 5147; int64_t word_dict_len = 5147;
SetupLoDTensor(words, SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -33,10 +33,10 @@ TEST(inference, word2vec) { ...@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
paddle::framework::LoD lod{{0, 1}}; paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word); cpu_feeds.push_back(&first_word);
......
...@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <map>
#include <random>
#include <string>
#include <vector>
#include <time.h>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input, void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, paddle::framework::DDim dims, T lower, T upper) {
T lower, std::mt19937 rng(100); // An arbitrarily chosen but fixed seed.
T upper) { std::uniform_real_distribution<double> uniform_dist(0, 1);
srand(time(0));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) { for (int i = 0; i < input->numel(); ++i) {
input_ptr[i] = input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
(static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
lower;
} }
} }
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input, void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, paddle::framework::DDim dims, const std::vector<T>& data) {
std::vector<T>& data) {
CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size())); CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
} }
template <typename T> template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input, void SetupLoDTensor(paddle::framework::LoDTensor* input,
paddle::framework::LoD& lod, const paddle::framework::LoD& lod, T lower, T upper) {
T lower, input->set_lod(lod);
T upper) {
input.set_lod(lod);
int dim = lod[0][lod[0].size() - 1]; int dim = lod[0][lod[0].size() - 1];
SetupTensor<T>(input, {dim, 1}, lower, upper); SetupTensor<T>(input, {dim, 1}, lower, upper);
} }
template <typename T> template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input, void SetupLoDTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, paddle::framework::DDim dims,
paddle::framework::LoD lod, const paddle::framework::LoD lod,
std::vector<T>& data) { const std::vector<T>& data) {
const size_t level = lod.size() - 1; const size_t level = lod.size() - 1;
CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back())); CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
input.set_lod(lod); input->set_lod(lod);
SetupTensor<T>(input, dims, data); SetupTensor<T>(input, dims, data);
} }
template <typename T> template <typename T>
void CheckError(paddle::framework::LoDTensor& output1, void CheckError(const paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) { const paddle::framework::LoDTensor& output2) {
// Check lod information // Check lod information
EXPECT_EQ(output1.lod(), output2.lod()); EXPECT_EQ(output1.lod(), output2.lod());
...@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1, ...@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1,
template <typename Place> template <typename Place>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1, const int repeat = 1, const bool is_combined = false) {
const bool is_combined = false) {
// 1. Define place, executor, scope // 1. Define place, executor, scope
auto place = Place(); auto place = Place();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
...@@ -132,10 +131,8 @@ void TestInference(const std::string& dirname, ...@@ -132,10 +131,8 @@ void TestInference(const std::string& dirname,
// `fluid.io.save_inference_model`. // `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__"; std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__"; std::string param_filename = "__params_combined__";
inference_program = inference_program = paddle::inference::Load(
paddle::inference::Load(executor, executor, *scope, dirname + "/" + prog_filename,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename); dirname + "/" + param_filename);
} else { } else {
// Parameters are saved in separate files located in the specified // Parameters are saved in separate files located in the specified
......
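The srand(time(0)) / rand() fill in the old SetupTensor is replaced by a std::mt19937 with a fixed seed, so the generated test inputs are identical on every run. The same pattern in isolation (plain C++11, no Paddle types):

#include <random>
#include <vector>

std::vector<float> MakeUniformData(size_t n, float lower, float upper) {
  std::mt19937 rng(100);  // fixed seed -> reproducible data across runs
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  std::vector<float> data(n);
  for (size_t i = 0; i < n; ++i) {
    data[i] = static_cast<float>(uniform_dist(rng) * (upper - lower) + lower);
  }
  return data;
}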
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(memory
DEPS DEPS
memory malloc
memcpy memcpy)
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
#if (WITH_GPU) #if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif() #endif()
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
if(${WITH_GPU}) if(${WITH_GPU})
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
else(${WITH_GPU}) else(${WITH_GPU})
...@@ -6,10 +8,4 @@ endif(${WITH_GPU}) ...@@ -6,10 +8,4 @@ endif(${WITH_GPU})
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
cc_library(meta_data SRCS meta_data.cc) cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
cc_library(meta_cache SRCS meta_cache.cc)
cc_library(memory_block SRCS memory_block.cc)
cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) { ...@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) {
void* BuddyAllocator::Alloc(size_t unaligned_size) { void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment // adjust allocation alignment
size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);
// acquire the allocator lock // acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);
...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) {
return; return;
} }
block->mark_as_free(cache_); block->mark_as_free(&cache_);
total_used_ -= block->total_size(cache_); total_used_ -= block->total_size(cache_);
total_free_ += block->total_size(cache_); total_free_ += block->total_size(cache_);
...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) {
right_buddy)); right_buddy));
// merge its right buddy into the block // merge its right buddy into the block
block->merge(cache_, right_buddy); block->merge(&cache_, right_buddy);
} }
} }
...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) {
left_buddy->total_size(cache_), left_buddy)); left_buddy->total_size(cache_), left_buddy));
// merge the block into its left buddy // merge the block into its left buddy
left_buddy->merge(cache_, block); left_buddy->merge(&cache_, block);
block = left_buddy; block = left_buddy;
} }
} }
...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; } ...@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; }
void* BuddyAllocator::SystemAlloc(size_t size) { void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, size); void* p = system_allocator_->Alloc(&index, size);
VLOG(10) << "Allocated " << p << " from system allocator."; VLOG(10) << "Allocated " << p << " from system allocator.";
if (p == nullptr) return nullptr; if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr); size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data(); return static_cast<MemoryBlock*>(p)->data();
...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
// Allocate a new maximum sized block // Allocate a new maximum sized block
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(index, max_chunk_size_); void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
VLOG(10) << "Creating and inserting new block " << p VLOG(10) << "Creating and inserting new block " << p
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr); max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation // gpu fallback allocation
...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, ...@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
<< ") into"; << ") into";
block->split(cache_, size); block->split(&cache_, size);
VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
<< ")"; << ")";
block->set_type(cache_, MemoryBlock::ARENA_CHUNK); block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
// the rest of the memory, if it exists // the rest of the memory, if it exists
if (block->has_right_buddy(cache_)) { if (block->has_right_buddy(cache_)) {
......
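Note: the Alloc hunk above rounds every request up to min_chunk_size_ after reserving room for the MemoryBlock::Desc header. The standalone sketch below (not part of the diff; kDescSize and kMinChunkSize are illustrative stand-ins for sizeof(MemoryBlock::Desc) and min_chunk_size_) shows the same rounding rule in isolation.

#include <cstddef>
#include <iostream>

// Stand-in constants, chosen only for demonstration.
constexpr std::size_t kDescSize = 64;       // pretend sizeof(MemoryBlock::Desc)
constexpr std::size_t kMinChunkSize = 256;  // pretend min_chunk_size_

// Same rounding rule as the align() helper referenced in the hunk header above.
std::size_t Align(std::size_t size, std::size_t alignment) {
  std::size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  std::size_t unaligned_size = 1000;
  // Mirrors: align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_)
  std::size_t size = Align(unaligned_size + kDescSize, kMinChunkSize);
  std::cout << size << std::endl;  // prints 1280
  return 0;
}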
...@@ -14,18 +14,18 @@ limitations under the License. */ ...@@ -14,18 +14,18 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/memory/detail/meta_cache.h" #include <mutex> // NOLINT
#include "paddle/fluid/memory/detail/meta_data.h" #include <set>
#include <tuple>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include <mutex>
#include <set>
#include <unordered_map>
#include <vector>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
......
...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and ...@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy) { void* left_buddy, void* right_buddy) {
cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, cache->save(
this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
static_cast<MemoryBlock*>(left_buddy), static_cast<MemoryBlock*>(left_buddy),
static_cast<MemoryBlock*>(right_buddy))); static_cast<MemoryBlock*>(right_buddy)));
} }
MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
return cache.load(this).type; return cache.load(this).type;
} }
size_t MemoryBlock::size(MetadataCache& cache) const { size_t MemoryBlock::size(const MetadataCache& cache) const {
return cache.load(this).size; return cache.load(this).size;
} }
size_t MemoryBlock::total_size(MetadataCache& cache) const { size_t MemoryBlock::index(const MetadataCache& cache) const {
return cache.load(this).index;
}
size_t MemoryBlock::total_size(const MetadataCache& cache) const {
return cache.load(this).total_size; return cache.load(this).total_size;
} }
MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
return cache.load(this).left_buddy; return cache.load(this).left_buddy;
} }
MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
return cache.load(this).right_buddy; return cache.load(this).right_buddy;
} }
void MemoryBlock::split(MetadataCache& cache, size_t size) { void MemoryBlock::split(MetadataCache* cache, size_t size) {
// make sure the split fits // make sure the split fits
PADDLE_ASSERT(total_size(cache) >= size); PADDLE_ASSERT(total_size(*cache) >= size);
// bail out if there is no room for another partition // bail out if there is no room for another partition
if (total_size(cache) - size <= sizeof(Metadata)) { if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
return; return;
} }
// find the position of the split // find the position of the split
void* right_partition = reinterpret_cast<uint8_t*>(this) + size; void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
size_t remaining_size = total_size(cache) - size; size_t remaining_size = total_size(*cache) - size;
// Add the new block as a buddy // Add the new block as a buddy
auto metadata = cache.load(this); auto metadata = cache->load(this);
// Write the metadata for the new block // Write the metadata for the new block
auto new_block_right_buddy = metadata.right_buddy; auto new_block_right_buddy = metadata.right_buddy;
cache.store( cache->save(static_cast<MemoryBlock*>(right_partition),
static_cast<MemoryBlock*>(right_partition), MemoryBlock::Desc(FREE_CHUNK, index(*cache),
Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), remaining_size - sizeof(MemoryBlock::Desc),
remaining_size, this, new_block_right_buddy)); remaining_size, this, new_block_right_buddy));
metadata.right_buddy = static_cast<MemoryBlock*>(right_partition); metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
metadata.size = size - sizeof(Metadata); metadata.size = size - sizeof(MemoryBlock::Desc);
metadata.total_size = size; metadata.total_size = size;
cache.store(this, metadata); cache->save(this, metadata);
// Write metadata for the new block's right buddy // Write metadata for the new block's right buddy
if (new_block_right_buddy != nullptr) { if (new_block_right_buddy != nullptr) {
auto buddy_metadata = cache.load(new_block_right_buddy); auto buddy_metadata = cache->load(new_block_right_buddy);
buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition); buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
cache.store(new_block_right_buddy, buddy_metadata); cache->save(new_block_right_buddy, buddy_metadata);
} }
} }
void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
// only free blocks can be merged // only free blocks can be merged
PADDLE_ASSERT(type(cache) == FREE_CHUNK); PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
auto metadata = cache.load(this); auto metadata = cache->load(this);
// link this->buddy's buddy // link this->buddy's buddy
metadata.right_buddy = right_buddy->right_buddy(cache); metadata.right_buddy = right_buddy->right_buddy(*cache);
// link buddy's buddy -> this // link buddy's buddy -> this
if (metadata.right_buddy != nullptr) { if (metadata.right_buddy != nullptr) {
auto buddy_metadata = cache.load(metadata.right_buddy); auto buddy_metadata = cache->load(metadata.right_buddy);
buddy_metadata.left_buddy = this; buddy_metadata.left_buddy = this;
cache.store(metadata.right_buddy, buddy_metadata); cache->save(metadata.right_buddy, buddy_metadata);
} }
metadata.size += right_buddy->total_size(cache); metadata.size += right_buddy->total_size(*cache);
metadata.total_size += right_buddy->total_size(cache); metadata.total_size += right_buddy->total_size(*cache);
cache.store(this, metadata); cache->save(this, metadata);
cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); cache->save(right_buddy,
MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
} }
void MemoryBlock::mark_as_free(MetadataCache& cache) { void MemoryBlock::mark_as_free(MetadataCache* cache) {
// check for double free or corruption // check for double free or corruption
PADDLE_ASSERT(type(cache) != FREE_CHUNK); PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
PADDLE_ASSERT(type(cache) != INVALID_CHUNK); PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
set_type(cache, FREE_CHUNK); set_type(cache, FREE_CHUNK);
} }
void MemoryBlock::set_type(MetadataCache& cache, Type t) { void MemoryBlock::set_type(MetadataCache* cache, Type t) {
auto metadata = cache.load(this); auto metadata = cache->load(this);
metadata.type = t; metadata.type = t;
cache->save(this, metadata);
cache.store(this, metadata);
}
bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
return left_buddy(cache) != nullptr;
}
bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
return right_buddy(cache) != nullptr;
}
size_t MemoryBlock::index(MetadataCache& cache) const {
return cache.load(this).index;
} }
void* MemoryBlock::data() const { void* MemoryBlock::data() const {
return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1; return const_cast<MemoryBlock::Desc*>(
reinterpret_cast<const MemoryBlock::Desc*>(this)) +
1;
} }
MemoryBlock* MemoryBlock::metadata() const { MemoryBlock* MemoryBlock::metadata() const {
return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>( return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
reinterpret_cast<const Metadata*>(this) - 1)); reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
} }
} // namespace detail } // namespace detail
......
...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cstddef> #include <cstdint>
#include <unordered_map>
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
// Forward Declarations // Forward declaration.
class MetadataCache; class MetadataCache;
/*! \brief A class used to interpret the contents of a memory block */ // MemoryBlock represents each allocated memory block, which contains
class MemoryBlock { // MemoryBlock::Desc and the payload.
public: struct MemoryBlock {
enum Type { enum Type {
FREE_CHUNK, // memory is free and idle FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied ARENA_CHUNK, // memory is being occupied
...@@ -33,57 +33,96 @@ class MemoryBlock { ...@@ -33,57 +33,96 @@ class MemoryBlock {
INVALID_CHUNK // memory is invalid INVALID_CHUNK // memory is invalid
}; };
public: // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
void init(MetadataCache& cache, Type t, size_t index, size_t size, // If it is a CPU memory block, the MetadataCache writes the
// MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
// block, the MetadataCache writes the MemoryBlock::Desc to a std::map in
// the CPU.
void init(MetadataCache* cache, Type t, size_t index, size_t size,
void* left_buddy, void* right_buddy); void* left_buddy, void* right_buddy);
public: // All these accessors returns fields in the MemoryBlock::Desc of the memory
/*! \brief The type of the allocation */ // block. They all need a MetadataCache instance as their first
Type type(MetadataCache& cache) const; // parameter because they read the MemoryBlock::Desc from the cache.
Type type(const MetadataCache& cache) const;
/*! \brief The size of the data region */ size_t size(const MetadataCache& cache) const;
size_t size(MetadataCache& cache) const; size_t index(const MetadataCache& cache) const;
size_t total_size(const MetadataCache& cache) const;
bool has_left_buddy(const MetadataCache& cache) const;
bool has_right_buddy(const MetadataCache& cache) const;
MemoryBlock* left_buddy(const MetadataCache& cache) const;
MemoryBlock* right_buddy(const MetadataCache& cache) const;
/*! \brief An index to track the allocator */ // Split the allocation into left/right blocks.
size_t index(MetadataCache& cache) const; void split(MetadataCache* cache, size_t size);
/*! \brief The total size of the block */ // Merge left and right blocks together.
size_t total_size(MetadataCache& cache) const; void merge(MetadataCache* cache, MemoryBlock* right_buddy);
/*! \brief Check the left buddy of the block */ // Mark the allocation as free.
bool has_left_buddy(MetadataCache& cache) const; void mark_as_free(MetadataCache* cache);
/*! \brief Check the right buddy of the block */ // Change the type of the allocation.
bool has_right_buddy(MetadataCache& cache) const; void set_type(MetadataCache* cache, Type t);
/*! \brief Get the left buddy */
MemoryBlock* left_buddy(MetadataCache& cache) const;
/*! \brief Get the right buddy */
MemoryBlock* right_buddy(MetadataCache& cache) const;
public:
/*! \brief Split the allocation into left/right blocks */
void split(MetadataCache& cache, size_t size);
/*! \brief Merge left and right blocks together */
void merge(MetadataCache& cache, MemoryBlock* right_buddy);
/*! \brief Mark the allocation as free */
void mark_as_free(MetadataCache& cache);
/*! \brief Change the type of the allocation */
void set_type(MetadataCache& cache, Type t);
public:
/*! \brief Get a pointer to the memory block's data */
void* data() const; void* data() const;
/*! \brief Get a pointer to the memory block's metadata */
MemoryBlock* metadata() const; MemoryBlock* metadata() const;
// MemoryBlock::Desc describes a MemoryBlock.
struct Desc {
Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Desc();
// Updates guard_begin and guard_end by hashes of the Desc object.
void update_guards();
// Checks that guard_begin and guard_end are hashes of the Desc object.
bool check_guards() const;
// TODO(gangliao): compress this
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
};
};
// A cache for accessing memory block meta-data that may be expensive
// to access directly. This class exists to unify the
// MemoryBlock::Desc format between GPU and CPU allocations. It should
// be removed when the CPU can access all GPU allocations directly via
// UVM.
class MetadataCache {
public: public:
static size_t overhead(); explicit MetadataCache(bool uses_gpu);
// Disable copying and assignment.
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
// Returns the MemoryBlock::Desc for a memory block. When MetadataCache is
// used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
// of the memory block; when used to manage GPU memory, the
// MemoryBlock::Desc resides in CPU memory indexed by cache_.
MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
// Saves the MemoryBlock::Desc of a memory block into the cache. For CPU
// memory block, writes the MemoryBlock::Desc to the beginning of the memory
// block; whereas for GPU memory, writes it to cache_.
void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
// For GPU memory block, erases its MemoryBlock::Desc from cache_.
void invalidate(MemoryBlock* memory_block);
private:
typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
MetadataMap cache_;
bool uses_gpu_;
}; };
} // namespace detail } // namespace detail
......
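Note: data() and metadata() above work because, for CPU blocks, the MemoryBlock::Desc header sits directly in front of the payload. The self-contained sketch below uses a simplified stand-in Desc (not the Paddle type) to show the pointer arithmetic involved.

#include <cstdint>
#include <cstdlib>
#include <iostream>

struct Desc { std::uint64_t type, index, size, total_size; };  // simplified header

int main() {
  void* block = std::malloc(1024);
  // data() equivalent: the payload starts right after the header.
  void* data = reinterpret_cast<Desc*>(block) + 1;
  // metadata() equivalent: step back one header from the payload.
  void* header = reinterpret_cast<Desc*>(data) - 1;
  std::cout << (header == block) << std::endl;  // prints 1
  std::free(block);
  return 0;
}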
...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_data.h"
#include <functional> #include <functional>
#include "paddle/fluid/memory/detail/memory_block.h"
namespace paddle { namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
MemoryBlock* l, MemoryBlock* r) MemoryBlock* l, MemoryBlock* r)
: type(t), : type(t),
index(i), index(i),
...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, ...@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
left_buddy(l), left_buddy(l),
right_buddy(r) {} right_buddy(r) {}
Metadata::Metadata() MemoryBlock::Desc::Desc()
: type(MemoryBlock::INVALID_CHUNK), : type(MemoryBlock::INVALID_CHUNK),
index(0), index(0),
size(0), size(0),
...@@ -37,32 +37,36 @@ Metadata::Metadata() ...@@ -37,32 +37,36 @@ Metadata::Metadata()
left_buddy(nullptr), left_buddy(nullptr),
right_buddy(nullptr) {} right_buddy(nullptr) {}
namespace {
template <class T> template <class T>
inline void hash_combine(std::size_t& seed, const T& v) { inline void hash_combine(std::size_t* seed, const T& v) {
std::hash<T> hasher; std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
} }
inline size_t hash(const Metadata* metadata, size_t initial_seed) { inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
size_t seed = initial_seed; size_t seed = initial_seed;
hash_combine(seed, (size_t)metadata->type); hash_combine(&seed, static_cast<size_t>(metadata.type));
hash_combine(seed, metadata->index); hash_combine(&seed, metadata.index);
hash_combine(seed, metadata->size); hash_combine(&seed, metadata.size);
hash_combine(seed, metadata->total_size); hash_combine(&seed, metadata.total_size);
hash_combine(seed, metadata->left_buddy); hash_combine(&seed, metadata.left_buddy);
hash_combine(seed, metadata->right_buddy); hash_combine(&seed, metadata.right_buddy);
return seed; return seed;
} }
void Metadata::update_guards() { } // namespace
guard_begin = hash(this, 1);
guard_end = hash(this, 2); void MemoryBlock::Desc::update_guards() {
guard_begin = hash(*this, 1);
guard_end = hash(*this, 2);
} }
bool Metadata::check_guards() const { bool MemoryBlock::Desc::check_guards() const {
return guard_begin == hash(this, 1) && guard_end == hash(this, 2); return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
} }
} // namespace detail } // namespace detail
......
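Note: update_guards()/check_guards() above bracket the descriptor with two hashes of its fields, seeded with 1 and 2, so a later overwrite or double free is likely to invalidate at least one guard. The reduced stand-alone sketch below (a Desc with only two payload fields) illustrates the scheme.

#include <cstddef>
#include <functional>
#include <iostream>

template <class T>
void HashCombine(std::size_t* seed, const T& v) {
  std::hash<T> hasher;
  (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
}

struct Desc {
  std::size_t guard_begin, index, size, guard_end;

  std::size_t Hash(std::size_t seed) const {
    HashCombine(&seed, index);
    HashCombine(&seed, size);
    return seed;
  }
  void UpdateGuards() { guard_begin = Hash(1); guard_end = Hash(2); }
  bool CheckGuards() const {
    return guard_begin == Hash(1) && guard_end == Hash(2);
  }
};

int main() {
  Desc d{0, 3, 128, 0};
  d.UpdateGuards();
  std::cout << d.CheckGuards() << std::endl;  // prints 1
  d.size = 256;                               // simulate a corrupted header
  std::cout << d.CheckGuards() << std::endl;  // prints 0
  return 0;
}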
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/meta_cache.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
...@@ -23,29 +22,28 @@ namespace detail { ...@@ -23,29 +22,28 @@ namespace detail {
MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
Metadata MetadataCache::load(const MemoryBlock* block) { MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
if (uses_gpu_) { if (uses_gpu_) {
auto existing_metadata = cache_.find(block); auto existing_desc = cache_.find(block);
PADDLE_ASSERT(existing_metadata->second.check_guards()); PADDLE_ASSERT(existing_desc->second.check_guards());
return existing_metadata->second; return existing_desc->second;
} else { } else {
auto* meta = reinterpret_cast<const Metadata*>(block); auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
VLOG(10) << "Load MetaData type=" << meta->type; VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
PADDLE_ASSERT(meta->check_guards()); PADDLE_ASSERT(desc->check_guards());
return *reinterpret_cast<const Metadata*>(block); return *reinterpret_cast<const MemoryBlock::Desc*>(block);
} }
} }
void MetadataCache::store(MemoryBlock* block, void MetadataCache::save(MemoryBlock* block,
const Metadata& original_metadata) { const MemoryBlock::Desc& original_desc) {
auto metadata = original_metadata; auto desc = original_desc;
desc.update_guards();
metadata.update_guards();
if (uses_gpu_) { if (uses_gpu_) {
cache_[block] = metadata; cache_[block] = desc;
} else { } else {
*reinterpret_cast<Metadata*>(block) = metadata; *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
} }
} }
......
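Note: load()/save() above switch between two storage strategies: for CPU memory the descriptor lives in the first bytes of the block itself, while for GPU memory it is kept in a host-side map because the CPU cannot dereference device pointers directly. A minimal sketch of the same idea follows (simplified types; only the CPU path is exercised in main).

#include <cstdlib>
#include <iostream>
#include <unordered_map>

struct Desc { int type; };  // simplified descriptor

class TinyCache {
 public:
  explicit TinyCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}

  Desc Load(const void* block) const {
    if (uses_gpu_) return cache_.at(block);        // host-side map
    return *reinterpret_cast<const Desc*>(block);  // header lives in the block
  }
  void Save(void* block, const Desc& d) {
    if (uses_gpu_) {
      cache_[block] = d;
    } else {
      *reinterpret_cast<Desc*>(block) = d;
    }
  }

 private:
  std::unordered_map<const void*, Desc> cache_;
  bool uses_gpu_;
};

int main() {
  void* cpu_block = std::malloc(64);
  TinyCache cpu_cache(false);
  cpu_cache.Save(cpu_block, Desc{7});
  std::cout << cpu_cache.Load(cpu_block).type << std::endl;  // prints 7
  std::free(cpu_block);
  return 0;
}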
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include <unordered_map>
namespace paddle {
namespace memory {
namespace detail {
/**
* \brief A cache for accessing memory block meta-data that may be expensive
* to access directly.
*
* \note This class exists to unify the metadata format between GPU and CPU
* allocations. It should be removed when the CPU can access all GPU
* allocations directly via UVM.
*/
class MetadataCache {
public:
explicit MetadataCache(bool uses_gpu);
public:
/*! \brief Load the associated metadata for the specified memory block. */
Metadata load(const MemoryBlock* memory_block);
/*! \brief Store the associated metadata for the specified memory block. */
void store(MemoryBlock* memory_block, const Metadata& meta_data);
/*! \brief Indicate that the specified metadata will no longer be used. */
void invalidate(MemoryBlock* memory_block);
public:
MetadataCache(const MetadataCache&) = delete;
MetadataCache& operator=(const MetadataCache&) = delete;
private:
bool uses_gpu_;
private:
typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
private:
MetadataMap cache_;
};
} // namespace detail
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/detail/memory_block.h"
#include <stddef.h>
namespace paddle {
namespace memory {
namespace detail {
class Metadata {
public:
Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
MemoryBlock* r);
Metadata();
public:
/*! \brief Update the guards when metadata is changed */
void update_guards();
/*! \brief Check consistency to previous modification */
bool check_guards() const;
public:
// TODO(gangliao): compress this
// clang-format off
size_t guard_begin = 0;
MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
size_t index = 0;
size_t size = 0;
size_t total_size = 0;
MemoryBlock* left_buddy = nullptr;
MemoryBlock* right_buddy = nullptr;
size_t guard_end = 0;
// clang-format on
};
} // namespace detail
} // namespace memory
} // namespace paddle
...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include <stdlib.h> // for malloc and free #include <stdlib.h> // for malloc and free
#include <sys/mman.h> // for mlock and munlock #include <sys/mman.h> // for mlock and munlock
#include <algorithm> // for std::max #include <algorithm> // for std::max
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
// If use_pinned_memory is true, CPUAllocator calls mlock, which // If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange // returns pinned and locked memory as staging areas for data exchange
...@@ -35,13 +35,13 @@ namespace paddle { ...@@ -35,13 +35,13 @@ namespace paddle {
namespace memory { namespace memory {
namespace detail { namespace detail {
void* CPUAllocator::Alloc(size_t& index, size_t size) { void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/, // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned // malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr. // pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
index = 0; // unlock memory *index = 0; // unlock memory
void* p; void* p;
...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {
if (p != nullptr) { if (p != nullptr) {
if (FLAGS_use_pinned_memory) { if (FLAGS_use_pinned_memory) {
index = 1; *index = 1;
mlock(p, size); // lock memory mlock(p, size); // lock memory
} }
} }
...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; } ...@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void* GPUAllocator::Alloc(size_t& index, size_t size) { void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
// if size is 0. We just make sure it does. // if size is 0. We just make sure it does.
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { ...@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
} }
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 0; *index = 0;
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
} else { } else {
...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; } ...@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; }
// PINNED memory allows direct DMA transfers by the GPU to and from system // PINNED memory allows direct DMA transfers by the GPU to and from system
// memory. It’s locked to a physical address. // memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr; if (size <= 0) return nullptr;
// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { ...@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
cudaError_t result = cudaMallocHost(&p, size); cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) { if (result == cudaSuccess) {
index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
return p; return p;
} else { } else {
......
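Note: the hunks above change Alloc from taking size_t& to size_t*, so callers now write Alloc(&index, size) and the out-parameter is visible at the call site, matching the Google style configured in the repository's .clang-format. A hypothetical allocator (not the Paddle classes) demonstrating the convention:

#include <cstddef>
#include <cstdlib>
#include <iostream>

// Hypothetical stand-in for a system allocator's Alloc.
void* Alloc(std::size_t* index, std::size_t size) {
  if (size == 0) return nullptr;
  *index = 0;  // e.g. 0 = unlocked memory, 1 = pinned/locked memory
  return std::malloc(size);
}

int main() {
  std::size_t index = 0;
  void* p = Alloc(&index, 2048);  // the & makes the mutation explicit
  std::cout << (p != nullptr) << " " << index << std::endl;  // prints 1 0
  std::free(p);
  return 0;
}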
...@@ -29,14 +29,14 @@ namespace detail { ...@@ -29,14 +29,14 @@ namespace detail {
class SystemAllocator { class SystemAllocator {
public: public:
virtual ~SystemAllocator() {} virtual ~SystemAllocator() {}
virtual void* Alloc(size_t& index, size_t size) = 0; virtual void* Alloc(size_t* index, size_t size) = 0;
virtual void Free(void* p, size_t size, size_t index) = 0; virtual void Free(void* p, size_t size, size_t index) = 0;
virtual bool UseGpu() const = 0; virtual bool UseGpu() const = 0;
}; };
class CPUAllocator : public SystemAllocator { class CPUAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
}; };
...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator {
public: public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator { ...@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator {
class CUDAPinnedAllocator : public SystemAllocator { class CUDAPinnedAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index); virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const; virtual bool UseGpu() const;
......
...@@ -22,11 +22,11 @@ limitations under the License. */ ...@@ -22,11 +22,11 @@ limitations under the License. */
DECLARE_bool(use_pinned_memory); DECLARE_bool(use_pinned_memory);
void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
bool freed = false; bool freed = false;
{ {
size_t index; size_t index;
void* p = a.Alloc(index, size); void* p = a->Alloc(&index, size);
if (size > 0) { if (size > 0) {
EXPECT_NE(p, nullptr); EXPECT_NE(p, nullptr);
} else { } else {
...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
int* i = static_cast<int*>(p); int* i = static_cast<int*>(p);
std::shared_ptr<int> ptr(i, [&](void* p) { std::shared_ptr<int> ptr(i, [&](void* p) {
freed = true; freed = true;
a.Free(p, size, index); a->Free(p, size, index);
}); });
} }
EXPECT_TRUE(freed); EXPECT_TRUE(freed);
...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { ...@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
TEST(CPUAllocator, NoLockMem) { TEST(CPUAllocator, NoLockMem) {
FLAGS_use_pinned_memory = false; FLAGS_use_pinned_memory = false;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
TEST(CPUAllocator, LockMem) { TEST(CPUAllocator, LockMem) {
FLAGS_use_pinned_memory = true; FLAGS_use_pinned_memory = true;
paddle::memory::detail::CPUAllocator a; paddle::memory::detail::CPUAllocator a;
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0); paddle::memory::detail::GPUAllocator a(0);
TestAllocator(a, 2048); TestAllocator(&a, 2048);
TestAllocator(a, 0); TestAllocator(&a, 0);
} }
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -95,7 +95,7 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -95,7 +95,7 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
int cur_dev = platform::GetCurrentDeviceId(); int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
size_t avail, total; size_t avail, total;
platform::GpuMemoryUsage(avail, total); platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes"; << place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total; LOG(WARNING) << "total " << total;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
 * \note If the returned address is nullptr, the allocation failed because
 *       there is insufficient memory in the current system. Callers of
 *       Alloc must therefore check whether the returned address is valid
 *       before using it.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
 * \brief A deleter that frees a memory block of a POD type in one place.
 *
 * \note It is used as the custom deleter of std::unique_ptr<T> in
 *       tensor.h, so that the memory is deallocated automatically.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
 * \brief A deleter that frees a memory block whose type need not be POD.
 *
 * \note It is used as the custom deleter of std::unique_ptr<T> in
 *       tensor.h, so that the memory is deallocated automatically.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
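Note: PODDeleter and PlainDeleter above exist so that a std::unique_ptr can release memory through Free(place, ptr) automatically. The standalone sketch below substitutes plain malloc/free for the place-aware Alloc/Free, purely to show how such a deleter plugs into std::unique_ptr.

#include <cstdlib>
#include <iostream>
#include <memory>

// Stand-in deleter; the real PODDeleter would call Free(place_, ptr) instead.
struct FreeDeleter {
  void operator()(float* ptr) const { std::free(ptr); }
};

int main() {
  std::unique_ptr<float, FreeDeleter> buf(
      static_cast<float*>(std::malloc(16 * sizeof(float))), FreeDeleter());
  buf.get()[0] = 1.0f;
  std::cout << buf.get()[0] << std::endl;  // prints 1
  return 0;  // the buffer is freed automatically when buf goes out of scope
}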
...@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h" #include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include <unordered_map>
inline bool is_aligned(void const *p) { inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3); return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
} }
size_t align(size_t size, paddle::platform::CPUPlace place) { size_t align(size_t size, paddle::platform::CPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CpuMinChunkSize(); size_t alignment = paddle::platform::CpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::CUDAPlace place) { size_t align(size_t size, paddle::platform::CUDAPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::GpuMinChunkSize(); size_t alignment = paddle::platform::GpuMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { ...@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
} }
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) { size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize(); size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment; size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining); return remaining == 0 ? size : size + (alignment - remaining);
......
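Note: the tests above assert is_aligned() on every returned pointer; a pointer is 4-byte aligned exactly when its two lowest address bits are zero. A tiny self-contained check of that predicate:

#include <cstdint>
#include <iostream>

inline bool IsAligned4(const void* p) {
  return (reinterpret_cast<std::uintptr_t>(p) & 0x3) == 0;
}

int main() {
  alignas(4) char buf[8];
  std::cout << IsAligned4(buf) << " " << IsAligned4(buf + 1) << std::endl;  // prints 1 0
  return 0;
}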
...@@ -14,91 +14,5 @@ limitations under the License. */ ...@@ -14,91 +14,5 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace memory {
/**
* \brief Allocate memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] size Allocation size.
*
* \return Allocated memory block address.
*
* \note If return nullptr, it indicates memory allocation failed
* because insufficient memory in current system. When Alloc
* function is invoked, you must check the returned memory
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
*
* \param[in] place Allocation place (CPU or GPU).
* \param[in] ptr Memory block address to free.
*
*/
template <typename Place>
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
*
* \param[in] place Allocation place (CPU or GPU).
*
*/
template <typename Place>
size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
/**
* \brief Free memory block in one place.
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
/**
* \brief Free memory block in one place does not meet POD
*
* \note In some cases, custom deleter is used to
* deallocate the memory automatically for
* std::unique_ptr<T> in tensor.h.
*
*/
template <typename T, typename Place>
class PlainDeleter {
public:
explicit PlainDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
...@@ -15,7 +15,6 @@ limitations under the License. */ ...@@ -15,7 +15,6 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") ...@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS) list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "") set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET) function(op_library TARGET)
# op_library is a function to create op library. The interface is same as # op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library # cc_library. But it handle split GPU/CPU code and link some common library
...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) ...@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
limitations under the License. */ limitations under the License. */
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace()); const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim // get memory dim
PADDLE_ENFORCE(src->dims().size() == 4, PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW"); "Input dim must be with 2 or 4");
std::vector<int> src_tz = framework::vectorize2int(src->dims()); std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description // create memory description
// TODO(kbinias-intel): support more formats auto data_md = src_tz.size() == 2
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); auto src_memory =
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(src_data)));
auto dst_memory =
mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(dst_data)));
auto forward_desc = mkldnn::eltwise_forward::desc( auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, ...@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
std::vector<int> src_tz = framework::vectorize2int(x->dims()); std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description // create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, auto data_md = src_tz.size() == 2
? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nc)
: platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw); mkldnn::memory::format::nchw);
// create memory primitives // create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); auto src_memory = mkldnn::memory(
{data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
auto diff_src_memory = auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_src)));
auto diff_dst_memory = auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); mkldnn::memory({data_md, mkldnn_engine},
static_cast<void *>(const_cast<float *>(diff_dst)));
auto backward_desc = auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
......
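Note: the hunks above pick the MKL-DNN memory format from the tensor rank (nc for 2-D, nchw for 4-D) and replace the C-style (void *) casts with explicit const_cast/static_cast. The sketch below mimics that selection with a hypothetical Format enum and does not use the mkldnn API.

#include <iostream>
#include <vector>

enum class Format { nc, nchw };  // hypothetical stand-in for the mkldnn format tags

Format PickFormat(const std::vector<int>& dims) {
  return dims.size() == 2 ? Format::nc : Format::nchw;
}

int main() {
  const float src_data[4] = {1, 2, 3, 4};
  // Explicit casts in place of "(void *)src_data":
  void* p = static_cast<void*>(const_cast<float*>(src_data));
  std::cout << (PickFormat({8, 16}) == Format::nc) << " "
            << (PickFormat({8, 3, 32, 32}) == Format::nchw) << " "
            << (p != nullptr) << std::endl;  // prints 1 1 1
  return 0;
}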
...@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the Volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
// get workspace size able to allocate // get workspace size able to allocate
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
// It is possible for float16 on Volta GPU to allocate more memory than
// the limit because the algo is overridden to use tensor cores.
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory // Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
......
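Note: the hunk above requests CUDNN_TENSOR_OP_MATH only when the device's compute capability is at least 7.0 (Volta) and the element type is float16, and falls back to CUDNN_DEFAULT_MATH otherwise. The gating condition itself needs no CUDA to run; in the sketch below float16_stub is a stand-in for platform::float16.

#include <iostream>
#include <typeindex>
#include <typeinfo>

struct float16_stub { unsigned short x; };  // stand-in for platform::float16

template <typename T>
bool UseTensorCore(int compute_capability) {
  return compute_capability >= 70 &&
         std::type_index(typeid(T)) == std::type_index(typeid(float16_stub));
}

int main() {
  std::cout << UseTensorCore<float16_stub>(70) << std::endl;  // prints 1
  std::cout << UseTensorCore<float>(70) << std::endl;         // prints 0
  std::cout << UseTensorCore<float16_stub>(60) << std::endl;  // prints 0
  return 0;
}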
...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE) ...@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc) cares zlib protobuf sendrecvop_grpc)
cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
endif() endif()
...@@ -138,7 +138,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, ...@@ -138,7 +138,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
auto* var = p_scope->FindVar(in_var_name_val); auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req; ::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req); SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
// var handle // var handle
VarHandle var_h; VarHandle var_h;
......
...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase { ...@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase {
framework::Scope* scope, framework::Scope* scope,
const platform::DeviceContext* dev_ctx, const platform::DeviceContext* dev_ctx,
framework::Executor* executor, framework::Executor* executor,
framework::ProgramDesc* program, int blkid) framework::ProgramDesc* program,
framework::ExecutorPrepareContext* prefetch_ctx)
: RequestBase(service, cq, dev_ctx), : RequestBase(service, cq, dev_ctx),
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
executor_(executor), executor_(executor),
program_(program), program_(program),
blkid_(blkid) { prefetch_ctx_(prefetch_ctx) {
request_.reset(new VariableResponse(scope, dev_ctx_));
int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable); int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
cq_, this); cq_, cq_, this);
} }
virtual ~RequestPrefetch() {} virtual ~RequestPrefetch() {}
virtual std::string GetReqName() { return request_.varname(); } virtual std::string GetReqName() { return request_->Varname(); }
virtual void Process() { virtual void Process() {
// prefetch process... // prefetch process...
::grpc::ByteBuffer reply; ::grpc::ByteBuffer reply;
// TODO(Yancey1989): execute the Block which contains prefetch ops
VLOG(3) << "RequestPrefetch Process in"; std::string var_name = request_->OutVarname();
auto var_desc = program_->Block(0).FindVar(var_name);
framework::Scope* local_scope = &scope_->NewScope();
auto* var = local_scope->FindVar(var_name);
InitializeVariable(var, var_desc->GetType());
executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
responder_.Finish(reply, ::grpc::Status::OK, this); responder_.Finish(reply, ::grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
} }
protected: protected:
sendrecv::VariableMessage request_; std::shared_ptr<VariableResponse> request_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* scope_; framework::Scope* scope_;
framework::Executor* executor_; framework::Executor* executor_;
framework::ProgramDesc* program_; framework::ProgramDesc* program_;
framework::ExecutorPrepareContext* prefetch_ctx_;
int blkid_; int blkid_;
}; };
...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { ...@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
} }
RequestPrefetch* prefetch = RequestPrefetch* prefetch =
new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
executor_, program_, prefetch_blk_id_); executor_, program_, prefetch_ctx_);
VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
} }
......
...@@ -63,6 +63,10 @@ class AsyncGRPCServer final { ...@@ -63,6 +63,10 @@ class AsyncGRPCServer final {
void SetExecutor(framework::Executor *executor) { executor_ = executor; } void SetExecutor(framework::Executor *executor) { executor_ = executor; }
void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
prefetch_ctx_ = prepared;
}
int GetSelectedPort() { return selected_port_; } int GetSelectedPort() { return selected_port_; }
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
...@@ -111,6 +115,7 @@ class AsyncGRPCServer final { ...@@ -111,6 +115,7 @@ class AsyncGRPCServer final {
std::unique_ptr<std::thread> t_prefetch_; std::unique_ptr<std::thread> t_prefetch_;
int prefetch_blk_id_; int prefetch_blk_id_;
framework::ExecutorPrepareContext *prefetch_ctx_;
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_; int selected_port_;
......
...@@ -20,43 +20,121 @@ limitations under the License. */ ...@@ -20,43 +20,121 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/operators/detail/grpc_server.h" #include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace framework = paddle::framework; namespace framework = paddle::framework;
namespace platform = paddle::platform; namespace platform = paddle::platform;
namespace detail = paddle::operators::detail; namespace detail = paddle::operators::detail;
USE_OP(lookup_table);
std::unique_ptr<detail::AsyncGRPCServer> rpc_service_; std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
framework::BlockDesc* AppendPrefetchBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
framework::VariableNameMap output({{"Output", {"out"}}});
auto op = block->AppendOp();
op->SetType("lookup_table");
op->SetInput("W", {"w"});
op->SetInput("Ids", {"ids"});
op->SetOutput("Out", {"out"});
auto& out = *root_block->Var("out");
out.SetType(framework::proto::VarType::SELECTED_ROWS);
out.SetShape({10, 10});
return block;
}
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto w_var = scope->Var("w");
w_var->GetMutable<framework::SelectedRows>();
auto out_var = scope->Var("out");
out_var->GetMutable<framework::SelectedRows>();
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::SelectedRows>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
auto rows = ids_var->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
ids_var->mutable_value()->Resize({rows_numel, 1});
ids_var->mutable_value()->mutable_data<float>(*place);
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
auto ptr = w_value->mutable_data<float>(*place);
for (int64_t i = 0; i < w_value->numel(); ++i) {
ptr[i] = static_cast<float>(i / 10);
}
}
void StartServer(const std::string& endpoint) { void StartServer(const std::string& endpoint) {
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
auto* block = AppendPrefetchBlock(&program);
auto prepared = exe.Prepare(program, block->ID());
InitTensorsOnServer(&scope, &place, 10);
rpc_service_->SetProgram(&program);
rpc_service_->SetPrefetchPreparedCtx(prepared.get());
rpc_service_->SetDevCtx(&ctx);
rpc_service_->SetScope(&scope);
rpc_service_->SetExecutor(&exe);
rpc_service_->RunSyncUpdate(); rpc_service_->RunSyncUpdate();
} }
TEST(PREFETCH, CPU) { TEST(PREFETCH, CPU) {
// start up a server instance backend // start up a server instance backend
// TODO(Yancey1989): Need to start a server with optimize blocks and
// prefetch blocks.
std::thread server_thread(StartServer, "127.0.0.1:8889"); std::thread server_thread(StartServer, "127.0.0.1:8889");
sleep(2);
framework::Scope scope; framework::Scope scope;
platform::CPUPlace place; platform::CPUPlace place;
platform::CPUDeviceContext ctx(place); platform::CPUDeviceContext ctx(place);
// create var on local scope // create var on local scope
std::string in_var_name("in"); int64_t rows_numel = 5;
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("ids");
std::string out_var_name("out"); std::string out_var_name("out");
auto* in_var = scope.Var(in_var_name);
auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
in_tensor->Resize({10, 10});
VLOG(3) << "before mutable_data";
in_tensor->mutable_data<int>(place);
scope.Var(out_var_name);
VLOG(3) << "before fetch";
detail::RPCClient client; detail::RPCClient client;
client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
out_var_name); out_var_name);
client.Wait(); client.Wait();
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::SelectedRows>()->value();
auto ptr = value.mutable_data<float>(place);
rpc_service_->ShutDown(); rpc_service_->ShutDown();
server_thread.join(); server_thread.join();
rpc_service_.reset(nullptr); rpc_service_.reset(nullptr);
for (int64_t i = 0; i < rows_numel; ++i) {
EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
}
} }
...@@ -21,7 +21,7 @@ service SendRecvService { ...@@ -21,7 +21,7 @@ service SendRecvService {
rpc SendVariable(VariableMessage) returns (VoidMessage) {} rpc SendVariable(VariableMessage) returns (VoidMessage) {}
// Argument VariableMessage for GetVariable should only contain varname. // Argument VariableMessage for GetVariable should only contain varname.
rpc GetVariable(VariableMessage) returns (VariableMessage) {} rpc GetVariable(VariableMessage) returns (VariableMessage) {}
// Prefetch variable by Ids // pre-fetch variable by given variable name and Ids
rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
} }
...@@ -67,6 +67,8 @@ message VariableMessage { ...@@ -67,6 +67,8 @@ message VariableMessage {
bytes serialized = 8; bytes serialized = 8;
// selected_rows data // selected_rows data
bytes rows = 9; bytes rows = 9;
// Look up table block execution output variable name.
string out_varname = 10;
} }
message VoidMessage {} message VoidMessage {}
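With the new out_varname field (field 10), a PrefetchVariable request carries both the name of the ids variable being sent and the name of the variable the server should write the lookup result into. A minimal sketch of building such a request against the generated message (the helper name and include path are assumptions, not part of this proto):

// Sketch only: assumes the generated header for the proto above.
// #include "send_recv.pb.h"
sendrecv::VariableMessage MakePrefetchRequest() {  // hypothetical helper
  sendrecv::VariableMessage req;
  req.set_varname("ids");       // variable holding the ids to look up
  req.set_out_varname("out");   // variable the server fills with fetched rows
  // req.set_serialized(...);   // would carry the serialized ids tensor
  return req;
}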
...@@ -30,11 +30,9 @@ namespace detail { ...@@ -30,11 +30,9 @@ namespace detail {
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) { ::grpc::ByteBuffer* msg,
const std::string& out_name) {
using VarMsg = sendrecv::VariableMessage; using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer // When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies // when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent // TODO(typhoonzero): add unref here, if we have dependent
...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
e.WriteUint64(VarMsg::kTypeFieldNumber, 1); e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
} }
if (!out_name.empty()) {
e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
}
switch (framework::ToVarType(var->Type())) { switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: { case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>(); auto tensor = var->Get<framework::LoDTensor>();
......
...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*); ...@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg); ::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string());
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
......
...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) { ...@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) {
} }
break; break;
} }
case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_out_varname(temp);
break;
}
default: { default: {
// Unknown tag, return unknown error. // Unknown tag, return unknown error.
......
...@@ -55,6 +55,7 @@ class VariableResponse { ...@@ -55,6 +55,7 @@ class VariableResponse {
int Parse(const ::grpc::ByteBuffer& byte_buffer); int Parse(const ::grpc::ByteBuffer& byte_buffer);
inline std::string Varname() { return meta_.varname(); } inline std::string Varname() { return meta_.varname(); }
inline std::string OutVarname() { return meta_.out_varname(); }
// should call parse first. // should call parse first.
framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
......
...@@ -27,8 +27,8 @@ template <typename T> ...@@ -27,8 +27,8 @@ template <typename T>
class MKLDNNMD { class MKLDNNMD {
public: public:
explicit MKLDNNMD(const T* in, const T* w, bool bias) explicit MKLDNNMD(const T* in, const T* w, bool bias)
: in{paddle::framework::vectorize2int(in->dims())}, : in(paddle::framework::vectorize2int(in->dims())),
w{paddle::framework::vectorize2int(w->dims())} { w(paddle::framework::vectorize2int(w->dims())) {
with_bias_ = bias; with_bias_ = bias;
} }
...@@ -78,7 +78,7 @@ class MKLDNNMD { ...@@ -78,7 +78,7 @@ class MKLDNNMD {
class MKLDNNMemory { class MKLDNNMemory {
public: public:
MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e) MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e)
: md_{t}, engine_{e} {} : md_(t), engine_(e) {}
virtual ~MKLDNNMemory() = default; virtual ~MKLDNNMemory() = default;
template <typename Output> template <typename Output>
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
if (lod_t->lod().size() > 0) { if (lod_t->lod().size() > 0) {
auto y_lod = lod_t->lod(); auto y_lod = lod_t->lod();
auto last_level = y_lod[y_lod.size() - 1]; auto last_level = y_lod[y_lod.size() - 1];
PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0], PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
"Last value of `Y`'s last level LoD should be equal " "Last value of `Y`'s last level LoD should be equal "
"to the first dimension of `X`"); "to the first dimension of `X`");
out->set_lod(y_lod); out->set_lod(y_lod);
......
...@@ -18,22 +18,6 @@ limitations under the License. */ ...@@ -18,22 +18,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static inline framework::OpKernelType ExpectedKernelType(
const framework::ExecutionContext& ctx) {
auto* table_var = ctx.InputVar("W");
if (table_var->IsType<LoDTensor>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<LoDTensor>().type()),
ctx.device_context());
} else if (table_var->IsType<SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
ctx.device_context());
} else {
PADDLE_THROW("W should be LoDTensor or SelectedRows");
}
}
class LookupTableOp : public framework::OperatorWithKernel { class LookupTableOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
...@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
......
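Both kernel-type hooks now defer to framework::GetDataTypeOfVar instead of the removed local helper. A rough sketch of what such a utility has to do, based only on the removed code above (the real framework implementation may differ in detail):

// Sketch: derive the kernel data type from a parameter that may be a dense
// LoDTensor or a sparse SelectedRows.
framework::OpKernelType KernelTypeFromW(const framework::ExecutionContext& ctx) {
  auto* var = ctx.InputVar("W");
  if (var->IsType<LoDTensor>()) {
    return framework::OpKernelType(
        framework::ToDataType(var->Get<LoDTensor>().type()),
        ctx.device_context());
  } else if (var->IsType<SelectedRows>()) {
    return framework::OpKernelType(
        framework::ToDataType(var->Get<SelectedRows>().value().type()),
        ctx.device_context());
  }
  PADDLE_THROW("W should be LoDTensor or SelectedRows");
}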
...@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor; ...@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows; using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim; using DDim = framework::DDim;
static constexpr int64_t kNoPadding = -1; constexpr int64_t kNoPadding = -1;
inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
auto it = std::find(rows.begin(), rows.end(), value);
PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
return static_cast<size_t>(std::distance(rows.begin(), it));
}
template <typename T> template <typename T>
class LookupTableKernel : public framework::OpKernel<T> { class LookupTableKernel : public framework::OpKernel<T> {
...@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W"); auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims(); table_dim = table_t->value().dims();
} else { } else {
PADDLE_THROW("table only support LoDTensor and SelectedRows"); PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
} }
int64_t *ids; int64_t *ids;
...@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
} else { } else {
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0);
auto id_index = getIndex(table_t.rows(), ids[i]); auto id_index = table_t.index(ids[i]);
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
...@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W"); auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims(); table_dim = table_t->value().dims();
} else { } else {
PADDLE_THROW("table only support LoDTensor and SelectedRows"); PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
} }
bool is_sparse = context.Attr<bool>("is_sparse"); bool is_sparse = context.Attr<bool>("is_sparse");
......
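The linear-scan getIndex helper above is replaced by asking the SelectedRows table itself for the row position of an id. Conceptually the lookup is unchanged; a standalone sketch of the mapping (illustrative only, the real SelectedRows member may be implemented differently, e.g. with a hash map):

// Sketch: find the position of a sparse id inside the rows() vector.
#include <algorithm>
#include <cstdint>
#include <vector>
inline size_t RowIndexOf(const std::vector<int64_t>& rows, int64_t id) {
  auto it = std::find(rows.begin(), rows.end(), id);
  // In the kernels above a missing id is an error (PADDLE_ENFORCE).
  return static_cast<size_t>(std::distance(rows.begin(), it));
}

Moving this behind the SelectedRows interface lets the container switch to a faster index structure later without touching every kernel that performs lookups.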
...@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>(
cublasOperation_t cuTransB = cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha); float h_alpha = static_cast<float>(alpha);
const half h_beta = static_cast<const half>(beta); float h_beta = static_cast<float>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53"); "cublas fp16 gemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
h_A, lda, &h_beta, h_C, N)); #if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_TENSOR_OP_MATH));
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} else {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH));
}
#endif
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in Volta GPUs.
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo));
} }
template <> template <>
......
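A compact sketch of the choice encoded above: on CUDA 9.0+ with compute capability 70 or higher, tensor-op math and the tensor-op GEMM algorithm are selected; otherwise the default algorithm is used with the same cublasGemmEx call (fp16 inputs/outputs, fp32 accumulation). The helper name is an assumption; the enums are standard cuBLAS:

#include <cublas_v2.h>
#include <cuda.h>
#if CUDA_VERSION >= 9000
// Sketch: pick the GEMM algorithm for fp16-in / fp32-compute cublasGemmEx.
inline cublasGemmAlgo_t PickFp16GemmAlgo(int compute_capability) {
  return compute_capability >= 70 ? CUBLAS_GEMM_DFALT_TENSOR_OP
                                  : CUBLAS_GEMM_DFALT;
}
#endif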
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/math/softmax_impl.h"
...@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>; ...@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>; template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>; template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>; template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>; template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
......
...@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; ...@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T>
struct ValueClip { struct ValueClip {
HOSTDEVICE T operator()(const T& x) const { HOSTDEVICE T operator()(const T& x) const {
const T kThreshold = -64.; const T kThreshold = static_cast<T>(-64.);
return x < kThreshold ? kThreshold : x; return x < kThreshold ? kThreshold : x;
} }
}; };
......
...@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { ...@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
...@@ -171,6 +171,5 @@ namespace ops = paddle::operators; ...@@ -171,6 +171,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker, REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float>,
prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>, ops::PriorBoxOpKernel<double>);
ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prior_box_op.h"
namespace paddle {
namespace operators {
template <typename T>
__device__ inline T clip(T in) {
return min(max(in, 0.), 1.);
}
template <typename T>
__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
const int width, const int im_height,
const int im_width, const int as_num,
const T offset, const T step_width,
const T step_height, const T* min_sizes,
const T* max_sizes, const int min_num,
bool is_clip) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
int h = i / (num_priors * width);
int w = (i / num_priors) % width;
int p = i % num_priors;
int m = max_sizes ? p / (as_num + 1) : p / as_num;
T cx = (w + offset) * step_width;
T cy = (h + offset) * step_height;
T bw, bh;
T min_size = min_sizes[m];
if (max_sizes) {
int s = p % (as_num + 1);
if (s < as_num) {
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
} else {
T max_size = max_sizes[m];
bw = sqrt(min_size * max_size) / 2.;
bh = bw;
}
} else {
int s = p % as_num;
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
}
T xmin = (cx - bw) / im_width;
T ymin = (cy - bh) / im_height;
T xmax = (cx + bw) / im_width;
T ymax = (cy + bh) / im_height;
out[i * 4] = is_clip ? clip<T>(xmin) : xmin;
out[i * 4 + 1] = is_clip ? clip<T>(ymin) : ymin;
out[i * 4 + 2] = is_clip ? clip<T>(xmax) : xmax;
out[i * 4 + 3] = is_clip ? clip<T>(ymax) : ymax;
}
}
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
}
template <typename T>
class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* image = ctx.Input<paddle::framework::Tensor>("Image");
auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto flip = ctx.Attr<bool>("flip");
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto im_width = image->dims()[3];
auto im_height = image->dims()[2];
auto width = input->dims()[3];
auto height = input->dims()[2];
T step_width, step_height;
if (step_w == 0 || step_h == 0) {
step_width = static_cast<T>(im_width) / width;
step_height = static_cast<T>(im_height) / height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = aspect_ratios.size() * min_sizes.size();
if (max_sizes.size() > 0) {
num_priors += max_sizes.size();
}
int min_num = static_cast<int>(min_sizes.size());
int box_num = width * height * num_priors;
int block = 512;
int grid = (box_num + block - 1) / block;
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor r;
framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r);
framework::Tensor min;
framework::TensorFromVector(min_sizes, ctx.device_context(), &min);
T* max_data = nullptr;
framework::Tensor max;
if (max_sizes.size() > 0) {
framework::TensorFromVector(max_sizes, ctx.device_context(), &max);
max_data = max.data<T>();
}
GenPriorBox<T><<<grid, block, 0, stream>>>(
boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
max_data, min_num, clip);
framework::Tensor v;
framework::TensorFromVector(variances, ctx.device_context(), &v);
grid = (box_num * 4 + block - 1) / block;
SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
variances.size(), box_num * 4);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
ops::PriorBoxOpCUDAKernel<double>);
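A small self-contained sketch of the prior-box geometry the kernel above generates: the number of priors per feature-map location and one (xmin, ymin, xmax, ymax) box, using the same formulas; all concrete numbers are illustrative:

#include <cmath>
#include <cstdio>
int main() {
  const int as_num = 3, min_num = 2;     // aspect ratios and min sizes
  const bool has_max_sizes = true;       // adds one square prior per min size
  int num_priors = has_max_sizes ? as_num * min_num + min_num : as_num * min_num;
  // One prior: min_size 30, aspect ratio 2, 300x300 image, step 8, offset 0.5.
  double min_size = 30., ar = 2., offset = 0.5, step = 8.;
  double cx = (10 + offset) * step, cy = (5 + offset) * step;
  double bw = min_size * std::sqrt(ar) / 2., bh = min_size / std::sqrt(ar) / 2.;
  std::printf("num_priors=%d box=[%.3f, %.3f, %.3f, %.3f]\n", num_priors,  // 8
              (cx - bw) / 300., (cy - bh) / 300.,
              (cx + bw) / 300., (cy + bh) / 300.);
  return 0;
}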
...@@ -51,7 +51,7 @@ struct ClipFunctor { ...@@ -51,7 +51,7 @@ struct ClipFunctor {
} }
}; };
template <typename Place, typename T> template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> { class PriorBoxOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
...@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
int idx = 0; int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) { for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s]; auto min_size = min_sizes[s];
// first prior: aspect_ratio = 1, size = min_size // priors with different aspect ratios
box_width = box_height = min_size / 2.; for (size_t r = 0; r < aspect_ratios.size(); ++r) {
// xmin float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++; idx++;
}
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// second prior: aspect_ratio = 1, // square prior with size sqrt(minSize * maxSize)
// size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size * max_size) / 2.; box_width = box_height = sqrt(min_size * max_size) / 2.;
// xmin
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
// rest of priors
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
float ar = aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
// xmin
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width; e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height; e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width; e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height; e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++; idx++;
} }
......
...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase { ...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset( out->Reset(
new BatchReader(underlying_reader.Get(), Attr<int>("batch_size"))); new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
} }
......
...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto place_str = Attr<std::string>("place"); auto place_str = Attr<std::string>("place");
platform::Place place; platform::Place place;
......
...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { ...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto& out = detail::Ref(scope.FindVar(Output("Out")));
int pass_num = Attr<int>("pass_num"); int pass_num = Attr<int>("pass_num");
out.GetMutable<framework::ReaderHolder>()->Reset( out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
new MultiPassReader(underlying_reader.Get(), pass_num));
} }
}; };
......
...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase { ...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto& var = detail::Ref(scope.FindVar(Output("Out"))); out->Reset(
var.GetMutable<framework::ReaderHolder>()->Reset(
new ShuffleReader(underlying_reader.Get(), new ShuffleReader(underlying_reader.Get(),
static_cast<size_t>(Attr<int>("buffer_size")))); static_cast<size_t>(Attr<int>("buffer_size"))));
} }
......
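All four reader-creation ops in this change get the same guard: if the output ReaderHolder already holds a reader, RunImpl returns early, so re-running the creation op is a no-op instead of rebuilding the reader chain. A stripped-down sketch of that idempotent-create pattern (stand-in types, not the framework classes):

#include <memory>
struct Reader {};                         // stand-in for framework::ReaderBase
class ReaderHolder {                      // stand-in for framework::ReaderHolder
 public:
  Reader* Get() const { return reader_.get(); }
  void Reset(Reader* r) { reader_.reset(r); }
 private:
  std::unique_ptr<Reader> reader_;
};
void CreateReaderOnce(ReaderHolder* out) {
  if (out->Get() != nullptr) return;      // already created: keep it
  out->Reset(new Reader());               // first run: build the reader
}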
...@@ -43,9 +43,8 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -43,9 +43,8 @@ class SGDOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()), return framework::OpKernelType(data_type, ctx.device_context());
ctx.GetPlace());
} }
}; };
...@@ -53,10 +52,12 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,10 +52,12 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter"); AddInput("Param", "(Tensor or SelectedRows) Input parameter");
AddInput("LearningRate", "(Tensor) Learning rate of SGD"); AddInput("LearningRate", "(Tensor) Learning rate of SGD");
AddInput("Grad", "(Tensor) Input gradient"); AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("ParamOut",
"(Tensor or SelectedRows, same with Param) "
"Output parameter, should share the same memory with Param");
AddComment(R"DOC( AddComment(R"DOC(
SGD operator SGD operator
......
...@@ -23,21 +23,25 @@ namespace operators { ...@@ -23,21 +23,25 @@ namespace operators {
template <typename T> template <typename T>
class SGDOpKernel : public framework::OpKernel<T> { class SGDOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto* param = ctx.Input<framework::Tensor>("Param"); const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate"); const auto *param_var = ctx.InputVar("Param");
const auto *grad_var = ctx.InputVar("Grad");
if (param_var->IsType<framework::LoDTensor>()) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* grad_var = ctx.InputVar("Grad");
// Actually, all tensors are LoDTensor except SelectedRows. // Actually, all tensors are LoDTensor except SelectedRows.
if (grad_var->IsType<framework::LoDTensor>()) { if (grad_var->IsType<framework::LoDTensor>()) {
param_out->mutable_data<T>(ctx.GetPlace()); param_out->mutable_data<T>(ctx.GetPlace());
auto* grad = ctx.Input<framework::Tensor>("Grad"); const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto p = framework::EigenVector<T>::Flatten(*param); auto p = framework::EigenVector<T>::Flatten(*param);
auto g = framework::EigenVector<T>::Flatten(*grad); auto g = framework::EigenVector<T>::Flatten(*grad);
auto o = framework::EigenVector<T>::Flatten(*param_out); auto o = framework::EigenVector<T>::Flatten(*param_out);
auto* lr = learning_rate->data<T>(); auto *lr = learning_rate->data<T>();
o = p - lr[0] * g; o = p - lr[0] * g;
} else if (grad_var->IsType<framework::SelectedRows>()) { } else if (grad_var->IsType<framework::SelectedRows>()) {
...@@ -45,7 +49,7 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -45,7 +49,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
// This manual optimization brings difficulty to track data dependency. // This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution. // It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out); PADDLE_ENFORCE_EQ(param, param_out);
auto* grad = ctx.Input<framework::SelectedRows>("Grad"); const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
// for distributed training, a sparse var may be empty, // for distributed training, a sparse var may be empty,
// just skip updating. // just skip updating.
...@@ -53,31 +57,64 @@ class SGDOpKernel : public framework::OpKernel<T> { ...@@ -53,31 +57,64 @@ class SGDOpKernel : public framework::OpKernel<T> {
return; return;
} }
auto in_height = grad->height(); auto grad_height = grad->height();
auto out_dims = param_out->dims(); auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);
auto& in_value = grad->value(); auto &grad_value = grad->value();
auto& in_rows = grad->rows(); auto &grad_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); size_t grad_row_numel = grad_value.numel() / grad_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height);
auto* in_data = in_value.data<T>(); auto *grad_data = grad_value.data<T>();
auto* out_data = param_out->data<T>(); auto *out_data = param_out->data<T>();
auto* lr = learning_rate->data<T>(); auto *lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) { for (size_t i = 0; i < grad_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height, PADDLE_ENFORCE(grad_rows[i] < grad_height,
"Input rows index should less than height"); "Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) { for (int64_t j = 0; j < grad_row_numel; j++) {
out_data[in_rows[i] * in_row_numel + j] -= out_data[grad_rows[i] * grad_row_numel + j] -=
lr[0] * in_data[i * in_row_numel + j]; lr[0] * grad_data[i * grad_row_numel + j];
} }
} }
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Grad");
} }
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
"when param "
"is SelectedRows, gradient should also be SelectedRows");
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad.rows().size() == 0) {
return;
}
size_t param_row_width = param.value().numel() / param.rows().size();
size_t grad_row_width = grad.value().numel() / grad.rows().size();
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
"param_row should have the same size with grad_row");
const auto *lr = learning_rate->data<T>();
const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>();
for (size_t i = 0; i < grad.rows().size(); i++) {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height");
int64_t id_index = param.index(grad.rows()[i]);
for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Parameter");
}
} }
}; };
} // namespace operators } // namespace operators
......
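The SelectedRows branches above apply a row-wise sparse update, param[rows[i]][j] -= lr * grad[i][j], touching only the rows present in the gradient. A tiny worked sketch with plain arrays (no framework types; the values are made up):

#include <cstdint>
#include <cstdio>
#include <vector>
int main() {
  const float lr = 0.1f;
  const int width = 2;                             // row width
  std::vector<float> param = {1, 1, 2, 2, 3, 3};   // 3 rows x 2 cols
  std::vector<int64_t> grad_rows = {0, 2};         // only rows 0 and 2 change
  std::vector<float> grad = {10, 10, 20, 20};      // gradient rows, same width
  for (size_t i = 0; i < grad_rows.size(); ++i)
    for (int j = 0; j < width; ++j)
      param[grad_rows[i] * width + j] -= lr * grad[i * width + j];
  // Expected: row0 -> (0, 0), row1 unchanged (2, 2), row2 -> (1, 1).
  std::printf("%g %g | %g %g | %g %g\n", param[0], param[1], param[2],
              param[3], param[4], param[5]);
  return 0;
}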
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#endif #endif
...@@ -20,6 +23,7 @@ limitations under the License. */ ...@@ -20,6 +23,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,8 +64,8 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -60,8 +64,8 @@ class SoftmaxOp : public framework::OperatorWithKernel {
auto input_data_type = auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type()); framework::ToDataType(ctx.Input<Tensor>("X")->type());
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"float16 can only be used when CUDNN is used"); "float16 can only be used on GPU place");
} }
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -70,6 +74,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -70,6 +74,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
library_); library_);
} }
}; };
class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
...@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
softmax_grad, softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>); ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(softmax_grad,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>);
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -6,8 +6,8 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ ...@@ -6,8 +6,8 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies(profiler_py_proto profiler_py_proto_init) add_dependencies(profiler_py_proto profiler_py_proto_init)
add_custom_command(TARGET profiler_py_proto POST_BUILD add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...@@ -42,12 +42,12 @@ ENDIF() ...@@ -42,12 +42,12 @@ ENDIF()
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies # avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator cc_library(device_context SRCS device_context.cc DEPS malloc
system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/string/printf.h"
#include <ostream> #include <ostream>
#include <sstream> #include <sstream>
...@@ -20,6 +19,7 @@ ...@@ -20,6 +19,7 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
DECLARE_double(fraction_of_cpu_memory_to_use); DECLARE_double(fraction_of_cpu_memory_to_use);
......
...@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor { ...@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
} }
#endif #endif
cudnnDataType_t compute_type =
(type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
desc_, pads.size(), pads.data(), strides.data(), dilations.data(), desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
CUDNN_CROSS_CORRELATION, type)); CUDNN_CROSS_CORRELATION, compute_type));
return desc_; return desc_;
} }
......
...@@ -24,6 +24,10 @@ void *cublas_dso_handle = nullptr; ...@@ -24,6 +24,10 @@ void *cublas_dso_handle = nullptr;
CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -39,9 +40,9 @@ extern void *cublas_dso_handle; ...@@ -39,9 +40,9 @@ extern void *cublas_dso_handle;
template <typename... Args> \ template <typename... Args> \
inline cublasStatus_t operator()(Args... args) { \ inline cublasStatus_t operator()(Args... args) { \
typedef cublasStatus_t (*cublasFunc)(Args...); \ typedef cublasStatus_t (*cublasFunc)(Args...); \
std::call_once(cublas_dso_flag, \ std::call_once(cublas_dso_flag, []() { \
paddle::platform::dynload::GetCublasDsoHandle, \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
&cublas_dso_handle); \ }); \
void *p_##__name = dlsym(cublas_dso_handle, #__name); \ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublasFunc>(p_##__name)(args...); \ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
} \ } \
...@@ -70,6 +71,7 @@ extern void *cublas_dso_handle; ...@@ -70,6 +71,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \ __macro(cublasHgemm); \
__macro(cublasSgemmEx); \ __macro(cublasSgemmEx); \
__macro(cublasGemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -89,9 +91,15 @@ extern void *cublas_dso_handle; ...@@ -89,9 +91,15 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched) __macro(cublasDgetriBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload } // namespace dynload
......
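The wrapper macros above switch std::call_once from the function-pointer-plus-out-parameter form to a lambda that assigns the handle. A minimal standalone sketch of that lazy dlopen/dlsym idiom outside the macro (libm and sin stand in for the cuBLAS library and symbol; link with -ldl):

#include <dlfcn.h>
#include <cstdio>
#include <mutex>  // NOLINT
static std::once_flag dso_flag;
static void* dso_handle = nullptr;
double CallSin(double x) {
  std::call_once(dso_flag, []() {          // open the library exactly once
    dso_handle = dlopen("libm.so.6", RTLD_LAZY);
  });
  if (dso_handle == nullptr) return 0.0;   // sketch-level error handling
  using sin_func = double (*)(double);
  void* p = dlsym(dso_handle, "sin");      // resolve the symbol by name
  return reinterpret_cast<sin_func>(p)(x);
}
int main() { std::printf("%f\n", CallSin(0.0)); return 0; }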
...@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); ...@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
bool HasCUDNN() { bool HasCUDNN() {
std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr; return cudnn_dso_handle != nullptr;
} }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <cudnn.h> #include <cudnn.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -35,9 +35,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -35,9 +35,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...)) (*)(Args...); \ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudnn_dso_flag, \ std::call_once(cudnn_dso_flag, []() { \
paddle::platform::dynload::GetCUDNNDsoHandle, \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
&cudnn_dso_handle); \ }); \
EnforceCUDNNLoaded(#__name); \ EnforceCUDNNLoaded(#__name); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \ return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
...@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); __macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
#include <cuda.h> #include <cuda.h>
#include <cupti.h> #include <cupti.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -41,9 +42,9 @@ extern void *cupti_dso_handle; ...@@ -41,9 +42,9 @@ extern void *cupti_dso_handle;
template <typename... Args> \ template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \ inline CUptiResult CUPTIAPI operator()(Args... args) { \
typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \ typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \
std::call_once(cupti_dso_flag, \ std::call_once(cupti_dso_flag, []() { \
paddle::platform::dynload::GetCUPTIDsoHandle, \ cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
&cupti_dso_handle); \ }); \
void *p_##__name = dlsym(cupti_dso_handle, #__name); \ void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \ return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \ } \
......
...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <curand.h> #include <curand.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -30,9 +31,9 @@ extern void *curand_dso_handle; ...@@ -30,9 +31,9 @@ extern void *curand_dso_handle;
template <typename... Args> \ template <typename... Args> \
curandStatus_t operator()(Args... args) { \ curandStatus_t operator()(Args... args) { \
typedef curandStatus_t (*curandFunc)(Args...); \ typedef curandStatus_t (*curandFunc)(Args...); \
std::call_once(curand_dso_flag, \ std::call_once(curand_dso_flag, []() { \
paddle::platform::dynload::GetCurandDsoHandle, \ curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
&curand_dso_handle); \ }); \
void *p_##__name = dlsym(curand_dso_handle, #__name); \ void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \ return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \ } \
......
...@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h> #include <dlfcn.h>
#include <memory> #include <memory>
#include <mutex> #include <mutex> // NOLINT
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
...@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1, ...@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1,
return ret; return ret;
} }
static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
void** dso_handle,
int dynload_flags) { int dynload_flags) {
VLOG(3) << "Try to find library: " << dso_path VLOG(3) << "Try to find library: " << dso_path
<< " from default system path."; << " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
    // bring System Integrity Protection (SIP), if dso_handle                 // bring System Integrity Protection (SIP), if dso_handle
// is null, search from default package path in Mac OS. // is null, search from default package path in Mac OS.
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path); dso_handle =
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
if (dso_path == "libcudnn.dylib") { if (dso_path == "libcudnn.dylib") {
LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf " "For instance, sudo tar -xzf "
...@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, ...@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
} }
} }
#endif #endif
return dso_handle;
} }
static inline void GetDsoHandleFromSearchPath(const std::string& search_root, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
const std::string& dso_name, const std::string& dso_name,
void** dso_handle,
bool throw_on_error = true) { bool throw_on_error = true) {
int dynload_flags = RTLD_LAZY | RTLD_LOCAL; int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
*dso_handle = nullptr; void* dso_handle = nullptr;
std::string dlPath = dso_name; std::string dlPath = dso_name;
if (search_root.empty()) { if (search_root.empty()) {
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
} else { } else {
// search xxx.so from custom path // search xxx.so from custom path
dlPath = join(search_root, dso_name); dlPath = join(search_root, dso_name);
*dso_handle = dlopen(dlPath.c_str(), dynload_flags); dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// if not found, search from default path // if not found, search from default path
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")"; << dlerror() << ")";
dlPath = dso_name; dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
} }
} }
auto error_msg = auto error_msg =
...@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
"using the DYLD_LIBRARY_PATH is impossible unless System " "using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled."; "Integrity Protection (SIP) is disabled.";
if (throw_on_error) { if (throw_on_error) {
PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror());
} else if (nullptr == *dso_handle) { } else if (nullptr == dso_handle) {
LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
} }
return dso_handle;
} }
void GetCublasDsoHandle(void** dso_handle) { void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif #endif
} }
void GetCUDNNDsoHandle(void** dso_handle) { void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
false);
#else #else
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif #endif
} }
void GetCUPTIDsoHandle(void** dso_handle) { void* GetCUPTIDsoHandle() {
std::string cupti_path = cupti_lib_path; std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) { if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir; cupti_path = FLAGS_cupti_dir;
} }
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false); return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
#else #else
GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false); return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
#endif #endif
} }
void GetCurandDsoHandle(void** dso_handle) { void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif #endif
} }
void GetWarpCTCDsoHandle(void** dso_handle) { void* GetWarpCTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
#endif #endif
} }
void GetLapackDsoHandle(void** dso_handle) { void* GetLapackDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
#endif #endif
} }
void GetNCCLDsoHandle(void** dso_handle) { void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
#endif #endif
} }
......
...@@ -18,55 +18,13 @@ namespace paddle { ...@@ -18,55 +18,13 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
/** void* GetCublasDsoHandle();
* @brief load the DSO of CUBLAS void* GetCUDNNDsoHandle();
* void* GetCUPTIDsoHandle();
* @param **dso_handle dso handler void* GetCurandDsoHandle();
* void* GetWarpCTCDsoHandle();
*/ void* GetLapackDsoHandle();
void GetCublasDsoHandle(void** dso_handle); void* GetNCCLDsoHandle();
/**
* @brief load the DSO of CUDNN
*
* @param **dso_handle dso handler
*
*/
void GetCUDNNDsoHandle(void** dso_handle);
void GetCUPTIDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CURAND
*
* @param **dso_handle dso handler
*
*/
void GetCurandDsoHandle(void** dso_handle);
/**
* @brief load the DSO of warp-ctc
*
* @param **dso_handle dso handler
*
*/
void GetWarpCTCDsoHandle(void** dso_handle);
/**
* @brief load the DSO of lapack
*
* @param **dso_handle dso handler
*
*/
void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of NVIDIA nccl
*
* @param **dso_handle dso handler
*
*/
void GetNCCLDsoHandle(void** dso_handle);
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
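Editor's note: the dynamic_loader refactor above turns every loader from a void(void**) out-parameter signature into a function that returns the handle directly, which is what lets the call_once lambdas assign it in one expression. A hedged sketch of the call-site difference, with "Foo" as a placeholder for cublas/cudnn/etc.:

#include <dlfcn.h>

// Placeholder for the Get*DsoHandle() declarations in the header hunk above.
void* GetFooDsoHandle() {
  // The handle is now the return value instead of a void** out-parameter.
  return dlopen("libfoo.so", RTLD_LAZY | RTLD_LOCAL);  // may be nullptr
}

int main() {
  void* handle = GetFooDsoHandle();  // new style: direct initialization
  // old style for comparison:  void* handle = nullptr; GetFooDsoHandle(&handle);
  if (handle != nullptr) dlclose(handle);
  return 0;
}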
...@@ -25,11 +25,6 @@ void *nccl_dso_handle; ...@@ -25,11 +25,6 @@ void *nccl_dso_handle;
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
[] { GetNCCLDsoHandle(&nccl_dso_handle); });
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <dlfcn.h> #include <dlfcn.h>
#include <nccl.h> #include <nccl.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -28,14 +29,15 @@ extern std::once_flag nccl_dso_flag; ...@@ -28,14 +29,15 @@ extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle; extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
extern void LoadNCCLDSO();
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \ using nccl_func = decltype(__name(args...)) (*)(Args...); \
paddle::platform::dynload::LoadNCCLDSO(); \ std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \ void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \ return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \ } \
......
...@@ -15,9 +15,10 @@ limitations under the License. */ ...@@ -15,9 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "ctc.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "warpctc/include/ctc.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -36,9 +37,9 @@ extern void* warpctc_dso_handle; ...@@ -36,9 +37,9 @@ extern void* warpctc_dso_handle;
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ using warpctcFunc = decltype(__name(args...)) (*)(Args...); \
std::call_once(warpctc_dso_flag, \ std::call_once(warpctc_dso_flag, []() { \
paddle::platform::dynload::GetWarpCTCDsoHandle, \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
&warpctc_dso_handle); \ }); \
void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ void* p_##_name = dlsym(warpctc_dso_handle, #__name); \
return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \ return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \
} \ } \
......
...@@ -16,35 +16,35 @@ limitations under the License. */ ...@@ -16,35 +16,35 @@ limitations under the License. */
#include <dlfcn.h> // for dladdr #include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace #include <execinfo.h> // for backtrace
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif // PADDLE_WITH_CUDA
#include <iomanip> #include <iomanip>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif
#include <glog/logging.h>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/dynload/curand.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif #endif
namespace paddle { namespace paddle {
...@@ -185,7 +185,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -185,7 +185,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
} }
} }
#endif // PADDLE_ONLY_CPU #endif // PADDLE_WITH_CUDA
template <typename T> template <typename T>
inline void throw_on_error(T e) { inline void throw_on_error(T e) {
......
...@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) { ...@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) { ...@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GE(1, 2UL); PADDLE_ENFORCE_GE(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) { ...@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) { ...@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) {
try { try {
int* a = nullptr; int* a = nullptr;
PADDLE_ENFORCE_NOT_NULL(a); PADDLE_ENFORCE_NOT_NULL(a);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <limits>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
...@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 {
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const { HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(float(*this)); return static_cast<int8_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint8_t() const { HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(float(*this)); return static_cast<uint8_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int16_t() const { HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(float(*this)); return static_cast<int16_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint16_t() const { HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(float(*this)); return static_cast<uint16_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int32_t() const { HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(float(*this)); return static_cast<int32_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint32_t() const { HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(float(*this)); return static_cast<uint32_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int64_t() const { HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(float(*this)); return static_cast<int64_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint64_t() const { HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(float(*this)); return static_cast<uint64_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator double() const { HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(float(*this)); return static_cast<double>(static_cast<float>(*this));
} }
private: private:
...@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) { ...@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hadd(a, b); return __hadd(a, b);
#else #else
float res = float(float16(a)) + float(float16(b)); float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) { ...@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hsub(a, b); return __hsub(a, b);
#else #else
float res = float(float16(a)) - float(float16(b)); float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) { ...@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hmul(a, b); return __hmul(a, b);
#else #else
float res = float(float16(a)) * float(float16(b)); float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) { ...@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
float denom = __half2float(b); float denom = __half2float(b);
return __float2half(num / denom); return __float2half(num / denom);
#else #else
float res = float(float16(a)) / float(float16(b)); float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) { ...@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hneg(a); return __hneg(a);
#else #else
float res = -float(float16(a)); float res = -static_cast<float>(float16(a));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
DEVICE inline half& operator+=(half& a, const half& b) { DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
} }
DEVICE inline half& operator-=(half& a, const half& b) { DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT
a = a - b; a = a - b;
return a; return a;
} }
DEVICE inline half& operator*=(half& a, const half& b) { DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT
a = a * b; a = a * b;
return a; return a;
} }
DEVICE inline half& operator/=(half& a, const half& b) { DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
...@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) { ...@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(a, b); return __heq(a, b);
#else #else
return float(float16(a)) == float(float16(b)); return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
#endif #endif
} }
...@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) { ...@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(a, b); return __hne(a, b);
#else #else
return float(float16(a)) != float(float16(b)); return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
#endif #endif
} }
...@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) { ...@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(a, b); return __hlt(a, b);
#else #else
return float(float16(a)) < float(float16(b)); return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
#endif #endif
} }
...@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) { ...@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(a, b); return __hle(a, b);
#else #else
return float(float16(a)) <= float(float16(b)); return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
#endif #endif
} }
...@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) { ...@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(a, b); return __hgt(a, b);
#else #else
return float(float16(a)) > float(float16(b)); return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
#endif #endif
} }
...@@ -477,7 +478,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { ...@@ -477,7 +478,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(a, b); return __hge(a, b);
#else #else
return float(float16(a)) >= float(float16(b)); return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
#endif #endif
} }
...@@ -489,7 +490,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { ...@@ -489,7 +490,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hadd(half(a), half(b))); return float16(__hadd(half(a), half(b)));
#else #else
return float16(float(a) + float(b)); return float16(static_cast<float>(a) + static_cast<float>(b));
#endif #endif
} }
...@@ -497,7 +498,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { ...@@ -497,7 +498,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hsub(half(a), half(b))); return float16(__hsub(half(a), half(b)));
#else #else
return float16(float(a) - float(b)); return float16(static_cast<float>(a) - static_cast<float>(b));
#endif #endif
} }
...@@ -505,7 +506,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { ...@@ -505,7 +506,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hmul(half(a), half(b))); return float16(__hmul(half(a), half(b)));
#else #else
return float16(float(a) * float(b)); return float16(static_cast<float>(a) * static_cast<float>(b));
#endif #endif
} }
...@@ -516,7 +517,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { ...@@ -516,7 +517,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
float denom = __half2float(half(b)); float denom = __half2float(half(b));
return float16(num / denom); return float16(num / denom);
#else #else
return float16(float(a) / float(b)); return float16(static_cast<float>(a) / static_cast<float>(b));
#endif #endif
} }
...@@ -530,22 +531,22 @@ HOSTDEVICE inline float16 operator-(const float16& a) { ...@@ -530,22 +531,22 @@ HOSTDEVICE inline float16 operator-(const float16& a) {
#endif #endif
} }
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
} }
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = a - b; a = a - b;
return a; return a;
} }
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = a * b; a = a * b;
return a; return a;
} }
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
...@@ -554,7 +555,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { ...@@ -554,7 +555,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(half(a), half(b)); return __heq(half(a), half(b));
#else #else
return float(a) == float(b); return static_cast<float>(a) == static_cast<float>(b);
#endif #endif
} }
...@@ -562,7 +563,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { ...@@ -562,7 +563,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(half(a), half(b)); return __hne(half(a), half(b));
#else #else
return float(a) != float(b); return static_cast<float>(a) != static_cast<float>(b);
#endif #endif
} }
...@@ -570,7 +571,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { ...@@ -570,7 +571,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(half(a), half(b)); return __hlt(half(a), half(b));
#else #else
return float(a) < float(b); return static_cast<float>(a) < static_cast<float>(b);
#endif #endif
} }
...@@ -578,7 +579,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { ...@@ -578,7 +579,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(half(a), half(b)); return __hle(half(a), half(b));
#else #else
return float(a) <= float(b); return static_cast<float>(a) <= static_cast<float>(b);
#endif #endif
} }
...@@ -586,7 +587,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { ...@@ -586,7 +587,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(half(a), half(b)); return __hgt(half(a), half(b));
#else #else
return float(a) > float(b); return static_cast<float>(a) > static_cast<float>(b);
#endif #endif
} }
...@@ -594,7 +595,7 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { ...@@ -594,7 +595,7 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(half(a), half(b)); return __hge(half(a), half(b));
#else #else
return float(a) >= float(b); return static_cast<float>(a) >= static_cast<float>(b);
#endif #endif
} }
...@@ -679,22 +680,22 @@ inline float16 operator-(const float16& a) { ...@@ -679,22 +680,22 @@ inline float16 operator-(const float16& a) {
return res; return res;
} }
inline float16& operator+=(float16& a, const float16& b) { inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
} }
inline float16& operator-=(float16& a, const float16& b) { inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = a - b; a = a - b;
return a; return a;
} }
inline float16& operator*=(float16& a, const float16& b) { inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = a * b; a = a * b;
return a; return a;
} }
inline float16& operator/=(float16& a, const float16& b) { inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
...@@ -784,19 +785,19 @@ inline bool operator>=(const float16& a, const float16& b) { ...@@ -784,19 +785,19 @@ inline bool operator>=(const float16& a, const float16& b) {
// Arithmetic operators for float16, software emulated on other CPU // Arithmetic operators for float16, software emulated on other CPU
#else #else
inline float16 operator+(const float16& a, const float16& b) { inline float16 operator+(const float16& a, const float16& b) {
return float16(float(a) + float(b)); return float16(static_cast<float>(a) + static_cast<float>(b));
} }
inline float16 operator-(const float16& a, const float16& b) { inline float16 operator-(const float16& a, const float16& b) {
return float16(float(a) - float(b)); return float16(static_cast<float>(a) - static_cast<float>(b));
} }
inline float16 operator*(const float16& a, const float16& b) { inline float16 operator*(const float16& a, const float16& b) {
return float16(float(a) * float(b)); return float16(static_cast<float>(a) * static_cast<float>(b));
} }
inline float16 operator/(const float16& a, const float16& b) { inline float16 operator/(const float16& a, const float16& b) {
return float16(float(a) / float(b)); return float16(static_cast<float>(a) / static_cast<float>(b));
} }
inline float16 operator-(const float16& a) { inline float16 operator-(const float16& a) {
...@@ -805,51 +806,57 @@ inline float16 operator-(const float16& a) { ...@@ -805,51 +806,57 @@ inline float16 operator-(const float16& a) {
return res; return res;
} }
inline float16& operator+=(float16& a, const float16& b) { inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = float16(float(a) + float(b)); a = float16(static_cast<float>(a) + static_cast<float>(b));
return a; return a;
} }
inline float16& operator-=(float16& a, const float16& b) { inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = float16(float(a) - float(b)); a = float16(static_cast<float>(a) - static_cast<float>(b));
return a; return a;
} }
inline float16& operator*=(float16& a, const float16& b) { inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = float16(float(a) * float(b)); a = float16(static_cast<float>(a) * static_cast<float>(b));
return a; return a;
} }
inline float16& operator/=(float16& a, const float16& b) { inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = float16(float(a) / float(b)); a = float16(static_cast<float>(a) / static_cast<float>(b));
return a; return a;
} }
inline bool operator==(const float16& a, const float16& b) { inline bool operator==(const float16& a, const float16& b) {
return float(a) == float(b); return static_cast<float>(a) == static_cast<float>(b);
} }
inline bool operator!=(const float16& a, const float16& b) { inline bool operator!=(const float16& a, const float16& b) {
return float(a) != float(b); return static_cast<float>(a) != static_cast<float>(b);
} }
inline bool operator<(const float16& a, const float16& b) { inline bool operator<(const float16& a, const float16& b) {
return float(a) < float(b); return static_cast<float>(a) < static_cast<float>(b);
} }
inline bool operator<=(const float16& a, const float16& b) { inline bool operator<=(const float16& a, const float16& b) {
return float(a) <= float(b); return static_cast<float>(a) <= static_cast<float>(b);
} }
inline bool operator>(const float16& a, const float16& b) { inline bool operator>(const float16& a, const float16& b) {
return float(a) > float(b); return static_cast<float>(a) > static_cast<float>(b);
} }
inline bool operator>=(const float16& a, const float16& b) { inline bool operator>=(const float16& a, const float16& b) {
return float(a) >= float(b); return static_cast<float>(a) >= static_cast<float>(b);
} }
#endif #endif
HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
float16 res;
res.x = a;
return res;
}
HOSTDEVICE inline bool(isnan)(const float16& a) { HOSTDEVICE inline bool(isnan)(const float16& a) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hisnan(half(a)); return __hisnan(half(a));
...@@ -886,28 +893,116 @@ struct is_pod<paddle::platform::float16> { ...@@ -886,28 +893,116 @@ struct is_pod<paddle::platform::float16> {
is_standard_layout<paddle::platform::float16>::value; is_standard_layout<paddle::platform::float16>::value;
}; };
template <>
struct numeric_limits<paddle::platform::float16> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3;
static const int max_digits10 = 5;
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static paddle::platform::float16(min)() {
return paddle::platform::raw_uint16_to_float16(0x400);
}
static paddle::platform::float16 lowest() {
return paddle::platform::raw_uint16_to_float16(0xfbff);
}
static paddle::platform::float16(max)() {
return paddle::platform::raw_uint16_to_float16(0x7bff);
}
static paddle::platform::float16 epsilon() {
return paddle::platform::raw_uint16_to_float16(0x0800);
}
static paddle::platform::float16 round_error() {
return paddle::platform::float16(0.5);
}
static paddle::platform::float16 infinity() {
return paddle::platform::raw_uint16_to_float16(0x7c00);
}
static paddle::platform::float16 quiet_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 signaling_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 denorm_min() {
return paddle::platform::raw_uint16_to_float16(0x1);
}
};
} // namespace std } // namespace std
namespace Eigen { namespace Eigen {
using float16 = paddle::platform::float16;
template <>
struct NumTraits<float16> : GenericNumTraits<float16> {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
HOSTDEVICE static inline float16 epsilon() {
return paddle::platform::raw_uint16_to_float16(0x0800);
}
HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
HOSTDEVICE static inline float16 highest() {
return paddle::platform::raw_uint16_to_float16(0x7bff);
}
HOSTDEVICE static inline float16 lowest() {
return paddle::platform::raw_uint16_to_float16(0xfbff);
}
HOSTDEVICE static inline float16 infinity() {
return paddle::platform::raw_uint16_to_float16(0x7c00);
}
HOSTDEVICE static inline float16 quiet_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7c01);
}
};
namespace numext { namespace numext {
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)( HOSTDEVICE inline bool(isnan)(const float16& a) {
const paddle::platform::float16& a) {
return (paddle::platform::isnan)(a); return (paddle::platform::isnan)(a);
} }
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)( HOSTDEVICE inline bool(isinf)(const float16& a) {
const paddle::platform::float16& a) {
return (paddle::platform::isinf)(a); return (paddle::platform::isinf)(a);
} }
template <> template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)( HOSTDEVICE inline bool(isfinite)(const float16& a) {
const paddle::platform::float16& a) {
return (paddle::platform::isfinite)(a); return (paddle::platform::isfinite)(a);
} }
template <>
HOSTDEVICE inline float16 exp(const float16& a) {
return float16(::expf(static_cast<float>(a)));
}
} // namespace numext } // namespace numext
} // namespace Eigen } // namespace Eigen
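Editor's note: the float16 hunk adds raw_uint16_to_float16() plus std::numeric_limits and Eigen::NumTraits specializations, so generic numeric code can query half-precision limits. A hedged usage sketch, assuming paddle/fluid/platform/float16.h is on the include path; the raw-bit comments simply restate the constants defined in the specialization above.

#include <limits>

#include "paddle/fluid/platform/float16.h"

using paddle::platform::float16;

int main() {
  float16 eps = std::numeric_limits<float16>::epsilon();  // raw bits 0x0800
  float16 hi = (std::numeric_limits<float16>::max)();     // raw bits 0x7bff (65504)
  float16 lo = std::numeric_limits<float16>::lowest();    // raw bits 0xfbff (-65504)
  bool ok = (lo < hi) && (hi < std::numeric_limits<float16>::infinity());
  return ok && static_cast<float>(eps) > 0.0f ? 0 : 1;  // 0 on success
}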
...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include <gtest/gtest.h>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) { ...@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
// Conversion operator // Conversion operator
EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
EXPECT_EQ(float(float16(0.5f)), 0.5f); EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
EXPECT_EQ(int(float16(-1)), -1); EXPECT_EQ(static_cast<int>(float16(-1)), -1);
EXPECT_EQ(bool(float16(true)), true); EXPECT_EQ(static_cast<bool>(float16(true)), true);
} }
TEST(float16, arithmetic_cpu) { TEST(float16, arithmetic_cpu) {
EXPECT_EQ(float(float16(1) + float16(1)), 2); EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
EXPECT_EQ(float(float16(5) + float16(-5)), 0); EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
EXPECT_EQ(float(float16(3) - float16(5)), -2); 0.001);
EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); 0.33334f, 0.001);
EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
EXPECT_EQ(float(-float16(512.0f)), -512.0f); EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
EXPECT_EQ(float(-float16(-512.0f)), 512.0f); 0.001);
EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
} }
TEST(float16, comparison_cpu) { TEST(float16, comparison_cpu) {
......
...@@ -36,19 +36,19 @@ limitations under the License. */ ...@@ -36,19 +36,19 @@ limitations under the License. */
half *in1, *in2, *out; \ half *in1, *in2, *out; \
half *d_in1, *d_in2, *d_out; \ half *d_in1, *d_in2, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, size); \ cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (half*)malloc(size); \ out = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(out[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
free(out); \ free(out); \
...@@ -63,17 +63,17 @@ limitations under the License. */ ...@@ -63,17 +63,17 @@ limitations under the License. */
half *in1, *in2; \ half *in1, *in2; \
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
op_type<<<1, 1>>>(d_in1, d_in2); \ op_type<<<1, 1>>>(d_in1, d_in2); \
cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
EXPECT_EQ(float(float16(in1[0])), v_out); \ EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
free(in1); \ free(in1); \
free(in2); \ free(in2); \
cudaFree(d_in1); \ cudaFree(d_in1); \
...@@ -87,12 +87,12 @@ limitations under the License. */ ...@@ -87,12 +87,12 @@ limitations under the License. */
half *d_in1, *d_in2; \ half *d_in1, *d_in2; \
bool *out, *d_out; \ bool *out, *d_out; \
int size = sizeof(half); \ int size = sizeof(half); \
cudaMalloc((void**)&d_in1, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
cudaMalloc((void**)&d_in2, size); \ cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
cudaMalloc((void**)&d_out, 1); \ cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
in1 = (half*)malloc(size); \ in1 = reinterpret_cast<half*>(malloc(size)); \
in2 = (half*)malloc(size); \ in2 = reinterpret_cast<half*>(malloc(size)); \
out = (bool*)malloc(1); \ out = reinterpret_cast<bool*>(malloc(1)); \
in1[0] = half(float16(v_in1)); \ in1[0] = half(float16(v_in1)); \
in2[0] = half(float16(v_in2)); \ in2[0] = half(float16(v_in2)); \
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) { ...@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
LOG(INFO) << "Test Neg on GPU!"; LOG(INFO) << "Test Neg on GPU!";
half *in, *d_in; half *in, *d_in;
int size = sizeof(half); int size = sizeof(half);
cudaMalloc((void**)&d_in, size); cudaMalloc(reinterpret_cast<void**>(&d_in), size);
in = (half*)malloc(size); in = reinterpret_cast<half*>(malloc(size));
in[0] = half(float16(v_in)); in[0] = half(float16(v_in));
cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
Neg<<<1, 1>>>(d_in); Neg<<<1, 1>>>(d_in);
cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
EXPECT_EQ(float(float16(in[0])), v_out); EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
free(in); free(in);
cudaFree(d_in); cudaFree(d_in);
} }
......
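Editor's note: the GPU test hunks replace C-style (void**) and (half*) casts with reinterpret_cast, which is behaviour-preserving but explicit and lint-clean. A minimal hedged sketch of the same pattern outside the test macros; error checking and value initialization are omitted because only the cast style matters here.

#include <cstdlib>

#include <cuda_fp16.h>
#include <cuda_runtime.h>

int main() {
  half *in, *d_in;
  int size = sizeof(half);
  // reinterpret_cast replaces the old (void**)&d_in and (half*)malloc(size).
  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
  in = reinterpret_cast<half*>(malloc(size));
  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
  free(in);
  cudaFree(d_in);
  return 0;
}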
...@@ -14,8 +14,9 @@ limitations under the License. */ ...@@ -14,8 +14,9 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "gflags/gflags.h" #include <algorithm>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
...@@ -77,8 +78,8 @@ void SetDeviceId(int id) { ...@@ -77,8 +78,8 @@ void SetDeviceId(int id) {
"cudaSetDevice failed in paddle::platform::SetDeviceId"); "cudaSetDevice failed in paddle::platform::SetDeviceId");
} }
void GpuMemoryUsage(size_t &available, size_t &total) { void GpuMemoryUsage(size_t *available, size_t *total) {
PADDLE_ENFORCE(cudaMemGetInfo(&available, &total), PADDLE_ENFORCE(cudaMemGetInfo(available, total),
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage"); "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
} }
...@@ -86,7 +87,7 @@ size_t GpuMaxAllocSize() { ...@@ -86,7 +87,7 @@ size_t GpuMaxAllocSize() {
size_t total = 0; size_t total = 0;
size_t available = 0; size_t available = 0;
GpuMemoryUsage(available, total); GpuMemoryUsage(&available, &total);
// Reserve the rest for page tables, etc. // Reserve the rest for page tables, etc.
return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use); return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
...@@ -101,7 +102,7 @@ size_t GpuMaxChunkSize() { ...@@ -101,7 +102,7 @@ size_t GpuMaxChunkSize() {
size_t total = 0; size_t total = 0;
size_t available = 0; size_t available = 0;
GpuMemoryUsage(available, total); GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/" VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M"; << total / 1024 / 1024 << "M";
size_t reserving = static_cast<size_t>(0.05 * total); size_t reserving = static_cast<size_t>(0.05 * total);
......
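Editor's note: GpuMemoryUsage now takes pointer out-parameters instead of non-const references, matching the style checks used elsewhere in the tree. A hedged sketch of the convention with a stubbed implementation (QueryMemoryUsage and its values are illustrative, not the cudaMemGetInfo-backed function above):

#include <cstddef>
#include <cstdio>

// Stand-in for the real implementation; fills the caller's variables.
void QueryMemoryUsage(size_t* available, size_t* total) {
  *available = 1ul << 30;  // illustrative values only
  *total = 4ul << 30;
}

int main() {
  size_t available = 0, total = 0;
  // Taking addresses at the call site makes the mutation visible, which is
  // why the reference parameters were replaced.
  QueryMemoryUsage(&available, &total);
  std::printf("GPU usage %zuM/%zuM\n", available >> 20, total >> 20);
  return 0;
}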
...@@ -23,10 +23,6 @@ limitations under the License. */ ...@@ -23,10 +23,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
//! Environment variable: fraction of GPU memory to use on each device.
const std::string kEnvFractionGpuMemoryToUse =
"PADDLE_FRACTION_GPU_MEMORY_TO_USE";
//! Get the total number of GPU devices in system. //! Get the total number of GPU devices in system.
int GetCUDADeviceCount(); int GetCUDADeviceCount();
...@@ -46,7 +42,7 @@ int GetCurrentDeviceId(); ...@@ -46,7 +42,7 @@ int GetCurrentDeviceId();
void SetDeviceId(int device_id); void SetDeviceId(int device_id);
//! Get the memory usage of current GPU device. //! Get the memory usage of current GPU device.
void GpuMemoryUsage(size_t &available, size_t &total); void GpuMemoryUsage(size_t *available, size_t *total);
//! Get the maximum allocation size of current GPU device. //! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize(); size_t GpuMaxAllocSize();
......
...@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h" #include "paddle/fluid/platform/variant.h"
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -2,17 +2,19 @@ if(WITH_PYTHON) ...@@ -2,17 +2,19 @@ if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
else() else()
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
parallel_executor parallel_executor
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID) endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "const_value.h" #include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
void BindConstValue(pybind11::module& m) { void BindConstValue(pybind11::module* m) {
m.def("kEmptyVarName", [] { return framework::kEmptyVarName; }); m->def("kEmptyVarName", [] { return framework::kEmptyVarName; });
m.def("kTempVarName", [] { return framework::kTempVarName; }); m->def("kTempVarName", [] { return framework::kTempVarName; });
m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
} }
} // namespace pybind } // namespace pybind
......
...@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
extern void BindConstValue(pybind11::module& m);
void BindConstValue(pybind11::module* m);
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
void BindException(pybind11::module& m) { void BindException(pybind11::module* m) {
static pybind11::exception<platform::EnforceNotMet> exc(m, "EnforceNotMet"); static pybind11::exception<platform::EnforceNotMet> exc(*m, "EnforceNotMet");
pybind11::register_exception_translator([](std::exception_ptr p) { pybind11::register_exception_translator([](std::exception_ptr p) {
try { try {
if (p) std::rethrow_exception(p); if (p) std::rethrow_exception(p);
...@@ -27,7 +27,8 @@ void BindException(pybind11::module& m) { ...@@ -27,7 +27,8 @@ void BindException(pybind11::module& m) {
} }
}); });
m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); }); m->def("__unittest_throw_exception__",
[] { PADDLE_THROW("test exception"); });
} }
} // namespace pybind } // namespace pybind
......
...@@ -11,14 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
extern void BindException(pybind11::module& m); void BindException(pybind11::module* m);
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/protobuf.h"
#include <deque> #include <deque>
#include <iostream> #include <iostream>
#include <string>
#include <tuple>
#include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
...@@ -95,10 +98,11 @@ struct type_caster<boost::variant<Args...>> ...@@ -95,10 +98,11 @@ struct type_caster<boost::variant<Args...>>
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
using namespace paddle::framework; // NOLINT namespace pd = paddle::framework;
template <typename T> template <typename T>
static py::bytes SerializeMessage(T &self) { static pybind11::bytes SerializeMessage(
T &self) { // NOLINT due to pybind11 convention.
// Check IsInitialized in Python // Check IsInitialized in Python
std::string retv; std::string retv;
PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv), PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
...@@ -107,24 +111,24 @@ static py::bytes SerializeMessage(T &self) { ...@@ -107,24 +111,24 @@ static py::bytes SerializeMessage(T &self) {
} }
// Bind Methods // Bind Methods
void BindProgramDesc(py::module &m) { void BindProgramDesc(pybind11::module *m) {
py::class_<ProgramDesc>(m, "ProgramDesc", "") pybind11::class_<pd::ProgramDesc>(*m, "ProgramDesc", "")
.def(py::init<>()) .def(pybind11::init<>())
.def("__init__", .def("__init__",
[](ProgramDesc &self, const ProgramDesc &other) { [](pd::ProgramDesc &self, const pd::ProgramDesc &other) {
new (&self) ProgramDesc(other); new (&self) pd::ProgramDesc(other);
}) })
.def("__init__", .def("__init__",
[](ProgramDesc &self, const py::bytes &binary_str) { [](pd::ProgramDesc &self, const pybind11::bytes &binary_str) {
std::string str(binary_str); std::string str(binary_str);
new (&self) ProgramDesc(str); new (&self) pd::ProgramDesc(str);
}) })
.def("append_block", &ProgramDesc::AppendBlock, .def("append_block", &pd::ProgramDesc::AppendBlock,
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("append_backward", .def("append_backward",
[](ProgramDesc &program_desc, const VarDesc &target, [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
const std::unordered_set<std::string> &no_grad_vars) { const std::unordered_set<std::string> &no_grad_vars) {
ParamGradInfoMap param_grad_map = pd::ParamGradInfoMap param_grad_map =
AppendBackward(program_desc, target, no_grad_vars); AppendBackward(program_desc, target, no_grad_vars);
std::unordered_map< std::unordered_map<
std::string, std::tuple<std::string /* grad_var_name */, std::string, std::tuple<std::string /* grad_var_name */,
...@@ -138,172 +142,184 @@ void BindProgramDesc(py::module &m) { ...@@ -138,172 +142,184 @@ void BindProgramDesc(py::module &m) {
} }
return retv; return retv;
}) })
.def("block", &ProgramDesc::MutableBlock, .def("block", &pd::ProgramDesc::MutableBlock,
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("num_blocks", &ProgramDesc::Size) .def("num_blocks", &pd::ProgramDesc::Size)
.def("serialize_to_string", SerializeMessage<ProgramDesc>) .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
.def("parse_from_string", .def("parse_from_string",
[](ProgramDesc &program_desc, const std::string &data) { [](pd::ProgramDesc &program_desc, const std::string &data) {
proto::ProgramDesc *desc = program_desc.Proto(); pd::proto::ProgramDesc *desc = program_desc.Proto();
PADDLE_ENFORCE(desc->ParseFromString(data), PADDLE_ENFORCE(desc->ParseFromString(data),
"Fail to parse ProgramDesc from string. This could " "Fail to parse ProgramDesc from string. This could "
"be a bug of Paddle."); "be a bug of Paddle.");
}); });
} }
void BindBlockDesc(py::module &m) { void BindBlockDesc(pybind11::module *m) {
py::class_<BlockDesc>(m, "BlockDesc", "") pybind11::class_<pd::BlockDesc>(*m, "BlockDesc", "")
.def_property_readonly("id", &BlockDesc::ID) .def_property_readonly("id", &pd::BlockDesc::ID)
.def_property_readonly("parent", &BlockDesc::Parent) .def_property_readonly("parent", &pd::BlockDesc::Parent)
.def("get_forward_block_idx", &BlockDesc::ForwardBlockID) .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
.def("set_forward_block_idx", &BlockDesc::SetForwardBlockID) .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
.def("append_op", &BlockDesc::AppendOp, .def("append_op", &pd::BlockDesc::AppendOp,
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("prepend_op", &BlockDesc::PrependOp, .def("prepend_op", &pd::BlockDesc::PrependOp,
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("insert_op", &BlockDesc::InsertOp, .def("insert_op", &pd::BlockDesc::InsertOp,
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("remove_op", &BlockDesc::RemoveOp) .def("remove_op", &pd::BlockDesc::RemoveOp)
.def("var", .def("var",
[](BlockDesc &self, py::bytes byte_name) { [](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.Var(name); return self.Var(name);
}, },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("has_var", .def("has_var",
[](BlockDesc &self, py::bytes byte_name) { [](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVar(name); return self.HasVar(name);
}, },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("rename_var", .def("rename_var",
[](BlockDesc &self, const py::bytes &byte_name, [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
const py::bytes &byte_name_new) { const pybind11::bytes &byte_name_new) {
std::string name = byte_name; std::string name = byte_name;
std::string new_name = byte_name_new; std::string new_name = byte_name_new;
self.RenameVar(name, new_name); self.RenameVar(name, new_name);
}) })
.def("has_var_recursive", .def("has_var_recursive",
[](BlockDesc &self, py::bytes byte_name) { [](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVarRecursive(name); return self.HasVarRecursive(name);
}) })
.def("find_var", .def("find_var",
[](BlockDesc &self, py::bytes byte_name) { [](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVar(name); return self.FindVar(name);
}, },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("find_var_recursive", .def("find_var_recursive",
[](BlockDesc &self, py::bytes byte_name) { [](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVarRecursive(name); return self.FindVarRecursive(name);
}, },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) .def("remove_var",
.def("op_size", &BlockDesc::OpSize) [](pd::BlockDesc &self, pybind11::bytes byte_name) {
.def("op", &BlockDesc::Op, py::return_value_policy::reference) std::string name = byte_name;
.def("serialize_to_string", SerializeMessage<BlockDesc>); return self.RemoveVar(name);
},
pybind11::return_value_policy::reference)
.def("all_vars", &pd::BlockDesc::AllVars,
pybind11::return_value_policy::reference)
.def("op_size", &pd::BlockDesc::OpSize)
.def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference)
.def("serialize_to_string", SerializeMessage<pd::BlockDesc>);
} }
void BindVarDsec(py::module &m) { void BindVarDsec(pybind11::module *m) {
py::class_<VarDesc> var_desc(m, "VarDesc", ""); pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
var_desc var_desc
.def("name", .def("name",
[](VarDesc &self) { [](pd::VarDesc &self) {
py::bytes name = self.Name(); pybind11::bytes name = self.Name();
return name; return name;
}, },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("set_name", &VarDesc::SetName) .def("set_name", &pd::VarDesc::SetName)
.def("set_shape", &VarDesc::SetShape) .def("set_shape", &pd::VarDesc::SetShape)
.def("set_shapes", &VarDesc::SetShapes) .def("set_shapes", &pd::VarDesc::SetShapes)
.def("set_dtype", &VarDesc::SetDataType) .def("set_dtype", &pd::VarDesc::SetDataType)
.def("set_dtypes", &VarDesc::SetDataTypes) .def("set_dtypes", &pd::VarDesc::SetDataTypes)
.def("set_capacity", &VarDesc::SetCapacity) .def("set_capacity", &pd::VarDesc::SetCapacity)
.def("shape", &VarDesc::GetShape, py::return_value_policy::reference) .def("shape", &pd::VarDesc::GetShape,
.def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) .def("shapes", &pd::VarDesc::GetShapes,
.def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLoDLevel) .def("dtype", &pd::VarDesc::GetDataType,
.def("lod_levels", &VarDesc::GetLoDLevels, pybind11::return_value_policy::reference)
py::return_value_policy::reference) .def("dtypes", &pd::VarDesc::GetDataTypes,
.def("set_lod_level", &VarDesc::SetLoDLevel) pybind11::return_value_policy::reference)
.def("set_lod_levels", &VarDesc::SetLoDLevels) .def("lod_level", &pd::VarDesc::GetLoDLevel)
.def("type", &VarDesc::GetType) .def("lod_levels", &pd::VarDesc::GetLoDLevels,
.def("set_type", &VarDesc::SetType) pybind11::return_value_policy::reference)
.def("serialize_to_string", SerializeMessage<VarDesc>) .def("set_lod_level", &pd::VarDesc::SetLoDLevel)
.def("persistable", &VarDesc::Persistable) .def("set_lod_levels", &pd::VarDesc::SetLoDLevels)
.def("set_persistable", &VarDesc::SetPersistable); .def("type", &pd::VarDesc::GetType)
.def("set_type", &pd::VarDesc::SetType)
.def("serialize_to_string", SerializeMessage<pd::VarDesc>)
.def("persistable", &pd::VarDesc::Persistable)
.def("set_persistable", &pd::VarDesc::SetPersistable);
py::enum_<proto::VarType::Type>(var_desc, "VarType", "") pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
.value("BOOL", proto::VarType::BOOL) .value("BOOL", pd::proto::VarType::BOOL)
.value("INT16", proto::VarType::INT16) .value("INT16", pd::proto::VarType::INT16)
.value("INT32", proto::VarType::INT32) .value("INT32", pd::proto::VarType::INT32)
.value("INT64", proto::VarType::INT64) .value("INT64", pd::proto::VarType::INT64)
.value("FP16", proto::VarType::FP16) .value("FP16", pd::proto::VarType::FP16)
.value("FP32", proto::VarType::FP32) .value("FP32", pd::proto::VarType::FP32)
.value("FP64", proto::VarType::FP64) .value("FP64", pd::proto::VarType::FP64)
.value("LOD_TENSOR", proto::VarType::LOD_TENSOR) .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
.value("SELECTED_ROWS", proto::VarType::SELECTED_ROWS) .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
.value("FEED_MINIBATCH", proto::VarType::FEED_MINIBATCH) .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
.value("FETCH_LIST", proto::VarType::FETCH_LIST) .value("FETCH_LIST", pd::proto::VarType::FETCH_LIST)
.value("STEP_SCOPES", proto::VarType::STEP_SCOPES) .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
.value("LOD_RANK_TABLE", proto::VarType::LOD_RANK_TABLE) .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
.value("LOD_TENSOR_ARRAY", proto::VarType::LOD_TENSOR_ARRAY) .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
.value("CHANNEL", proto::VarType::CHANNEL) .value("CHANNEL", pd::proto::VarType::CHANNEL)
.value("PLACE_LIST", proto::VarType::PLACE_LIST) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
.value("READER", proto::VarType::READER) .value("READER", pd::proto::VarType::READER)
.value("RAW", proto::VarType::RAW); .value("RAW", pd::proto::VarType::RAW);
} }
void BindOpDesc(py::module &m) { void BindOpDesc(pybind11::module *m) {
py::enum_<proto::AttrType>(m, "AttrType", "") pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
.value("INT", proto::AttrType::INT) .value("INT", pd::proto::AttrType::INT)
.value("INTS", proto::AttrType::INTS) .value("INTS", pd::proto::AttrType::INTS)
.value("FLOAT", proto::AttrType::FLOAT) .value("FLOAT", pd::proto::AttrType::FLOAT)
.value("FLOATS", proto::AttrType::FLOATS) .value("FLOATS", pd::proto::AttrType::FLOATS)
.value("STRING", proto::AttrType::STRING) .value("STRING", pd::proto::AttrType::STRING)
.value("STRINGS", proto::AttrType::STRINGS) .value("STRINGS", pd::proto::AttrType::STRINGS)
.value("BOOL", proto::AttrType::BOOLEAN) .value("BOOL", pd::proto::AttrType::BOOLEAN)
.value("BOOLS", proto::AttrType::BOOLEANS) .value("BOOLS", pd::proto::AttrType::BOOLEANS)
.value("BLOCK", proto::AttrType::BLOCK); .value("BLOCK", pd::proto::AttrType::BLOCK);
py::class_<OpDesc> op_desc(m, "OpDesc", ""); pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
op_desc op_desc
.def("__init__", [](OpDesc &self) { new (&self) OpDesc(); }, .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
py::return_value_policy::reference) pybind11::return_value_policy::reference)
.def("copy_from", &OpDesc::CopyFrom) .def("copy_from", &pd::OpDesc::CopyFrom)
.def("type", &OpDesc::Type) .def("type", &pd::OpDesc::Type)
.def("set_type", &OpDesc::SetType) .def("set_type", &pd::OpDesc::SetType)
.def("input", &OpDesc::Input) .def("input", &pd::OpDesc::Input)
.def("input_names", &OpDesc::InputNames) .def("input_names", &pd::OpDesc::InputNames)
.def("output", &OpDesc::Output) .def("output", &pd::OpDesc::Output)
.def("output_names", &OpDesc::OutputNames) .def("output_names", &pd::OpDesc::OutputNames)
.def("set_input", &OpDesc::SetInput) .def("set_input", &pd::OpDesc::SetInput)
.def("set_output", &OpDesc::SetOutput) .def("set_output", &pd::OpDesc::SetOutput)
.def("input_arg_names", &OpDesc::InputArgumentNames) .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
.def("output_arg_names", &OpDesc::OutputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
.def("rename_input", &OpDesc::RenameInput) .def("rename_input", &pd::OpDesc::RenameInput)
.def("rename_output", &OpDesc::RenameOutput) .def("rename_output", &pd::OpDesc::RenameOutput)
.def("has_attr", &OpDesc::HasAttr) .def("has_attr", &pd::OpDesc::HasAttr)
.def("attr_type", &OpDesc::GetAttrType) .def("attr_type", &pd::OpDesc::GetAttrType)
.def("attr_names", &OpDesc::AttrNames) .def("attr_names", &pd::OpDesc::AttrNames)
.def("set_attr", &OpDesc::SetAttr) .def("set_attr", &pd::OpDesc::SetAttr)
.def("attr", &OpDesc::GetAttr) .def("attr", &pd::OpDesc::GetAttr)
.def("set_block_attr", &OpDesc::SetBlockAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
.def("set_serialized_attr", .def("set_serialized_attr",
[](OpDesc &self, const std::string &name, [](pd::OpDesc &self, const std::string &name,
const py::bytes &seriralized) { const pybind11::bytes &seriralized) {
std::string ser(seriralized); std::string ser(seriralized);
self.SetAttr(name, ser); self.SetAttr(name, ser);
}) })
.def("block_attr", &OpDesc::GetBlockAttr) .def("block_attr", &pd::OpDesc::GetBlockAttr)
.def("check_attrs", &OpDesc::CheckAttrs) .def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &OpDesc::InferShape) .def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &OpDesc::InferVarType) .def("infer_var_type", &pd::OpDesc::InferVarType)
.def("serialize_to_string", SerializeMessage<OpDesc>) .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
.def("block", &OpDesc::Block, py::return_value_policy::reference); .def("block", &pd::OpDesc::Block,
pybind11::return_value_policy::reference);
} }
} // namespace pybind } // namespace pybind
......
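Note: the using-directive on paddle::framework is replaced above by the alias namespace pd = paddle::framework, so framework symbols stay qualified instead of being injected into the enclosing scope. A tiny generic sketch of the alias pattern (toy namespaces, not Paddle code):

#include <iostream>

namespace verylong { namespace nested {
struct Thing {
  int id = 0;
};
}}  // namespace verylong::nested

// The alias keeps every use qualified and adds no names to this scope,
// unlike a using-directive, which pulls in the whole namespace.
namespace vn = verylong::nested;

int main() {
  vn::Thing t;
  std::cout << t.id << "\n";
  return 0;
}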
...@@ -11,25 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,25 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <Python.h> #include <Python.h>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include "paddle/fluid/platform/variant.h" #include "paddle/fluid/platform/variant.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
void BindProgramDesc(py::module& m); void BindProgramDesc(pybind11::module* m);
void BindBlockDesc(py::module& m); void BindBlockDesc(pybind11::module* m);
void BindVarDsec(py::module& m); void BindVarDsec(pybind11::module* m);
void BindOpDesc(py::module& m); void BindOpDesc(pybind11::module* m);
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -11,11 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <map>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/protobuf.h"
#include <mutex> // for call_once
#include <unordered_map>
#include "paddle/fluid/framework/backward.h" #include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
...@@ -32,7 +38,6 @@ limitations under the License. */ ...@@ -32,7 +38,6 @@ limitations under the License. */
#include "paddle/fluid/operators/cond_op.h" #include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/const_value.h"
...@@ -69,7 +74,7 @@ PYBIND11_PLUGIN(core) { ...@@ -69,7 +74,7 @@ PYBIND11_PLUGIN(core) {
// not cause namespace pollution. // not cause namespace pollution.
using namespace paddle::framework; // NOLINT using namespace paddle::framework; // NOLINT
BindException(m); BindException(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol()) py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer( .def_buffer(
...@@ -100,6 +105,14 @@ PYBIND11_PLUGIN(core) { ...@@ -100,6 +105,14 @@ PYBIND11_PLUGIN(core) {
[](Tensor &self, paddle::platform::CUDAPlace &place) { [](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("alloc_int",
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("alloc_float",
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
.def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<float>)
.def("set", PyCPUTensorSetFromArray<int>) .def("set", PyCPUTensorSetFromArray<int>)
.def("set", PyCPUTensorSetFromArray<double>) .def("set", PyCPUTensorSetFromArray<double>)
...@@ -113,6 +126,12 @@ PYBIND11_PLUGIN(core) { ...@@ -113,6 +126,12 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDATensorSetFromArray<int64_t>) .def("set", PyCUDATensorSetFromArray<int64_t>)
.def("set", PyCUDATensorSetFromArray<bool>) .def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>) .def("set", PyCUDATensorSetFromArray<uint16_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<float>)
.def("set", PyCUDAPinnedTensorSetFromArray<int>)
.def("set", PyCUDAPinnedTensorSetFromArray<double>)
.def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<bool>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
#endif #endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>) .def("set_float_element", TensorSetElement<float>)
...@@ -317,7 +336,17 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -317,7 +336,17 @@ All parameter, weight, gradient are variables in Paddle.
#else #else
return new paddle::platform::CUDADeviceContext(place); return new paddle::platform::CUDADeviceContext(place);
#endif #endif
}); })
.def_static("create",
[](paddle::platform::CUDAPinnedPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW(
"CUDAPinnedPlace is not supported in CPU device.");
#else
return new paddle::platform::CUDAPinnedDeviceContext(place);
#endif
});
// clang-format on // clang-format on
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
...@@ -330,6 +359,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -330,6 +359,10 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<>()) .def(py::init<>())
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def(py::init<>())
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
.def(py::init<>()) .def(py::init<>())
.def("set_place", .def("set_place",
...@@ -339,6 +372,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -339,6 +372,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) { [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place; self = gpu_place;
})
.def("set_place", [](platform::Place &self,
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
}); });
py::class_<OperatorBase>(m, "Operator") py::class_<OperatorBase>(m, "Operator")
...@@ -363,6 +400,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -363,6 +400,11 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](OperatorBase &self, const Scope &scope, [](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); }) const platform::CUDAPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPinnedPlace &place) {
self.Run(scope, place);
})
.def("type", .def("type",
[](const OperatorBase &op) -> std::string { return op.Type(); }) [](const OperatorBase &op) -> std::string { return op.Type(); })
.def("outputs", .def("outputs",
...@@ -436,11 +478,11 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -436,11 +478,11 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
BindProgramDesc(m); BindProgramDesc(&m);
BindBlockDesc(m); BindBlockDesc(&m);
BindVarDsec(m); BindVarDsec(&m);
BindOpDesc(m); BindOpDesc(&m);
BindConstValue(m); BindConstValue(&m);
py::class_<framework::LoDRankTable>(m, "LodRankTable") py::class_<framework::LoDRankTable>(m, "LodRankTable")
.def("items", [](framework::LoDRankTable &table) { .def("items", [](framework::LoDRankTable &table) {
...@@ -502,16 +544,23 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -502,16 +544,23 @@ All parameter, weight, gradient are variables in Paddle.
[](ParallelExecutor &self, size_t num_threads, bool use_event, [](ParallelExecutor &self, size_t num_threads, bool use_event,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params, const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const std::unordered_set<std::string> &bcast_vars,
const ProgramDesc &main_program, const std::string &loss_var_name, const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope, bool allow_op_delay) { Scope *scope, std::vector<Scope *> &local_scopes,
new (&self) ParallelExecutor(num_threads, use_event, places, bool allow_op_delay) {
params, startup_program, main_program, new (&self)
loss_var_name, scope, allow_op_delay); ParallelExecutor(num_threads, use_event, places, params,
bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay);
}) })
.def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes();
},
py::return_value_policy::reference)
.def("run", &ParallelExecutor::Run); .def("run", &ParallelExecutor::Run);
BindRecordIOWriter(m); BindRecordIOWriter(&m);
return m.ptr(); return m.ptr();
} }
} // namespace pybind } // namespace pybind
......
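Note: the Tensor, Place, and DeviceContext bindings above register several .def entries under one Python name (set, alloc_float, create) and let pybind11 pick the overload whose C++ place argument the Python value converts to. A self-contained sketch of that dispatch pattern with invented toy types (CpuSpot, PinnedSpot, Buf), none of which is Paddle API:

#include <string>

#include "pybind11/pybind11.h"

namespace {
struct CpuSpot {};
struct PinnedSpot {};
struct Buf {
  std::string last_place;
};
}  // namespace

PYBIND11_PLUGIN(overload_demo) {
  pybind11::module m("overload_demo", "sketch of name-based overload dispatch");
  pybind11::class_<CpuSpot>(m, "CpuSpot").def(pybind11::init<>());
  pybind11::class_<PinnedSpot>(m, "PinnedSpot").def(pybind11::init<>());
  pybind11::class_<Buf>(m, "Buf")
      .def(pybind11::init<>())
      // Two bindings share one name; pybind11 tries them in registration
      // order until the Python argument converts to the C++ parameter type.
      .def("alloc_float",
           [](Buf &self, const CpuSpot &) { self.last_place = "cpu"; })
      .def("alloc_float",
           [](Buf &self, const PinnedSpot &) { self.last_place = "pinned"; })
      .def("last_place", [](const Buf &self) { return self.last_place; });
  return m.ptr();
}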
...@@ -13,13 +13,19 @@ ...@@ -13,13 +13,19 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/recordio.h"
#include <fstream> #include <fstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
namespace {
class RecordIOWriter { class RecordIOWriter {
public: public:
RecordIOWriter(const std::string& filename, recordio::Compressor compressor, RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
...@@ -49,8 +55,10 @@ class RecordIOWriter { ...@@ -49,8 +55,10 @@ class RecordIOWriter {
recordio::Writer writer_; recordio::Writer writer_;
}; };
void BindRecordIOWriter(py::module& m) { } // namespace
py::class_<RecordIOWriter> writer(m, "RecordIOWriter", "");
void BindRecordIOWriter(py::module* m) {
py::class_<RecordIOWriter> writer(*m, "RecordIOWriter", "");
py::enum_<recordio::Compressor>(writer, "Compressor", "") py::enum_<recordio::Compressor>(writer, "Compressor", "")
.value("Snappy", recordio::Compressor::kSnappy) .value("Snappy", recordio::Compressor::kSnappy)
.value("NoCompress", recordio::Compressor::kNoCompress); .value("NoCompress", recordio::Compressor::kNoCompress);
......
...@@ -21,6 +21,7 @@ namespace py = pybind11; ...@@ -21,6 +21,7 @@ namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
extern void BindRecordIOWriter(py::module& m); void BindRecordIOWriter(py::module* m);
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <Python.h>
#include <string> #include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -21,12 +24,8 @@ limitations under the License. */ ...@@ -21,12 +24,8 @@ limitations under the License. */
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
namespace details { namespace details {
template <bool less, size_t I, typename... ARGS> template <bool less, size_t I, typename... ARGS>
...@@ -34,16 +33,16 @@ struct CastToPyBufferImpl; ...@@ -34,16 +33,16 @@ struct CastToPyBufferImpl;
template <size_t I, typename... ARGS> template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> { struct CastToPyBufferImpl<false, I, ARGS...> {
py::buffer_info operator()(framework::Tensor &tensor) { pybind11::buffer_info operator()(const framework::Tensor &tensor) {
PADDLE_THROW("This type of tensor cannot be expose to Python"); PADDLE_THROW("This type of tensor cannot be expose to Python");
return py::buffer_info(); return pybind11::buffer_info();
} }
}; };
template <size_t I, typename... ARGS> template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> { struct CastToPyBufferImpl<true, I, ARGS...> {
using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type; using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
py::buffer_info operator()(framework::Tensor &tensor) { pybind11::buffer_info operator()(const framework::Tensor &tensor) {
if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) { if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
auto dim_vec = framework::vectorize(tensor.dims()); auto dim_vec = framework::vectorize(tensor.dims());
std::vector<size_t> dims_outside; std::vector<size_t> dims_outside;
...@@ -82,15 +81,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -82,15 +81,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
if (std::type_index(typeid(CUR_TYPE)) == if (std::type_index(typeid(CUR_TYPE)) ==
std::type_index(typeid(platform::float16))) { std::type_index(typeid(platform::float16))) {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), return pybind11::buffer_info(
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
"e", /* np.dtype('e') == np.float16 */ "e", /* np.dtype('e') == np.float16 */
(size_t)framework::arity(dst_tensor.dims()), (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
dims_outside, strides);
} else { } else {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE), return pybind11::buffer_info(
py::format_descriptor<CUR_TYPE>::format(), dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
(size_t)framework::arity(dst_tensor.dims()), pybind11::format_descriptor<CUR_TYPE>::format(),
dims_outside, strides); (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
} }
} else { } else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value; constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
...@@ -101,7 +100,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -101,7 +100,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
} // namespace details } // namespace details
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info = auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool, details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor); platform::float16>()(tensor);
...@@ -109,7 +108,7 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { ...@@ -109,7 +108,7 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
} }
template <typename T> template <typename T>
T TensorGetElement(framework::Tensor &self, size_t offset) { T TensorGetElement(const framework::Tensor &self, size_t offset) {
if (platform::is_cpu_place(self.place())) { if (platform::is_cpu_place(self.place())) {
return self.data<T>()[offset]; return self.data<T>()[offset];
} else { } else {
...@@ -121,64 +120,70 @@ T TensorGetElement(framework::Tensor &self, size_t offset) { ...@@ -121,64 +120,70 @@ T TensorGetElement(framework::Tensor &self, size_t offset) {
// TODO(dzhwinter) : fix the redundant Tensor allocate and free // TODO(dzhwinter) : fix the redundant Tensor allocate and free
template <typename T> template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) { void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self.place())) { if (platform::is_gpu_place(self->place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor); std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::TensorCopy(self, platform::CPUPlace(), dst.get()); framework::TensorCopy(*self, platform::CPUPlace(), dst.get());
dst->data<T>()[offset] = elem; dst->data<T>()[offset] = elem;
framework::TensorCopy(*dst.get(), self.place(), &self); framework::TensorCopy(*dst.get(), self->place(), self);
} else if (platform::is_cpu_place(self.place())) { } else if (platform::is_cpu_place(self->place())) {
self.data<T>()[offset] = elem; self->data<T>()[offset] = elem;
} }
} }
template <typename T> template <typename T>
void PyCPUTensorSetFromArray( void PyCPUTensorSetFromArray(
framework::Tensor &self, framework::Tensor *self,
py::array_t<T, py::array::c_style | py::array::forcecast> array, pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
paddle::platform::CPUPlace &place) { array,
paddle::platform::CPUPlace place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]); dims.push_back(static_cast<int>(array.shape()[i]));
} }
self.Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), sizeof(T) * array.size()); std::memcpy(dst, array.data(), sizeof(T) * array.size());
} }
template <> template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCPUTensorSetFromArray( void PyCPUTensorSetFromArray(
framework::Tensor &self, framework::Tensor *self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array, pybind11::array_t<uint16_t,
paddle::platform::CPUPlace &place) { pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CPUPlace place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]); dims.push_back(static_cast<int>(array.shape()[i]));
} }
self.Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place); auto *dst = self->mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size()); std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
framework::Tensor &self, framework::Tensor *self,
py::array_t<T, py::array::c_style | py::array::forcecast> array, pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
paddle::platform::CUDAPlace &place) { array,
paddle::platform::CUDAPlace place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]); dims.push_back(static_cast<int>(array.shape()[i]));
} }
self.Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self->mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = auto dev_ctx =
...@@ -188,18 +193,22 @@ void PyCUDATensorSetFromArray( ...@@ -188,18 +193,22 @@ void PyCUDATensorSetFromArray(
} }
template <> template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
framework::Tensor &self, framework::Tensor *self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array, pybind11::array_t<uint16_t,
paddle::platform::CUDAPlace &place) { pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CUDAPlace place) {
std::vector<int64_t> dims; std::vector<int64_t> dims;
dims.reserve(array.ndim()); dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) { for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]); dims.push_back(static_cast<int>(array.shape()[i]));
} }
self.Resize(framework::make_ddim(dims)); self->Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place); auto *dst = self->mutable_data<platform::float16>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = auto dev_ctx =
...@@ -208,6 +217,43 @@ void PyCUDATensorSetFromArray( ...@@ -208,6 +217,43 @@ void PyCUDATensorSetFromArray(
sizeof(uint16_t) * array.size(), sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
} }
template <typename T>
void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
array,
const paddle::platform::CUDAPinnedPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back(static_cast<int>(array.shape()[i]));
}
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
template <>
// The following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
array,
const paddle::platform::CUDAPinnedPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back(static_cast<int>(array.shape()[i]));
}
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
#endif #endif
} // namespace pybind } // namespace pybind
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/tensor_py.h"
#include <iostream>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
TEST(TensorPy, CastToPyBufferImpl) {
typedef int ElemType;
paddle::framework::Tensor t;
auto d = paddle::framework::make_ddim({1, 2, 3});
int* p = t.mutable_data<ElemType>(d, paddle::platform::CPUPlace());
for (int i = 0; i < paddle::framework::product(d); ++i) {
p[i] = i;
}
pybind11::buffer_info bi = paddle::pybind::CastToPyBuffer(t);
EXPECT_EQ(bi.itemsize, static_cast<size_t>(sizeof(ElemType)));
EXPECT_EQ(bi.size, static_cast<size_t>(paddle::framework::product(d)));
EXPECT_EQ(bi.ndim, static_cast<size_t>(3)); // 3-dimensional as d.
EXPECT_EQ(bi.shape.size(), 3U); // as Dim d.
EXPECT_EQ(bi.shape[0], static_cast<size_t>(1));
EXPECT_EQ(bi.shape[1], static_cast<size_t>(2));
EXPECT_EQ(bi.shape[2], static_cast<size_t>(3));
EXPECT_EQ(bi.strides.size(), 3U); // 3-dimensional as d.
EXPECT_EQ(bi.strides[2], static_cast<size_t>(sizeof(ElemType)));
EXPECT_EQ(bi.strides[1], static_cast<size_t>(sizeof(ElemType) * 3));
EXPECT_EQ(bi.strides[0], static_cast<size_t>(sizeof(ElemType) * 2 * 3));
}
...@@ -14,11 +14,13 @@ ...@@ -14,11 +14,13 @@
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
#include <algorithm>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "snappystream.hpp" #include "snappy_stream/include/snappystream.hpp"
#include "zlib.h" #include "zlib/include/zlib.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
...@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) { ...@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
* Copy stream in to another stream * Copy stream in to another stream
*/ */
static void PipeStream(std::istream& in, std::ostream& os) { static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf( ReadStreamByBuf(in, 0,
in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); }); [&os](const char* buf, size_t len) { os.write(buf, len); });
} }
/** /**
...@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) { ...@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) {
static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) { static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0)); uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) { ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = static_cast<uint32_t>(crc32( crc = static_cast<uint32_t>(crc32(crc, reinterpret_cast<const Bytef*>(buf),
crc, reinterpret_cast<const Bytef*>(buf), static_cast<uInt>(len))); static_cast<uInt>(len)));
}); });
return crc; return crc;
} }
......
...@@ -24,7 +24,7 @@ namespace recordio { ...@@ -24,7 +24,7 @@ namespace recordio {
// A Chunk contains the Header and optionally compressed records. // A Chunk contains the Header and optionally compressed records.
class Chunk { class Chunk {
public: public:
Chunk() : num_bytes_(0) {} Chunk() : num_bytes_(0) {}
void Add(const std::string& buf) { void Add(const std::string& buf) {
num_bytes_ += buf.size(); num_bytes_ += buf.size();
...@@ -46,7 +46,7 @@ public: ...@@ -46,7 +46,7 @@ public:
bool Empty() const { return records_.empty(); } bool Empty() const { return records_.empty(); }
private: private:
std::vector<std::string> records_; std::vector<std::string> records_;
// sum of record lengths in bytes. // sum of record lengths in bytes.
size_t num_bytes_; size_t num_bytes_;
......
...@@ -18,29 +18,27 @@ ...@@ -18,29 +18,27 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Chunk, SaveLoad) { TEST(Chunk, SaveLoad) {
Chunk ch; paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6)); ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
std::stringstream ss; std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress); ch.Write(ss, paddle::recordio::Compressor::kNoCompress);
ss.seekg(0); ss.seekg(0);
ch.Parse(ss); ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U); ASSERT_EQ(ch.NumBytes(), 10U);
} }
TEST(Chunk, Compressor) { TEST(Chunk, Compressor) {
Chunk ch; paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6)); ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
std::stringstream ss; std::stringstream ss;
ch.Write(ss, Compressor::kSnappy); ch.Write(ss, paddle::recordio::Compressor::kSnappy);
std::stringstream ss2; std::stringstream ss2;
ch.Write(ss2, Compressor::kNoCompress); ch.Write(ss2, paddle::recordio::Compressor::kNoCompress);
ASSERT_LE(ss.tellp(), ss2.tellp()); // Compressed output should contain less data. ASSERT_LE(ss.tellp(), ss2.tellp()); // Compressed output should contain less data.
ch.Clear(); ch.Clear();
......
...@@ -37,7 +37,7 @@ enum class Compressor : uint32_t { ...@@ -37,7 +37,7 @@ enum class Compressor : uint32_t {
// Header is the metadata of Chunk // Header is the metadata of Chunk
class Header { class Header {
public: public:
Header(); Header();
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs); Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
...@@ -51,7 +51,7 @@ public: ...@@ -51,7 +51,7 @@ public:
Compressor CompressType() const { return compressor_; } Compressor CompressType() const { return compressor_; }
uint32_t CompressSize() const { return compress_size_; } uint32_t CompressSize() const { return compress_size_; }
private: private:
uint32_t num_records_; uint32_t num_records_;
uint32_t checksum_; uint32_t checksum_;
Compressor compressor_; Compressor compressor_;
......
...@@ -18,14 +18,12 @@ ...@@ -18,14 +18,12 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Recordio, ChunkHead) { TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3); paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3);
std::stringstream ss; std::stringstream ss;
hdr.Write(ss); hdr.Write(ss);
ss.seekg(0, std::ios::beg); ss.seekg(0, std::ios::beg);
Header hdr2; paddle::recordio::Header hdr2;
hdr2.Parse(ss); hdr2.Parse(ss);
EXPECT_TRUE(hdr == hdr2); EXPECT_TRUE(hdr == hdr2);
} }
...@@ -13,10 +13,14 @@ ...@@ -13,10 +13,14 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include <string>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
Scanner::Scanner(std::unique_ptr<std::istream> &&stream) Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
: stream_(std::move(stream)) { : stream_(std::move(stream)) {
Reset(); Reset();
......
...@@ -16,12 +16,15 @@ ...@@ -16,12 +16,15 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <string>
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
class Scanner { class Scanner {
public: public:
explicit Scanner(std::unique_ptr<std::istream>&& stream); explicit Scanner(std::unique_ptr<std::istream>&& stream);
explicit Scanner(const std::string& filename); explicit Scanner(const std::string& filename);
...@@ -32,7 +35,7 @@ public: ...@@ -32,7 +35,7 @@ public:
bool HasNext() const; bool HasNext() const;
private: private:
std::unique_ptr<std::istream> stream_; std::unique_ptr<std::istream> stream_;
Chunk cur_chunk_; Chunk cur_chunk_;
size_t offset_; size_t offset_;
......
...@@ -12,9 +12,14 @@ ...@@ -12,9 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
#include <string>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
void Writer::Write(const std::string& record) { void Writer::Write(const std::string& record) {
cur_chunk_.Add(record); cur_chunk_.Add(record);
if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) { if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
......
...@@ -11,16 +11,17 @@ ...@@ -11,16 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string>
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
class Writer { class Writer {
public: public:
Writer(std::ostream* sout, Writer(std::ostream* sout, Compressor compressor,
Compressor compressor,
size_t max_num_records_in_chunk = 1000) size_t max_num_records_in_chunk = 1000)
: stream_(*sout), : stream_(*sout),
max_num_records_in_chunk_(max_num_records_in_chunk), max_num_records_in_chunk_(max_num_records_in_chunk),
...@@ -32,7 +33,7 @@ public: ...@@ -32,7 +33,7 @@ public:
~Writer(); ~Writer();
private: private:
std::ostream& stream_; std::ostream& stream_;
size_t max_num_records_in_chunk_; size_t max_num_records_in_chunk_;
Chunk cur_chunk_; Chunk cur_chunk_;
......
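Note: a short usage sketch for the Writer interface shown above, limited to members visible in these hunks (the three-argument constructor, Write, Compressor::kNoCompress). It assumes the Paddle include path and that a chunk reaching max_num_records_in_chunk is emitted by Write, as the condition at the top of Writer::Write suggests:

#include <sstream>
#include <string>

#include "paddle/fluid/recordio/writer.h"

int main() {
  std::stringstream ss;
  // With max_num_records_in_chunk = 2, the second Write() reaches the
  // NumRecords() >= max condition seen in Writer::Write, so the chunk is
  // assumed to be written into the stream at that point.
  paddle::recordio::Writer writer(
      &ss, paddle::recordio::Compressor::kNoCompress,
      /*max_num_records_in_chunk=*/2);
  writer.Write("alpha");
  writer.Write("beta");
  return ss.str().empty() ? 1 : 0;
}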
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "gtest/gtest.h"
#include <sstream> #include <sstream>
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
......
../framework/.clang-format
\ No newline at end of file
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "piece.h" #include "paddle/fluid/string/piece.h"
#include <string.h> #include <string.h>
......
...@@ -71,6 +71,8 @@ ...@@ -71,6 +71,8 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <string>
#include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat #include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat
namespace paddle { namespace paddle {
......
...@@ -11,7 +11,8 @@ ...@@ -11,7 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "printf.h"
#include "paddle/fluid/string/printf.h"
#include <string> #include <string>
...@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) { ...@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) {
std::string weekday = "Wednesday"; std::string weekday = "Wednesday";
const char* month = "July"; const char* month = "July";
size_t day = 27; size_t day = 27;
long hour = 14; int hour = 14;
int min = 44; int min = 44;
EXPECT_EQ(std::string("Wednesday, July 27, 14:44"), EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day, paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "to_string.h" #include "paddle/fluid/string/to_string.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
constexpr char kOutputString[] = "User Defined Output"; constexpr char kOutputString[] = "User Defined Output";
...@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { ...@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
} }
TEST(to_string, normal) { TEST(to_string, normal) {
using namespace paddle::string; using paddle::string::to_string;
ASSERT_EQ("10", to_string(10)); ASSERT_EQ("10", to_string(10));
ASSERT_EQ("abc", to_string("abc")); ASSERT_EQ("abc", to_string("abc"));
ASSERT_EQ("1.2", to_string(1.2)); ASSERT_EQ("1.2", to_string(1.2));
} }
TEST(to_string, user_defined) { TEST(to_string, user_defined) {
using namespace paddle::string;
UserDefinedClass instance; UserDefinedClass instance;
ASSERT_EQ(kOutputString, to_string(instance)); ASSERT_EQ(kOutputString, paddle::string::to_string(instance));
} }
...@@ -14,6 +14,11 @@ function(gserver_test TARGET) ...@@ -14,6 +14,11 @@ function(gserver_test TARGET)
COMMAND ${TARGET}) COMMAND ${TARGET})
endfunction() endfunction()
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
gserver_test(test_LayerGrad) gserver_test(test_LayerGrad)
gserver_test(test_CRFLayerGrad) gserver_test(test_CRFLayerGrad)
gserver_test(test_CrossEntropyOverBeamGrad) gserver_test(test_CrossEntropyOverBeamGrad)
...@@ -31,12 +36,12 @@ gserver_test(test_Upsample) ...@@ -31,12 +36,12 @@ gserver_test(test_Upsample)
set(PYTHON_PATH set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests) ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/gserver/tests)
function(gserver_test_with_python TARGET) function(gserver_test_with_python TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp) add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET} add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction() endfunction()
gserver_test_with_python(test_PyDataProvider2) gserver_test_with_python(test_PyDataProvider2)
...@@ -57,7 +62,7 @@ if(WITH_MKLDNN) ...@@ -57,7 +62,7 @@ if(WITH_MKLDNN)
LayerGradUtil.cpp) LayerGradUtil.cpp)
add_test(NAME test_MKLDNN add_test(NAME test_MKLDNN
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif() endif()
############### test_WarpCTCLayer ####################### ############### test_WarpCTCLayer #######################
...@@ -66,7 +71,7 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) ...@@ -66,7 +71,7 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
test_WarpCTCLayer.cpp) test_WarpCTCLayer.cpp)
add_test(NAME test_WarpCTCLayer add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
...@@ -84,15 +89,15 @@ if(NOT MOBILE_INFERENCE) ...@@ -84,15 +89,15 @@ if(NOT MOBILE_INFERENCE)
endif() endif()
add_test(NAME test_NetworkCompare add_test(NAME test_NetworkCompare
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
############ test_CompareSparse ################ ############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp) test_CompareSparse.cpp)
if(NOT ON_TRAVIS) if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse add_test(NAME test_CompareSparse
COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6 COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif() endif()
endif() endif()
...@@ -20,10 +20,8 @@ limitations under the License. */ ...@@ -20,10 +20,8 @@ limitations under the License. */
#include "paddle/math/MathUtils.h" #include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h" #include "paddle/testing/TestUtil.h"
using namespace paddle; void setPoolConfig(paddle::TestConfig* config,
paddle::PoolConfig* pool,
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
const string& poolType) { const string& poolType) {
(*config).biasSize = 0; (*config).biasSize = 0;
(*config).layerConfig.set_type("pool"); (*config).layerConfig.set_type("pool");
...@@ -42,21 +40,23 @@ void setPoolConfig(TestConfig* config, ...@@ -42,21 +40,23 @@ void setPoolConfig(TestConfig* config,
pool->set_stride(sw); pool->set_stride(sw);
pool->set_stride_y(sh); pool->set_stride_y(sh);
int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); int ow =
int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh =
paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow); pool->set_output_x(ow);
pool->set_output_y(oh); pool->set_output_y(oh);
} }
LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
const string& poolType, const string& poolType,
bool use_gpu, bool use_gpu,
real* tempGradData) { real* tempGradData) {
/* prepare maxPoolWithMaskLayer */ /* prepare maxPoolWithMaskLayer */
TestConfig config; paddle::TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 128, 0}); config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
LayerInputConfig* input = config.layerConfig.add_inputs(); paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf(); paddle::PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(8); pool->set_img_size(8);
pool->set_img_size_y(8); pool->set_img_size_y(8);
...@@ -66,9 +66,9 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -66,9 +66,9 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
config.layerConfig.set_name("MaxPoolWithMask"); config.layerConfig.set_name("MaxPoolWithMask");
std::vector<DataLayerPtr> dataLayers; std::vector<paddle::DataLayerPtr> dataLayers;
LayerMap layerMap; paddle::LayerMap layerMap;
vector<Argument> datas; vector<paddle::Argument> datas;
initDataLayer(config, initDataLayer(config,
&dataLayers, &dataLayers,
...@@ -82,20 +82,20 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -82,20 +82,20 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
dataLayers[0]->getOutputValue()->copyFrom(*inputMat); dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu; FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters; std::vector<paddle::ParameterPtr> parameters;
LayerPtr maxPoolingWithMaskOutputLayer; paddle::LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer); initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(PASS_GC); maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
/* prepare the upsample layer */ /* prepare the upsample layer */
LayerConfig upsampleLayerConfig; paddle::LayerConfig upsampleLayerConfig;
upsampleLayerConfig.set_type("upsample"); upsampleLayerConfig.set_type("upsample");
LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
upsampleLayerConfig.add_inputs(); upsampleLayerConfig.add_inputs();
UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
upsampleConfig->set_scale(2); upsampleConfig->set_scale(2);
ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
imageConfig->set_channels(2); imageConfig->set_channels(2);
imageConfig->set_img_size(4); imageConfig->set_img_size(4);
imageConfig->set_img_size_y(4); imageConfig->set_img_size_y(4);
...@@ -103,17 +103,18 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -103,17 +103,18 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
upsampleLayerConfig.set_name("upsample"); upsampleLayerConfig.set_name("upsample");
for (size_t i = 0; i < 2; i++) { for (size_t i = 0; i < 2; i++) {
LayerInputConfig& inputTemp = *(upsampleLayerConfig.mutable_inputs(i)); paddle::LayerInputConfig& inputTemp =
*(upsampleLayerConfig.mutable_inputs(i));
inputTemp.set_input_layer_name("MaxPoolWithMask"); inputTemp.set_input_layer_name("MaxPoolWithMask");
} }
LayerPtr upsampleLayer; paddle::LayerPtr upsampleLayer;
ParameterMap parameterMap; paddle::ParameterMap parameterMap;
upsampleLayer = Layer::create(upsampleLayerConfig); upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
layerMap[upsampleLayerConfig.name()] = upsampleLayer; layerMap[upsampleLayerConfig.name()] = upsampleLayer;
upsampleLayer->init(layerMap, parameterMap); upsampleLayer->init(layerMap, parameterMap);
upsampleLayer->setNeedGradient(true); upsampleLayer->setNeedGradient(true);
upsampleLayer->forward(PASS_GC); upsampleLayer->forward(paddle::PASS_GC);
upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
upsampleLayer->backward(); upsampleLayer->backward();
...@@ -122,31 +123,31 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -122,31 +123,31 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false; bool useGpu = false;
MatrixPtr inputMat; paddle::MatrixPtr inputMat;
MatrixPtr inputGPUMat; paddle::MatrixPtr inputGPUMat;
MatrixPtr tempGradMat; paddle::MatrixPtr tempGradMat;
inputMat = Matrix::create(1, 128, false, useGpu); inputMat = paddle::Matrix::create(1, 128, false, useGpu);
inputMat->randomizeUniform(); inputMat->randomizeUniform();
tempGradMat = Matrix::create(1, 128, false, useGpu); tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
tempGradMat->randomizeUniform(); tempGradMat->randomizeUniform();
real* data = inputMat->getData();
real* tempGradData = tempGradMat->getData(); real* tempGradData = tempGradMat->getData();
LayerPtr upsampleLayerCPU = paddle::LayerPtr upsampleLayerCPU =
doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
useGpu = true; useGpu = true;
inputGPUMat = Matrix::create(1, 128, false, useGpu); real* data = inputMat->getData();
inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
inputGPUMat->copyFrom(data, 128); inputGPUMat->copyFrom(data, 128);
LayerPtr upsampleLayerGPU = doOneUpsampleTest( paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
checkMatrixEqual(upsampleLayerCPU->getOutput("").value, paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
upsampleLayerGPU->getOutput("").value); upsampleLayerGPU->getOutput("").value);
checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
upsampleLayerGPU->getPrev(0)->getOutputGrad()); upsampleLayerGPU->getPrev(0)->getOutputGrad());
#endif #endif
} }
...@@ -6,6 +6,6 @@ if(WITH_TESTING) ...@@ -6,6 +6,6 @@ if(WITH_TESTING)
add_library(paddle_test_util STATIC TestUtil.cpp) add_library(paddle_test_util STATIC TestUtil.cpp)
add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
endif() endif()
endif() endif()
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
set(PYTHON_PATH set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests) ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/trainer/tests)
function(trainer_test TARGET) function(trainer_test TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp) add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET} add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction() endfunction()
trainer_test(test_Compare) trainer_test(test_Compare)
...@@ -22,11 +27,11 @@ if(WITH_PYTHON) ...@@ -22,11 +27,11 @@ if(WITH_PYTHON)
add_test(NAME test_TrainerOnePass add_test(NAME test_TrainerOnePass
COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif() endif()
#################### test_config_parser ######################### #################### test_config_parser #########################
add_test(NAME test_config_parser add_test(NAME test_config_parser
COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE}
${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp) file(GLOB UTIL_SOURCES . *.cpp)
create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c) ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
set(UTIL_RES ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c) set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
if(APPLE) if(APPLE)
file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
......
...@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames}) ...@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames})
get_filename_component(ABS_FIL ${filename} ABSOLUTE) get_filename_component(ABS_FIL ${filename} ABSOLUTE)
get_filename_component(FIL_WE ${filename} NAME_WE) get_filename_component(FIL_WE ${filename} NAME_WE)
set(CUR_PROTO_GEN_PY set(CUR_PROTO_GEN_PY
${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py) ${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
set(PROTO_GEN_PY set(PROTO_GEN_PY
${CUR_PROTO_GEN_PY} ${CUR_PROTO_GEN_PY}
${PROTO_GEN_PY}) ${PROTO_GEN_PY})
add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY} add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto" ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
"-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc) DEPENDS ${ABS_FIL} protoc)
endforeach() endforeach()
......
...@@ -47,14 +47,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ...@@ -47,14 +47,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
DEPENDS paddle_pybind) DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so) add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc COMMAND touch stub.cc
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
......
...@@ -31,7 +31,7 @@ import regularizer ...@@ -31,7 +31,7 @@ import regularizer
import average import average
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
...@@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ ...@@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'LoDTensor', 'LoDTensor',
'CPUPlace', 'CPUPlace',
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -16,6 +16,7 @@ import sys ...@@ -16,6 +16,7 @@ import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
from google.protobuf import text_format
_vartype2str_ = [ _vartype2str_ = [
"UNK", "UNK",
...@@ -100,7 +101,7 @@ def repr_var(vardesc): ...@@ -100,7 +101,7 @@ def repr_var(vardesc):
def pprint_program_codes(program_desc): def pprint_program_codes(program_desc):
reprs = [] reprs = []
for block_idx in range(program_desc.num_blocks()): for block_idx in range(program_desc.desc.num_blocks()):
block_desc = program_desc.block(block_idx) block_desc = program_desc.block(block_idx)
block_repr = pprint_block_codes(block_desc) block_repr = pprint_block_codes(block_desc)
reprs.append(block_repr) reprs.append(block_repr)
...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False): ...@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False):
if type(block_desc) is not framework_pb2.BlockDesc: if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString( block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string()) block_desc.desc.serialize_to_string())
var_reprs = [] var_reprs = []
op_reprs = [] op_reprs = []
for var in block_desc.vars: for var in block_desc.vars:
...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
# draw parameters and args # draw parameters and args
vars = {} vars = {}
for var in desc.vars: for var in desc.vars:
shape = [str(i) for i in var.lod_tensor.tensor.dims] # TODO(gongwb): format the var.type
if not shape:
shape = ['null']
# create var # create var
if var.persistable: if var.persistable:
varn = graph.add_param( varn = graph.add_param(
var.name, var.type, shape, highlight=need_highlight(var.name)) var.name,
str(var.type).replace("\n", "<br />", 1),
highlight=need_highlight(var.name))
else: else:
varn = graph.add_arg(var.name, highlight=need_highlight(var.name)) varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
vars[var.name] = varn vars[var.name] = varn
...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): ...@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
for var in op.outputs: for var in op.outputs:
add_op_link_var(opn, var, True) add_op_link_var(opn, var, True)
graph(path, show=True) graph(path, show=False)
...@@ -17,7 +17,7 @@ import framework ...@@ -17,7 +17,7 @@ import framework
from framework import Program, default_main_program, default_startup_program, Parameter, Variable from framework import Program, default_main_program, default_startup_program, Parameter, Variable
import optimizer import optimizer
from layer_helper import LayerHelper from layer_helper import LayerHelper
from distributed_spliter import * import distributed_splitter as splitter
import math import math
from . import core from . import core
import debuger import debuger
...@@ -138,7 +138,7 @@ class DistributeTranspiler: ...@@ -138,7 +138,7 @@ class DistributeTranspiler:
program=None, program=None,
pservers="127.0.0.1:6174", pservers="127.0.0.1:6174",
trainers=1, trainers=1,
split_method=round_robin): split_method=splitter.round_robin):
""" """
Transpile the program to distributed data-parallelism programs. Transpile the program to distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server The main_program will be transformed to use a remote parameter server
...@@ -408,11 +408,7 @@ class DistributeTranspiler: ...@@ -408,11 +408,7 @@ class DistributeTranspiler:
pserver_vars = pserver_program.global_block().vars pserver_vars = pserver_program.global_block().vars
created_var_map = dict() created_var_map = dict()
for _, var in pserver_vars.iteritems(): for _, var in pserver_vars.iteritems():
tmpvar = s_prog.global_block().create_var( tmpvar = s_prog.global_block().clone_variable(var)
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
created_var_map[var.name] = tmpvar created_var_map[var.name] = tmpvar
# 2. rename op outputs # 2. rename op outputs
...@@ -708,11 +704,7 @@ class DistributeTranspiler: ...@@ -708,11 +704,7 @@ class DistributeTranspiler:
varlist = [varlist] varlist = [varlist]
for var in varlist: for var in varlist:
program.global_block().create_var( program.global_block().clone_variable(var)
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
......
...@@ -17,8 +17,10 @@ def hash_name(varlist, pserver_endpoints): ...@@ -17,8 +17,10 @@ def hash_name(varlist, pserver_endpoints):
""" """
hash variable names to several endpoints. hash variable names to several endpoints.
:param varlist: a list of Variables Args:
:return: a map of pserver endpoint -> varname varlist(list): a list of Variables
Returns(dict): a map of pserver endpoint -> varname
""" """
def _hash_block(block_str, total): def _hash_block(block_str, total):
...@@ -34,9 +36,14 @@ def hash_name(varlist, pserver_endpoints): ...@@ -34,9 +36,14 @@ def hash_name(varlist, pserver_endpoints):
def round_robin(varlist, pserver_endpoints): def round_robin(varlist, pserver_endpoints):
""" """
distribute variables to several endpoints. Distribute variables to several endpoints.
Args:
varlist(list): a list of variables
pserver_endpoints(list): a list of pserver endpoints
    Returns(list[str]): the endpoint for each variable
""" """
assert (len(varlist) > len(pserver_endpoints)) assert (len(varlist) >= len(pserver_endpoints))
eplist = [] eplist = []
pserver_idx = 0 pserver_idx = 0
......
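For illustration only, a minimal standalone sketch of the round-robin policy documented above (hypothetical helper name, not the library code); it shows why the relaxed precondition len(varlist) >= len(pserver_endpoints) is the right assertion:

    # Hypothetical standalone sketch of round-robin placement (names assumed).
    def round_robin_sketch(varlist, pserver_endpoints):
        assert len(varlist) >= len(pserver_endpoints)
        eplist = []
        pserver_idx = 0
        for var in varlist:
            eplist.append(pserver_endpoints[pserver_idx])
            pserver_idx = (pserver_idx + 1) % len(pserver_endpoints)
        return eplist

    # Three variables over two endpoints map to [ep0, ep1, ep0],
    # so every endpoint receives at least one variable.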
...@@ -640,6 +640,20 @@ class Operator(object): ...@@ -640,6 +640,20 @@ class Operator(object):
""" """
return self.desc.block_attr(name) return self.desc.block_attr(name)
def all_attrs(self):
"""
Get the attribute dict
Returns(dict): The Operator's attribute dict
"""
attr_names = self.attr_names
attr_map = {}
for n in attr_names:
if n == 'sub_block':
attr_map[n] = self.block_attr(n)
else:
attr_map[n] = self.attr(n)
return attr_map
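A small, hedged sketch of how the new all_attrs() accessor can be consumed, for example when inspecting an operator; the helper below is hypothetical and assumes only the Operator API shown in this hunk:

    # Hypothetical debugging helper; op is a fluid Operator.
    def dump_op_attrs(op):
        for name, value in op.all_attrs().items():
            # sub_block attrs come back via block_attr(), everything else via attr().
            print("%s.%s = %s" % (op.type, name, value))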
class Block(object): class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
...@@ -838,7 +852,7 @@ class Block(object): ...@@ -838,7 +852,7 @@ class Block(object):
def sync_with_cpp(self): def sync_with_cpp(self):
""" """
Sync with the desc on the c++ end. Sync from the desc on the c++ end.
This method is used to synchronize the c++ desc instance generated by backward. This method is used to synchronize the c++ desc instance generated by backward.
""" """
...@@ -946,13 +960,27 @@ class Block(object): ...@@ -946,13 +960,27 @@ class Block(object):
The new variable cloned from 'var' in current block. The new variable cloned from 'var' in current block.
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
return self.create_var( ret_var = None
# make STEP_SCOPES var can be safely cloned.
if var.type == core.VarDesc.VarType.STEP_SCOPES:
ret_var = self.create_var(
name=var.name, persistable=var.persistable, type=var.type)
elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
ret_var = self.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
persistable=True)
else:
ret_var = self.create_var(
name=var.name, name=var.name,
shape=var.shape, shape=var.shape,
dtype=var.dtype, dtype=var.dtype,
type=var.type, type=var.type,
lod_level=var.lod_level, lod_level=var.lod_level,
persistable=True) persistable=True)
return ret_var
class Program(object): class Program(object):
......
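For context, a minimal usage sketch (program objects assumed) of the transpiler pattern this change supports: every pserver-side variable is cloned with one call, and clone_variable now picks the right creation path for STEP_SCOPES and SELECTED_ROWS variables as well as ordinary LoD tensors:

    # Hypothetical illustration; pserver_program and s_prog are fluid Programs.
    for _, var in pserver_program.global_block().vars.iteritems():
        # One call covers LOD_TENSOR, SELECTED_ROWS and STEP_SCOPES variables.
        s_prog.global_block().clone_variable(var)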
...@@ -83,7 +83,7 @@ class Graph(object): ...@@ -83,7 +83,7 @@ class Graph(object):
file = open(dot_path, 'w') file = open(dot_path, 'w')
file.write(self.__str__()) file.write(self.__str__())
image_path = os.path.join( image_path = os.path.join(
os.path.dirname(__file__), dot_path[:-3] + "pdf") os.path.dirname(dot_path), dot_path[:-3] + "pdf")
cmd = ["dot", "-Tpdf", dot_path, "-o", image_path] cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
subprocess.Popen( subprocess.Popen(
cmd, cmd,
...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object): ...@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object):
else: else:
self.graph.show(path) self.graph.show(path)
def add_param(self, name, data_type, shape, highlight=False): def add_param(self, name, data_type, highlight=False):
label = '\n'.join([ label = '\n'.join([
'<<table cellpadding="5">', '<<table cellpadding="5">',
' <tr>', ' <tr>',
...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object): ...@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object):
str(data_type), str(data_type),
' </td>' ' </td>'
' </tr>', ' </tr>',
' <tr>',
' <td>',
'[%s]' % 'x'.join(shape),
' </td>'
' </tr>',
'</table>>', '</table>>',
]) ])
return self.graph.node( return self.graph.node(
......
...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var): ...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var):
new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes()) new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True new_var.persistable = True
return monkey_patch_reader_methods(new_var) return new_var
def _copy_reader_create_op_(block, op):
input_param_names = op.input_names
new_input_map = {}
for param_name in input_param_names:
new_input_map[param_name] = []
arg_names = op.input(param_name)
for arg_name in arg_names:
new_input_map[param_name].append(block.var(arg_name))
output_param_names = op.output_names
new_output_map = {}
for param_name in output_param_names:
new_output_map[param_name] = []
arg_names = op.output(param_name)
for arg_name in arg_names:
new_output_map[param_name].append(block.var(arg_name))
new_op = block.append_op(
type=op.type,
inputs=new_input_map,
outputs=new_output_map,
attrs=op.all_attrs())
return new_op
def open_recordio_file(filename, shapes, lod_levels, dtypes): def open_recordio_file(filename, shapes, lod_levels, dtypes):
...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def open_files(filenames, thread_num, shapes, lod_levels, dtypes): def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
...@@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): ...@@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def __create_decorated_reader__(op_type, reader, attrs): def __create_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op( startop_op = startup_blk.append_op(
type=op_type, type=op_type,
inputs={'UnderlyingReader': reader}, inputs={'UnderlyingReader': reader},
outputs={'Out': [startup_var]}, outputs={'Out': [startup_var]},
attrs=attrs) attrs=attrs)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_block = default_main_program().current_block()
startup_var) main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
_copy_reader_create_op_(main_prog_block, startop_op)
return monkey_patch_reader_methods(main_prog_var)
def create_shuffle_reader(reader, buffer_size): def create_shuffle_reader(reader, buffer_size):
......
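A short usage sketch of the reader API touched above (the file name and shapes are assumptions); after this change the main program carries both the copied reader variable and its create op, so read_file works on the returned reader:

    # Hypothetical example; './mnist.recordio' is an assumed file.
    import paddle.fluid as fluid

    reader = fluid.layers.open_recordio_file(
        filename='./mnist.recordio',
        shapes=[[-1, 784], [-1, 1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    img, label = fluid.layers.read_file(reader)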
...@@ -22,51 +22,123 @@ __all__ = ['ParallelExecutor'] ...@@ -22,51 +22,123 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object): class ParallelExecutor(object):
def __init__(self, def __init__(self,
loss_name,
use_cuda, use_cuda,
loss_name=None,
main_program=None,
num_threads=None, num_threads=None,
allow_op_delay=False): allow_op_delay=False,
places = [] share_vars_from=None):
"""
        ParallelExecutor can run a program in parallel.
Args:
use_cuda(bool): Whether to use CUDA or not.
            loss_name(str, default None): The loss name, which must be set during training.
            main_program(Program, default None): The program that needs to run,
if not provided, then default_main_program will be used.
num_threads(int, default None): How many threads are used for
training.
allow_op_delay(bool, default False): Whether to delay and buffer
some operators together for scheduling or not, which may
                improve performance in some cases, default False.
            share_vars_from(ParallelExecutor, default None): If provided,
it will share variables from the specified ParallelExecutor.
Returns:
A ParallelExecutor object.
Raises:
            TypeError: If share_vars_from is provided but is not a ParallelExecutor
object.
Examples:
.. code-block:: python
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
"""
self._places = []
self._act_places = []
if use_cuda: if use_cuda:
for i in xrange(core.get_cuda_device_count()): for i in xrange(core.get_cuda_device_count()):
p = core.Place() p = core.Place()
p.set_place(core.CUDAPlace(i)) self._act_places.append(core.CUDAPlace(i))
places.append(p) p.set_place(self._act_places[-1])
self._places.append(p)
else: else:
for i in xrange(multiprocessing.cpu_count()): for i in xrange(multiprocessing.cpu_count()):
p = core.Place() p = core.Place()
p.set_place(core.CPUPlace()) self._act_places.append(core.CPUPlace(i))
places.append(p) p.set_place(self._act_places[-1])
self._places.append(p)
assert self._places, "no place for execution"
if num_threads is None: if num_threads is None:
if use_cuda: if use_cuda:
# Experiments on se-resnext shows that too many threads hurt # Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future. # performance. Worth tunning for other models in the future.
num_threads = len(places) num_threads = len(self._places)
else: else:
min(len(places) * 2, multiprocessing.cpu_count()) min(len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program() main = main_program
main = framework.default_main_program() main = main if main else framework.default_main_program()
scope = executor.global_scope() scope = executor.global_scope()
if share_vars_from and not isinstance(share_vars_from,
ParallelExecutor):
raise TypeError("share_vars_from must be ParallelExecutor.")
local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else []
persistable_vars = [
v.name
for v in filter(lambda var: var.persistable, main.list_vars())
]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
places, self._places,
set([ set([
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
startup.desc, set(persistable_vars),
main.desc, main.desc,
loss_name, loss_name if loss_name else '',
scope, scope,
local_scopes,
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
def run(self, fetch_list): def run(self, fetch_list, feed_dict={}):
"""
:param fetch_list: A list of variable names that will be fetched.
        :param feed_dict: A dict mapping feed variable names to LoDTensor
or numpy array.
:return: fetched value list.
"""
if not isinstance(feed_dict, dict):
raise TypeError("feed_dict should be a dict")
feed_tensor_dict = {}
for i, feed_name in enumerate(feed_dict):
feed_tensor = feed_dict[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
feed_tensor.set(feed_dict[feed_name], self._act_places[0])
feed_tensor_dict[feed_name] = feed_tensor
fetch_var_name = '@FETCHED_VAR_NAME@' fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name) self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
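A compact, hedged sketch of the new run() interface with feeding (shapes and the surrounding network are assumptions; 'loss' stands for a loss Variable built beforehand, as in the tests later in this diff):

    # Hypothetical usage of ParallelExecutor.run with feed_dict.
    import numpy
    import paddle.fluid as fluid

    # 'loss' is assumed to be the loss Variable of a network already built in the
    # default main program, with the startup program run via a regular Executor.
    exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
    feed_dict = {
        'image': numpy.zeros(shape=[32, 784], dtype='float32'),
        'label': numpy.ones(shape=[32, 1], dtype='int64'),
    }
    loss_value, = exe.run([loss.name], feed_dict=feed_dict)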
...@@ -22,9 +22,9 @@ function(py_test_modules TARGET_NAME) ...@@ -22,9 +22,9 @@ function(py_test_modules TARGET_NAME)
set(multiValueArgs MODULES DEPS ARGS ENVS) set(multiValueArgs MODULES DEPS ARGS ENVS)
cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_modules_ENVS} COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS} ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif() endif()
endfunction() endfunction()
......
...@@ -535,9 +535,37 @@ class TestSwish(OpTest): ...@@ -535,9 +535,37 @@ class TestSwish(OpTest):
#--------------------test MKLDNN-------------------- #--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu): class TestMKLDNNReluDim2(TestRelu):
def setUp(self): def setUp(self):
super(TestMKLDNNRelu, self).setUp() super(TestMKLDNNReluDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanhDim2(TestTanh):
def setUp(self):
super(TestMKLDNNTanhDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrtDim2(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrtDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbsDim2(TestAbs):
def setUp(self):
super(TestMKLDNNAbsDim2, self).setUp()
self.attrs = {"use_mkldnn": True}
class TestMKLDNNReluDim4(TestRelu):
def setUp(self):
super(TestMKLDNNReluDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu): ...@@ -549,9 +577,9 @@ class TestMKLDNNRelu(TestRelu):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh): class TestMKLDNNTanhDim4(TestTanh):
def setUp(self): def setUp(self):
super(TestMKLDNNTanh, self).setUp() super(TestMKLDNNTanhDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh): ...@@ -560,9 +588,9 @@ class TestMKLDNNTanh(TestTanh):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt): class TestMKLDNNSqrtDim4(TestSqrt):
def setUp(self): def setUp(self):
super(TestMKLDNNSqrt, self).setUp() super(TestMKLDNNSqrtDim4, self).setUp()
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt): ...@@ -571,9 +599,9 @@ class TestMKLDNNSqrt(TestSqrt):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs): class TestMKLDNNAbsDim4(TestAbs):
def setUp(self): def setUp(self):
super(TestMKLDNNAbs, self).setUp() super(TestMKLDNNAbsDim4, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason with TestAbs # The same reason with TestAbs
......
...@@ -97,8 +97,11 @@ class TestConv2dOp(OpTest): ...@@ -97,8 +97,11 @@ class TestConv2dOp(OpTest):
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
def testcudnn(self):
return core.is_compiled_with_cuda() and self.use_cudnn
def test_check_output(self): def test_check_output(self):
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5) self.check_output_with_place(place, atol=1e-5)
else: else:
...@@ -107,7 +110,7 @@ class TestConv2dOp(OpTest): ...@@ -107,7 +110,7 @@ class TestConv2dOp(OpTest):
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, place,
...@@ -121,7 +124,7 @@ class TestConv2dOp(OpTest): ...@@ -121,7 +124,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, ['Input'], place, ['Input'],
...@@ -138,7 +141,7 @@ class TestConv2dOp(OpTest): ...@@ -138,7 +141,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, ['Filter'], place, ['Filter'],
......
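The guard above condenses to the following pattern (a sketch assuming the test class derives from OpTest and defines use_cudnn): run on CUDAPlace only when Paddle was built with CUDA and the individual test opted in to cuDNN.

    # Hypothetical condensed form of the guard introduced in this hunk.
    import paddle.fluid.core as core

    class CudnnGuardedConvTest(object):  # OpTest in the real suite (assumed)
        use_cudnn = False

        def testcudnn(self):
            return core.is_compiled_with_cuda() and self.use_cudnn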
...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase): ...@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase):
outputs={"Out": mul_out}, outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1}) attrs={"x_num_col_dims": 1})
print(debuger.pprint_program_codes(p.desc)) print(debuger.pprint_program_codes(p))
debuger.draw_block_graphviz(p.block(0), path="./test.dot")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -115,18 +115,18 @@ class TestLookupTableWIsSelectedRows(OpTest): ...@@ -115,18 +115,18 @@ class TestLookupTableWIsSelectedRows(OpTest):
w_array = np.ones((len(rows), row_numel)).astype("float32") w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)): for i in range(len(rows)):
w_array[i] *= i w_array[i] *= i
ids_tensor = w_selected_rows.get_tensor() w_tensor = w_selected_rows.get_tensor()
ids_tensor.set(w_array, place) w_tensor.set(w_array, place)
# create Out Variable # create Out Variable
Out_tensor = scope.var('Out').get_tensor() out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator # create and run lookup_table operator
lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
lookup_table.run(scope, place) lookup_table.run(scope, place)
# get result from Out # get result from Out
result_array = np.array(Out_tensor) result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty) # all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array): for idx, row in enumerate(ids_array):
assert (row[0] == result_array[idx]).all() assert (row[0] == result_array[idx]).all()
......
...@@ -21,7 +21,11 @@ import paddle.dataset.mnist as mnist ...@@ -21,7 +21,11 @@ import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16 import paddle.dataset.wmt16 as wmt16
def simple_fc_net(): def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio', filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
...@@ -42,13 +46,18 @@ def simple_fc_net(): ...@@ -42,13 +46,18 @@ def simple_fc_net():
return loss return loss
def fc_with_batchnorm(): def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_recordio_file( reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio', filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]], shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0], lod_levels=[0, 0],
dtypes=['float32', 'int64']) dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader) img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in xrange(1): for _ in xrange(1):
hidden = fluid.layers.fc( hidden = fluid.layers.fc(
...@@ -135,7 +144,9 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): ...@@ -135,7 +144,9 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu') return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt152Small(batch_size=2): def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant( img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant( label = fluid.layers.fill_constant(
...@@ -150,9 +161,9 @@ def SE_ResNeXt152Small(batch_size=2): ...@@ -150,9 +161,9 @@ def SE_ResNeXt152Small(batch_size=2):
conv = fluid.layers.pool2d( conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 64 cardinality = 32
reduction_ratio = 16 reduction_ratio = 16
depth = [3, 8, 36, 3] depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
for block in range(len(depth)): for block in range(len(depth)):
...@@ -185,30 +196,32 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -185,30 +196,32 @@ class TestParallelExecutorBase(unittest.TestCase):
memory_opt=True, memory_opt=True,
iter=10, iter=10,
batch_size=None, batch_size=None,
allow_op_delay=False): allow_op_delay=False,
feed_dict={}):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
loss = method() loss = method(use_feed=len(feed_dict) > 0)
adam = fluid.optimizer.Adam() adam = fluid.optimizer.Adam()
adam.minimize(loss) adam.minimize(loss)
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
exe = fluid.ParallelExecutor( place = fluid.CUDAPlace(0)
loss_name=loss.name, startup_exe = fluid.Executor(place)
use_cuda=True, startup_exe.run(startup)
allow_op_delay=allow_op_delay)
exe = fluid.ParallelExecutor(True, loss_name=loss.name)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
first_loss, = exe.run([loss.name]) first_loss, = exe.run([loss.name], feed_dict=feed_dict)
first_loss = numpy.array(first_loss) first_loss = numpy.array(first_loss)
for i in xrange(iter): for i in xrange(iter):
exe.run([]) exe.run([], feed_dict=feed_dict)
last_loss, = exe.run([loss.name]) last_loss, = exe.run([loss.name], feed_dict=feed_dict)
end = time.time() end = time.time()
if batch_size is not None: if batch_size is not None:
...@@ -242,9 +255,19 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -242,9 +255,19 @@ class TestMNIST(TestParallelExecutorBase):
self.check_network_convergence(simple_fc_net) self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True) self.check_network_convergence(simple_fc_net, allow_op_delay=True)
img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
simple_fc_net, feed_dict={"image": img,
"label": label})
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm) self.check_network_convergence(fc_with_batchnorm)
self.check_network_convergence(fc_with_batchnorm, allow_op_delay=True) img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm, feed_dict={"image": img,
"label": label})
class TestResnet(TestParallelExecutorBase): class TestResnet(TestParallelExecutorBase):
...@@ -271,7 +294,7 @@ class TestResnet(TestParallelExecutorBase): ...@@ -271,7 +294,7 @@ class TestResnet(TestParallelExecutorBase):
batch_size = 2 batch_size = 2
self.check_network_convergence( self.check_network_convergence(
functools.partial( functools.partial(
SE_ResNeXt152Small, batch_size=batch_size), SE_ResNeXt50Small, batch_size=batch_size),
iter=20, iter=20,
batch_size=batch_size) batch_size=batch_size)
...@@ -400,7 +423,8 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): ...@@ -400,7 +423,8 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
import transformer_model import transformer_model
def transformer(): def transformer(use_feed):
assert not use_feed, "transfomer doesn't support feed yet"
return transformer_model.transformer( return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1, ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
...@@ -433,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase): ...@@ -433,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
@unittest.skip("transformer is buggy in multi gpu") @unittest.skip("transformer is buggy in multi gpu")
def test_main(self): def test_main(self):
self.check_network_convergence(transformer) self.check_network_convergence(transformer)
class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = simple_fc_net(True)
test_program = main.clone(for_test=True)
opt = fluid.optimizer.SGD(learning_rate=0.0001)
opt.minimize(loss)
batch_size = 32
image = numpy.random.normal(size=(batch_size,
784)).astype('float32')
label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup)
feed_dict = {'image': image, 'label': label}
train_exe = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss.name, main_program=main)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
for i in xrange(5):
test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
test_loss = numpy.array(test_loss)
train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
train_loss = numpy.array(train_loss)
self.assertTrue(numpy.allclose(train_loss, test_loss))
...@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest): ...@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest):
self.attrs = { self.attrs = {
'min_sizes': self.min_sizes, 'min_sizes': self.min_sizes,
'max_sizes': self.max_sizes,
'aspect_ratios': self.aspect_ratios, 'aspect_ratios': self.aspect_ratios,
'variances': self.variances, 'variances': self.variances,
'flip': self.flip, 'flip': self.flip,
...@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest): ...@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest):
'step_h': self.step_h, 'step_h': self.step_h,
'offset': self.offset 'offset': self.offset
} }
if len(self.max_sizes) > 0:
self.attrs['max_sizes'] = self.max_sizes
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self):
return
def setUp(self): def setUp(self):
self.op_type = "prior_box" self.op_type = "prior_box"
self.set_data() self.set_data()
def set_max_sizes(self):
max_sizes = [5, 10]
self.max_sizes = np.array(max_sizes).astype('float32').tolist()
def init_test_params(self): def init_test_params(self):
self.layer_w = 4 self.layer_w = 32
self.layer_h = 4 self.layer_h = 32
self.image_w = 20 self.image_w = 40
self.image_h = 20 self.image_h = 40
self.step_w = float(self.image_w) / float(self.layer_w) self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h) self.step_h = float(self.image_h) / float(self.layer_h)
...@@ -66,8 +68,7 @@ class TestPriorBoxOp(OpTest): ...@@ -66,8 +68,7 @@ class TestPriorBoxOp(OpTest):
self.min_sizes = [2, 4] self.min_sizes = [2, 4]
self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
self.max_sizes = [5, 10] self.set_max_sizes()
self.max_sizes = np.array(self.max_sizes).astype('float32').tolist()
self.aspect_ratios = [2.0, 3.0] self.aspect_ratios = [2.0, 3.0]
self.flip = True self.flip = True
self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
...@@ -79,7 +80,7 @@ class TestPriorBoxOp(OpTest): ...@@ -79,7 +80,7 @@ class TestPriorBoxOp(OpTest):
self.clip = True self.clip = True
self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
if len(self.max_sizes) > 1: if len(self.max_sizes) > 0:
self.num_priors += len(self.max_sizes) self.num_priors += len(self.max_sizes)
self.offset = 0.5 self.offset = 0.5
...@@ -105,11 +106,15 @@ class TestPriorBoxOp(OpTest): ...@@ -105,11 +106,15 @@ class TestPriorBoxOp(OpTest):
idx = 0 idx = 0
for s in range(len(self.min_sizes)): for s in range(len(self.min_sizes)):
min_size = self.min_sizes[s] min_size = self.min_sizes[s]
c_w = c_h = min_size / 2. # rest of priors
out_boxes[h, w, idx, :] = [ for r in range(len(self.real_aspect_ratios)):
(c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, ar = self.real_aspect_ratios[r]
(c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h c_w = min_size * math.sqrt(ar) / 2
] c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h]
idx += 1 idx += 1
if len(self.max_sizes) > 0: if len(self.max_sizes) > 0:
...@@ -122,18 +127,6 @@ class TestPriorBoxOp(OpTest): ...@@ -122,18 +127,6 @@ class TestPriorBoxOp(OpTest):
(c_y + c_h) / self.image_h] (c_y + c_h) / self.image_h]
idx += 1 idx += 1
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
if math.fabs(ar - 1.) < 1e-6:
continue
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h]
idx += 1
# clip the prior's coordidate such that it is within[0, 1] # clip the prior's coordidate such that it is within[0, 1]
if self.clip: if self.clip:
out_boxes = np.clip(out_boxes, 0.0, 1.0) out_boxes = np.clip(out_boxes, 0.0, 1.0)
...@@ -144,5 +137,10 @@ class TestPriorBoxOp(OpTest): ...@@ -144,5 +137,10 @@ class TestPriorBoxOp(OpTest):
self.out_var = out_var.astype('float32') self.out_var = out_var.astype('float32')
class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
def set_max_sizes(self):
self.max_sizes = []
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
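For reference, a tiny worked sketch of the per-aspect-ratio half-sizes computed in the rewritten loop above (values chosen only for illustration):

    # Hypothetical numeric check of the half-width/half-height formulas above.
    import math

    min_size = 2.0
    for ar in [1.0, 2.0, 0.5]:
        c_w = min_size * math.sqrt(ar) / 2      # half width
        c_h = (min_size / math.sqrt(ar)) / 2    # half height
        # ar=1.0 -> (1.0, 1.0), ar=2.0 -> (~1.414, ~0.707), ar=0.5 -> (~0.707, ~1.414)
        print(c_w, c_h)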
...@@ -19,9 +19,9 @@ from paddle.fluid.framework import Program ...@@ -19,9 +19,9 @@ from paddle.fluid.framework import Program
class TestOpDesc(unittest.TestCase): class TestOpDesc(unittest.TestCase):
def test_op_desc(self): def test_op_desc(self):
prog = core.ProgramDesc() program_desc = core.ProgramDesc()
self.assertIsNotNone(prog) self.assertIsNotNone(program_desc)
block = prog.block(0) block = program_desc.block(0)
self.assertIsNotNone(block) self.assertIsNotNone(block)
op = block.append_op() op = block.append_op()
self.assertIsNotNone(op) self.assertIsNotNone(op)
...@@ -67,7 +67,7 @@ class TestOpDesc(unittest.TestCase): ...@@ -67,7 +67,7 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(8, len(op.attr_names())) self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", prog.block(0)) op.set_block_attr("block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr("block_attr")) self.assertEqual(0, op.block_attr("block_attr"))
mul_op = block.append_op() mul_op = block.append_op()
...@@ -88,20 +88,20 @@ class TestProgramDesc(unittest.TestCase): ...@@ -88,20 +88,20 @@ class TestProgramDesc(unittest.TestCase):
del program_desc del program_desc
def test_append_block(self): def test_append_block(self):
prog_desc = core.ProgramDesc() program_desc = core.ProgramDesc()
self.assertIsNotNone(prog_desc) self.assertIsNotNone(program_desc)
block_root = prog_desc.block(0) block_root = program_desc.block(0)
self.assertIsNotNone(block_root) self.assertIsNotNone(block_root)
self.assertEqual(block_root.id, 0) self.assertEqual(block_root.id, 0)
block1 = prog_desc.append_block(block_root) block1 = program_desc.append_block(block_root)
block2 = prog_desc.append_block(block1) block2 = program_desc.append_block(block1)
self.assertIsNotNone(block1) self.assertIsNotNone(block1)
self.assertEqual(block1.id, block2.parent) self.assertEqual(block1.id, block2.parent)
self.assertEqual(block_root.id, block1.parent) self.assertEqual(block_root.id, block1.parent)
block3 = prog_desc.append_block(block_root) block3 = program_desc.append_block(block_root)
self.assertEqual(block3.parent, block_root.id) self.assertEqual(block3.parent, block_root.id)
self.assertEqual(prog_desc.block(1).id, 1) self.assertEqual(program_desc.block(1).id, 1)
self.assertEqual(4, prog_desc.num_blocks()) self.assertEqual(4, program_desc.num_blocks())
class TestVarDesc(unittest.TestCase): class TestVarDesc(unittest.TestCase):
...@@ -162,9 +162,9 @@ class TestVarDesc(unittest.TestCase): ...@@ -162,9 +162,9 @@ class TestVarDesc(unittest.TestCase):
class TestBlockDesc(unittest.TestCase): class TestBlockDesc(unittest.TestCase):
def test_add_var(self): def test_add_var(self):
prog = core.ProgramDesc() program_desc = core.ProgramDesc()
self.assertIsNotNone(prog) self.assertIsNotNone(program_desc)
block = prog.block(0) block = program_desc.block(0)
self.assertIsNotNone(block) self.assertIsNotNone(block)
var1 = block.var("var1") var1 = block.var("var1")
var2 = block.var("var2") var2 = block.var("var2")
...@@ -175,9 +175,9 @@ class TestBlockDesc(unittest.TestCase): ...@@ -175,9 +175,9 @@ class TestBlockDesc(unittest.TestCase):
self.assertEqual(var2_re, var2) self.assertEqual(var2_re, var2)
def test_add_op(self): def test_add_op(self):
prog = core.ProgramDesc() program_desc = core.ProgramDesc()
self.assertIsNotNone(prog) self.assertIsNotNone(program_desc)
block = prog.block(0) block = program_desc.block(0)
self.assertIsNotNone(block) self.assertIsNotNone(block)
op1 = block.append_op() op1 = block.append_op()
op2 = block.append_op() op2 = block.append_op()
...@@ -189,9 +189,9 @@ class TestBlockDesc(unittest.TestCase): ...@@ -189,9 +189,9 @@ class TestBlockDesc(unittest.TestCase):
def test_remove_op(self): def test_remove_op(self):
program = Program() program = Program()
prog = program.desc program_desc = program.desc
self.assertIsNotNone(prog) self.assertIsNotNone(program_desc)
block = prog.block(0) block = program_desc.block(0)
self.assertIsNotNone(block) self.assertIsNotNone(block)
op0 = block.append_op() op0 = block.append_op()
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle.v2 as paddle
import paddle.dataset.mnist as mnist import paddle.v2.dataset.mnist as mnist
class TestRecordIO(unittest.TestCase): class TestRecordIO(unittest.TestCase):
......
...@@ -97,5 +97,72 @@ class TestSparseSGDOp(unittest.TestCase): ...@@ -97,5 +97,72 @@ class TestSparseSGDOp(unittest.TestCase):
self.check_with_place(place) self.check_with_place(place)
class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
def check_with_place(self, place):
scope = core.Scope()
row_width = 12
# create and initialize Grad Variable
grad_height = 10
grad_rows = [0, 4, 7]
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(grad_height)
grad_selected_rows.set_rows(grad_rows)
grad_array = np.ones((len(grad_rows), row_width)).astype("float32")
grad_array[0, 0] = 2.0
grad_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(grad_array, place)
# create and initialize Param Variable
# create and initialize W Variable
param_rows = [0, 1, 2, 3, 4, 5, 6, 7]
# init Param
w_selected_rows = scope.var('Param').get_selected_rows()
w_selected_rows.set_height(len(param_rows))
w_selected_rows.set_rows(param_rows)
w_array = np.ones((len(param_rows), row_width)).astype("float32")
for i in range(len(param_rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
w_before_optimize = np.array(w_tensor)
# create and initialize LearningRate Variable
lr_value = 0.1
lr = scope.var('LearningRate').get_tensor()
lr_array = np.full((1), lr_value).astype("float32")
lr.set(lr_array, place)
# optimize with Python
w_after_optimize = np.copy(w_before_optimize)
for index, id in enumerate(grad_rows):
w_after_optimize[id] = w_before_optimize[
id] - lr_value * grad_array[index]
# create and run sgd operator
sgd_op = Operator(
"sgd",
Param='Param',
Grad='Grad',
ParamOut='Param',
LearningRate='LearningRate')
sgd_op.run(scope, place)
# get and compare result
result_array = np.array(w_tensor)
assert (result_array == w_after_optimize).all()
def test_sparse_parameter_sgd(self):
places = [core.CPUPlace()]
# do not support GPU kernel currently
for place in places:
self.check_with_place(place)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
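The reference update computed in TestSGDOpOptimizeSelectedRows touches only the parameter rows named by the sparse gradient's row index. A small sketch of that per-row rule with assumed shapes and values (not the operator itself):

    import numpy as np

    lr = 0.1
    grad_rows = [0, 4, 7]                     # rows carried by the sparse gradient
    w = np.ones((8, 3), dtype="float32")      # assumed dense parameter
    grad = np.full((len(grad_rows), 3), 0.5, dtype="float32")

    for index, row in enumerate(grad_rows):
        w[row] -= lr * grad[index]            # rows 1-3, 5 and 6 stay untouched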
...@@ -68,6 +68,17 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp): ...@@ -68,6 +68,17 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp):
self.use_cudnn = True self.use_cudnn = True
class TestSoftmaxFP16Op(TestSoftmaxOp):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
def init_kernel_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
......
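The new TestSoftmaxFP16Op only checks outputs on a CUDA place that reports float16 support, with a relaxed tolerance of 1e-3. The numpy reference such tests are usually compared against is a numerically stable softmax over the last axis; a sketch of one possible reference (an assumption here, not the operator's actual implementation):

    import numpy as np

    def stable_softmax(x):
        # subtract the row max before exponentiating to avoid overflow
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    x = np.random.rand(2, 4).astype("float32")
    print(stable_softmax(x).astype("float16"))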
#################### test_config_parser ######################### #################### test_config_parser #########################
add_test(NAME layers_test add_test(NAME layers_test
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
add_test(NAME test_reset_hook add_test(NAME test_reset_hook
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle) WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp) add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
add_test(NAME test_layerHelpers add_test(NAME test_layerHelpers
COMMAND COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE} ${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
) )
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
set -e set -e
cd `dirname $0` cd `dirname $0`
export PYTHONPATH=$PWD/../../../../
protostr=$PWD/protostr protostr=$PWD/protostr
. file_list.sh . file_list.sh
......
...@@ -58,7 +58,7 @@ def mkl(): ...@@ -58,7 +58,7 @@ def mkl():
'istaged': ISTAGED, 'istaged': ISTAGED,
'with_mkl': '@WITH_MKL@'}) 'with_mkl': '@WITH_MKL@'})
write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
packages=['paddle', packages=['paddle',
...@@ -107,9 +107,10 @@ package_dir={ ...@@ -107,9 +107,10 @@ package_dir={
# So that package points to other directory. # So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
} }
if '${WITH_FLUID_ONLY}'== 'OFF': if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle' package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
paddle_rt_lib_dir = 'lib' paddle_rt_lib_dir = 'lib'
......
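The package_dir entries above tell setuptools where each package's sources live, here pointing into the build tree instead of the source tree. For context, a minimal standalone setup call with the same kind of mapping (package name and path are illustrative only, not Paddle's actual values):

    from setuptools import setup

    setup(
        name="example_pkg",
        packages=["paddle.fluid"],
        # map the package name to the directory that actually holds its modules
        package_dir={"paddle.fluid": "build/python/paddle/fluid"},
    )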