Commit ae5923e2 authored by chengduoZH

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_gather_and_BCast_op_handle
...@@ -139,9 +139,6 @@ def run_benchmark(model, args):
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])
# Optimization
opt = fluid.optimizer.AdamOptimizer(
...@@ -161,7 +158,7 @@ def run_benchmark(model, args):
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=args.batch_size)
accuracy = fluid.average.WeightedAverage() accuracy = fluid.metrics.Accuracy()
iters, num_samples, start_time = 0, 0, time.time()
for pass_id in range(args.pass_num):
accuracy.reset()
...@@ -184,7 +181,7 @@ def run_benchmark(model, args):
"label": y_data},
fetch_list=[avg_cost, batch_acc, batch_size_tensor]
) # The accuracy is the accumulation of batches, but not the current batch.
accuracy.add(value=outs[1], weight=outs[2]) accuracy.update(value=outs[1], weight=outs[2])
iters += 1
num_samples += len(y_data)
loss = np.array(outs[0])
......
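This change swaps `fluid.average.WeightedAverage` (with `add`) for `fluid.metrics.Accuracy` (with `update`). Below is a minimal sketch of the accumulation pattern, assuming the `reset`/`update`/`eval` interface used above; the dummy batch results stand in for the `outs[1]`/`outs[2]` values fetched by the benchmark.

```python
import paddle.fluid as fluid

accuracy = fluid.metrics.Accuracy()

# Dummy per-batch results standing in for (batch accuracy, batch size) pairs,
# i.e. the outs[1] and outs[2] fetched in the benchmark loop above.
batch_results = [(0.50, 128), (0.75, 128), (0.90, 64)]

for pass_id in range(2):
    accuracy.reset()  # start a fresh accumulation each pass
    for batch_acc, batch_size in batch_results:
        # weight the per-batch accuracy by the batch size, as in the diff above
        accuracy.update(value=batch_acc, weight=batch_size)
    print("pass %d, accumulated accuracy %.4f" % (pass_id, accuracy.eval()))
```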
...@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_GPU)
return()
endif()
include(ExternalProject)
set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
# Note: cuda 8.0 is needed to make nccl
# When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install PREFIX=${NCCL_INSTALL_DIR}")
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if(WITH_DSO)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
add_library(nccl STATIC IMPORTED GLOBAL)
set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
endif()
add_dependencies(nccl extern_nccl)
...@@ -16,3 +16,4 @@
block.md
scope.md
executor.md
parallel_executor.md
...@@ -16,3 +16,4 @@ Core Concepts
block.md
scope.md
executor.md
parallel_executor.md
# Problem # Kernel Hint Design
## Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel; we need to provide a way for users to do this.
In the current design, we use KernelType to describe one kernel.
......
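To make the kernel hint above concrete, here is a small illustrative sketch of how such a preference typically appears in user code (assuming the `fluid.layers` API of this period; the layer names and defaults are not taken from this commit):

```python
import paddle.fluid as fluid

image = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')

# use_cudnn asks the operator to pick its cuDNN kernel when one is registered
# and a CUDA place is used; otherwise the plain CUDA/CPU kernel is chosen.
conv = fluid.layers.conv2d(input=image, num_filters=32, filter_size=3, use_cudnn=True)

# force_cpu pins the kernel of this operator to CPU even on a GPU place,
# which is handy for small bookkeeping tensors such as step counters.
step = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0, force_cpu=True)
```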
# Background # Kernel Selection
## Background
Every operator has many kernels because there are multiple data types, places, data layouts and library types that Fluid supports. We use the `OpKernelType` to describe the kernel types that operators can hold.
The `OpKernelType` is as follows:
......
Install and Build
=================
.. _install_steps:
Install Steps
++++++++
You can choose either pip or Docker to complete your install:
.. toctree::
:maxdepth: 1
pip_install_en.rst
docker_install_en.rst
Build from Source
-----------------
.. warning::
We recommend to directly install via the above installation steps; you'll only need to build PaddlePaddle from source when you need a modified binary.
.. toctree::
:maxdepth: 1
build_from_source_en.md
FAQ
++++++++++
`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
Install and Compile
==========
.. _install_steps:
PaddlePaddle provides various methods of installation for many different users
Focus on Deep Learning Model Development
-----------------
PaddlePaddle provides lots of python wheel packages that pip can install:
.. toctree::
:maxdepth: 1
pip_install_en.rst
This is the most convenient way of installation. Please choose the right installation package for your machine configuration and system.
Follow the Bottom Frame
----------
PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
.. toctree::
:maxdepth: 1
docker_install_en.rst
We recommend running PaddlePaddle in Docker. This method has the following advantages:
- Does not require installation of third-party dependencies.
- Easy to share runtime environment.
Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
.. toctree::
:maxdepth: 1
build_from_source_en.rst
.. warning::
One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
FAQ
-----------
For any problems during installation, please refer to the page below for answers:
:ref:`FAQ <install_faq>`
If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community:
`create an issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
...@@ -65,39 +65,55 @@ The PaddlePaddle.org tool can be used together with Docker; Docker needs to be installed on the system first
Without using the PaddlePaddle.org tool
--------------------------
To build the PaddlePaddle documentation with Docker, you need to install the Docker toolkit on your system first. For Docker installation, please refer to `Docker's official website <https://docs.docker.com/>`_. Once Docker is installed, you can build the documentation with the scripts in the source directory, i.e. To build the PaddlePaddle documentation with Docker, you need to install the Docker toolkit on your system first. For Docker installation, please refer to `Docker's official website <https://docs.docker.com/>`_. This method is similar to `building PaddlePaddle from source <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_: build a Docker image from the source code that can compile the PaddlePaddle documentation, run it, and then, inside the Docker container, use the scripts in the source tree to build the documentation. The specific steps are as follows:
[TBD] .. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Build a Docker image from the source code that can be used to compile the PaddlePaddle documentation
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# After entering the Docker container, use the build.sh script to build the PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: the above command maps the current directory (the source root) to the :code:`/paddle` directory inside the container.
After compilation, two directories, ``doc/v2`` and ``doc/fluid``, are produced; in each of them, three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Enter each of these directories and run:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages for both the ``v2`` and ``fluid`` versions, as well as the English API pages.
If you do not want to use Docker, you can also build the PaddlePaddle documentation directly with the following commands:
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build the user documentation, run the following commands
make -j $processors gen_proto_py make -j $processors paddle_docs
make -j $processors paddle_docs paddle_docs_cn
# If you only need to build the API documentation, run the following commands
make -j $processors gen_proto_py framework_py_proto make -j $processors paddle_apis
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
Here $processors means launching as many parallel build processes as there are CPU cores; set it according to the number of CPU cores on your machine.
After compilation, enter the ``doc/v2`` directory. If you chose to build the documentation, the ``cn/html/`` and ``en/html`` subdirectories are generated under it; if you chose to build the API documentation, the ``api/en/html`` directory is generated. Enter each of these directories and run: After compilation, the ``doc/v2`` and ``doc/fluid`` directories are likewise produced. If you chose to build the documentation, the ``cn/html/`` and ``en/html`` subdirectories are generated in each of them; if you chose to build the API documentation, the ``api/en/html`` directory is generated in each. Enter each of these subdirectories and run:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages and the English API pages. The figure below is an example of the generated English documentation home page. Note that because the example uses sphinx's original theme, the page style differs from the official website, but this does not affect debugging by developers. Open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages for the ``v2`` and ``fluid`` versions and the English API pages. The figure below is an example of the generated ``v2`` English documentation home page. Note that because the example uses sphinx's original theme, the page style differs from the official website, but this does not affect debugging by developers.
.. image:: src/doc_en.png
:align: center
......
...@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
Manually Building the Documentation
-------------------------------------
To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation. To build PaddlePaddle's documentation with Docker, you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to `Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_, by constructing, from source code, a docker image that can be used to build the PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
[TBD] .. code-block:: bash
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
# Construct a docker image from source code
docker build -t paddle:dev .
docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
# Use build.sh to build PaddlePaddle documentation
bash -x /paddle/paddle/scripts/docker/build.sh
Note: The above commands map the current directory (the source root directory) to the :code:`/paddle` directory in the container.
After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088; you can see the compiled ``v2`` and ``fluid`` Chinese/English documentation pages and the English API pages.
If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
.. code-block:: bash
mkdir paddle
cd paddle
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
# If you only need to build documents, use the following commands
make -j $processors gen_proto_py make -j $processors paddle_docs
make -j $processors paddle_docs paddle_docs_cn
# If you only need to build APIs, use the following commands
make -j $processors gen_proto_py framework_py_proto make -j $processors paddle_apis
make -j $processors copy_paddle_pybind
make -j $processors paddle_api_docs
$processors indicates that as many processes as there are CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs, it will generate the ``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands: After compiling, there should also be two generated directories: ``doc/v2`` and ``doc/fluid``. If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html`` will be generated in both directories. If you chose to build APIs, a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
.. code-block:: bash
python -m SimpleHTTPServer 8088
Use a web browser and navigate to http://localhost:8088; you can see the compiled Chinese/English documents pages and the English API pages. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging. Use a web browser and navigate to http://localhost:8088; you can see the compiled ``v2`` and ``fluid`` Chinese/English documents pages and the English API pages. The following figure is an example of the built ``v2`` English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
.. image:: src/doc_en.png
:align: center
......
## Install and Build
TBD ### Download & Install
Download the latest C-API development package from the CI system and install it. You can find the required version in the table below:
<table>
<thead>
<tr>
<th>Version Tips</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>-</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>
### From source
Users can also compile the C-API library from the PaddlePaddle source code by setting the following compilation options:
<table>
<thead>
<tr>
<th>Options</th>
<th>Value</th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF(recommended)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>
It is best to use the recommended values to avoid linking against unnecessary libraries. Set the other compilation options as needed.
Pull the latest code from GitHub as shown in the following snippet, and configure the compilation options (replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
```shell
PADDLE_ROOT=/path/of/capi
git clone https://github.com/PaddlePaddle/Paddle.git
cd Paddle
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_GOLANG=OFF \
-DWITH_PYTHON=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=OFF \
..
```
After running the above code to generate the Makefile, run `make && make install`. After a successful compilation, the dependencies required by the C-API (including: (1) the PaddlePaddle inference library and header files; (2) third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory.
If the compilation is successful, you will see the following directory structure under `PADDLE_ROOT` (PaddlePaddle header files and libraries, plus third-party libraries and header files, depending on the linking method if necessary):
```text
├── include
│   └── paddle
│   ├── arguments.h
│   ├── capi.h
│   ├── capi_private.h
│   ├── config.h
│   ├── error.h
│   ├── gradient_machine.h
│   ├── main.h
│   ├── matrix.h
│   ├── paddle_capi.map
│   └── vector.h
├── lib
│   ├── libpaddle_capi_engine.a
│   ├── libpaddle_capi_layers.a
│   ├── libpaddle_capi_shared.so
│   └── libpaddle_capi_whole.a
└── third_party
├── gflags
│   ├── include
│   │   └── gflags
│   │   ├── gflags_completions.h
│   │   ├── gflags_declare.h
│   │   ...
│   └── lib
│   └── libgflags.a
├── glog
│   ├── include
│   │   └── glog
│   │   ├── config.h
│   │   ...
│   └── lib
│   └── libglog.a
├── openblas
│   ├── include
│   │   ├── cblas.h
│   │   ...
│   └── lib
│   ...
├── protobuf
│   ├── include
│   │   └── google
│   │   └── protobuf
│   │   ...
│   └── lib
│   └── libprotobuf-lite.a
└── zlib
├── include
│   ...
└── lib
...
```
### Linking Description:
There are three kinds of linking methods:
1. Linking with the dynamic library `libpaddle_capi_shared.so` (this is the most convenient and easiest way; **it is recommended unless you have special requirements**), refer to the following:
    1. When compiling the CPU version with `OpenBLAS`, you only need to link the single library `libpaddle_capi_shared.so` to develop a prediction program through the C-API.
    1. When compiling the CPU version with the `MKL` lib, you also need to link the MKL library directly to develop a prediction program through the PaddlePaddle C-API, because `MKL` has its own dynamic library.
    1. When compiling the GPU version, the CUDA library is loaded dynamically at prediction-program run time; also add the CUDA library path to the `LD_LIBRARY_PATH` environment variable.
2. Linking with the static library `libpaddle_capi_whole.a`, refer to the following:
    1. Specify the `-Wl,--whole-archive` linking option.
    1. Explicitly link third-party libraries such as `gflags`, `glog`, `libz`, `protobuf`, etc.; you can find them under the `PADDLE_ROOT/third_party` directory.
    1. If the C-API was compiled with OpenBLAS, you must explicitly link `libopenblas.a`.
    1. If the C-API was compiled with MKL, you must explicitly link the MKL dynamic library.
3. Linking with the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`, refer to the following:
    1. This linking method is mainly used for mobile prediction.
    1. `libpaddle_capi_whole.a` is split into at least two static libraries to reduce the size of the linked libraries.
    1. Specify `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking.
    1. The third-party dependencies must be linked explicitly, the same as in method 2 above.
# Kubernetes Distributed # Distributed Training on Kubernetes
TBD We introduced how to create a PaddlePaddle Job with a single node on Kubernetes in the
previous document.
In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
on a Kubernetes cluster.
## Overall Architecture
Before creating a training job, users need to slice the training data and deploy
the Python scripts along with it into the distributed file system
(we can use different types of Kubernetes Volumes to mount different distributed
file systems). Before training starts, the program copies the training data into the
container and also saves the models at the same path during training. The global architecture
is as follows:
![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
The above figure describes a distributed training architecture which contains 3 nodes. Each
Pod mounts a folder of the distributed file system through a Kubernetes Volume to save training data and models.
Kubernetes creates 3 Pods for this training phase and schedules them on
3 nodes, and each Pod has a PaddlePaddle container. After the containers are created,
PaddlePaddle starts up the communication between PServer and Trainer and reads the training
data for this training job.
As described above, we can start up a PaddlePaddle distributed training job on a
Kubernetes-ready cluster with the following steps:
1. [Build PaddlePaddle Docker Image](#Build a Docker Image)
1. [Split training data and upload to the distributed file system](#Upload Training Data)
1. [Edit a YAML file and create a Kubernetes Job](#Create a Job)
1. [Check the output](#Check The Output)
We will introduce these steps as follows:
### Build a Docker Image
The training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more processes before we can kick off the training:
- Copying the training data into the container.
- Generating the initialization arguments for the `Paddle PServer` and `Paddle Training` processes.
Since the official paddlepaddle docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. For more detail, please refer to the following link:
- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
```bash
$ cd doc/howto/usage/k8s/src/k8s_train
$ docker build -t [YOUR_REPO]/paddle:mypaddle .
```
Then upload the new Docker image to a Docker registry:
```bash
docker push [YOUR_REPO]/paddle:mypaddle
```
**[NOTE]**: in the above command arguments, `[YOUR_REPO]` represents your Docker repository;
you need to use your own repository name instead. Below we will use `[YOUR_REPO]` to
refer to the Docker image built in this step.
### Prepare Training Data
We can download and split the training data by creating a Kubernetes Job, or customize the image
by editing [k8s_train](./src/k8s_train/).
Before creating a Job, we need to bind a [persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) backed by the appropriate type of
file system; the generated dataset will be saved on this volume.
```yaml
apiVersion: batch/v1
kind: Job
metadata:
name: paddle-data
spec:
template:
metadata:
name: pi
spec:
hostNetwork: true
containers:
- name: paddle-data
image: paddlepaddle/paddle-tutorial:k8s_data
imagePullPolicy: Always
volumeMounts:
- mountPath: "/mnt"
name: nfs
env:
- name: OUT_DIR
value: /home/work/mfs/paddle-cluster-job
- name: SPLIT_COUNT
value: "3"
volumes:
- name: nfs
persistentVolumeClaim:
claimName: mfs
restartPolicy: Never
```
Create the Job with the following command:
```bash
> kubectl create -f xxx.yaml
```
If created successfully, you can see some information like this:
```bash
[root@paddle-kubernetes-node0 nfsdir]$ tree -d
.
`-- paddle-cluster-job
|-- 0
| `-- data
|-- 1
| `-- data
|-- 2
| `-- data
|-- output
|-- quick_start
```
The `paddle-cluster-job` above is the job name for this training job; we need 3
PaddlePaddle training nodes and save the split training data under the `paddle-cluster-job` path.
The folders `0`, `1` and `2` represent the `training_id` of each node, the `quick_start` folder is used to store training data, and the `output` folder is used to store the models and logs.
### Create a Job
Kubernetes allows users to create objects with YAML files, and we can use a command-line tool
to create them.
The Job YAML file describes which Docker image is used in this training job, how many nodes are created, what the startup arguments of the `Paddle PServer/Trainer` processes are, and what type of Volumes is used. You can find the details of the YAML fields in
[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
The following is an example for this training job:
```yaml
apiVersion: batch/v1
kind: Job
metadata:
name: paddle-cluster-job
spec:
parallelism: 3
completions: 3
template:
metadata:
name: paddle-cluster-job
spec:
volumes:
- name: jobpath
hostPath:
path: /home/work/mfs
containers:
- name: trainer
image: [YOUR_REPO]/paddle:mypaddle
command: ["bin/bash", "-c", "/root/start.sh"]
env:
- name: JOB_NAME
value: paddle-cluster-job
- name: JOB_PATH
value: /home/jobpath
- name: JOB_NAMESPACE
value: default
- name: TRAIN_CONFIG_DIR
value: recommendation
- name: CONF_PADDLE_NIC
value: eth0
- name: CONF_PADDLE_PORT
value: "7164"
- name: CONF_PADDLE_PORTS_NUM
value: "2"
- name: CONF_PADDLE_PORTS_NUM_SPARSE
value: "2"
- name: CONF_PADDLE_GRADIENT_NUM
value: "3"
volumeMounts:
- name: jobpath
mountPath: /home/jobpath
restartPolicy: Never
```
In the above YAML file:
- `metadata.name`: the job name.
- `parallelism`: the Kubernetes Job will create `parallelism` Pods at the same time.
- `completions`: the Job reaches the success status only when the number of successful Pods (exit code 0)
is equal to `completions`.
- `volumeMounts`: the name field `jobpath` is a key; the `mountPath` field represents
the path in the container. We can define `jobpath` in the `volumes` field and use `hostPath`
to configure the host path we want to mount.
- `env`: the environment variables of the container; we pass some startup arguments by
this approach. The details are as follows:
- JOB_PATH: the mount path in the container
- JOB_NAME: the job name
- TRAIN_CONFIG_DIR: the job path in the container; we can find the training data path by
combining it with JOB_NAME.
- CONF_PADDLE_NIC: the argument `--nics` of the `Paddle PServer` process, the network
device name.
- CONF_PADDLE_PORT: the argument `--port` of the `Paddle PServer` process.
- CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the number of ports
for dense parameter update.
- CONF_PADDLE_PORTS_NUM_SPARSE: the argument `--ports_num_for_sparse` of `Paddle PServer`,
the number of ports for sparse parameter update.
- CONF_PADDLE_GRADIENT_NUM: the number of training nodes, the argument
`--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
You can find more detailed information [here](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html).
We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
```bash
kubectl create -f job.yaml
```
Upon successful creation, Kubernetes will create 3 Pods as PaddlePaddle training nodes,
pull the Docker image and begin to train.
### Check the Output
During the training process, we can check the logs and the output models, which are stored in
the `output` folder.
**NOTE**: `node_0`, `node_1` and `node_2` represent the
`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
```bash
[root@paddle-kubernetes-node0 output]# tree -d
.
├── node_0
│   ├── server.log
│   └── train.log
├── node_1
│   ├── server.log
│   └── train.log
├── node_2
......
├── pass-00002
│   ├── done
│   ├── ___embedding_0__.w0
│   ├── ___embedding_1__.w0
......
```
We can check the status of each training Pod by viewing its logs:
```bash
[root@paddle-kubernetes-node0 node_0]# cat train.log
I1116 09:10:17.123121 50 Util.cpp:155] commandline:
/usr/local/bin/../opt/paddle/bin/paddle_trainer
--nics=eth0 --port=7164
--ports_num=2 --comment=paddle_process_by_paddle
--pservers=192.168.129.66,192.168.223.143,192.168.129.71
--ports_num_for_sparse=2 --config=./trainer_config.py
--trainer_count=4 --num_passes=10 --use_gpu=0
--log_period=50 --dot_period=10 --saving_period=1
--local=0 --trainer_id=0
--save_dir=/home/jobpath/paddle-cluster-job/output
I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done.
[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal
I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.681543 50 GradientMachine.cpp:134] Initing parameters..
I1116 09:10:18.012390 50 GradientMachine.cpp:141] Init parameters done.
I1116 09:10:18.018641 50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
I1116 09:10:18.018950 50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
I1116 09:10:18.019069 50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
```
## Some Additional Details
### Using Environment Variables
Usually we use environment variables to configure the PaddlePaddle Job running in
Kubernetes. `start_paddle.py` provides a startup script that converts the environment variables
into the startup arguments of the PaddlePaddle processes:
```bash
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
```
### Communication between Pods
At the beginning of `start_paddle.py`, it initializes and parses the arguments.
```python
parser = argparse.ArgumentParser(prog="start_paddle.py",
description='simple tool for k8s')
args, train_args_list = parser.parse_known_args()
train_args = refine_unknown_args(train_args_list)
train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
podlist = getPodList()
```
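The `dict(zip(...))` line above pairs every remaining flag with the value that follows it. Below is a tiny illustrative sketch of that pairing, with a hand-written stand-in for the repo's `refine_unknown_args` helper (assumed here to do nothing more than strip leading dashes):

```python
def refine_unknown_args(unknown):
    # stand-in for the helper in start_paddle.py; assumed to strip leading dashes
    return [a.lstrip('-') for a in unknown]

train_args_list = ['--num_passes', '10', '--use_gpu', '0']
train_args = refine_unknown_args(train_args_list)
# keys are the even positions, values the odd ones:
# {'num_passes': '10', 'use_gpu': '0'}
train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
print(train_args_dict)
```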
It then queries the status of all the other Pods of this Job with the function `getPodList()`, and fetches the `trainer_id` with the function `getIdMap(podlist)` once all the Pods are in the `RUNNING` state.
```python
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
podlist = getPodList()
idMap = getIdMap(podlist)
```
**NOTE**: `getPodList()` prefetches all the Pods in the current namespace; if some
Pods are already running, this may cause errors. We will use [StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
Kubernetes Pods or ReplicaSets in the future.
The function `getIdMap(podlist)` fetches the IP addresses of the Pods in `podlist` and then sorts them
to generate the `trainer_id`.
```python
def getIdMap(podlist):
'''
generate tainer_id by ip
'''
ips = []
for pod in podlist["items"]:
ips.append(pod["status"]["podIP"])
ips.sort()
idMap = {}
for i in range(len(ips)):
idMap[ips[i]] = i
return idMap
```
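For example, reusing the `getIdMap` function above on a toy pod list (the IPs are taken from the log shown earlier), the mapping is deterministic on every node because all nodes sort the same IP list:

```python
podlist = {"items": [
    {"status": {"podIP": "192.168.129.71"}},
    {"status": {"podIP": "192.168.129.66"}},
    {"status": {"podIP": "192.168.223.143"}},
]}
print(getIdMap(podlist))
# {'192.168.129.66': 0, '192.168.129.71': 1, '192.168.223.143': 2}
```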
After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`,
and then start them up with `startPaddle(idMap, train_args_dict)`.
### Create Job
The main goal of `startPaddle` is to generate the arguments of the `Paddle PServer` and
`Paddle Trainer` processes. Taking `Paddle Trainer` as an example, we parse the
environment variables to get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM` and so on,
and finally find the `trainerId` from `idMap` according to the Pod's IP address.
```python
program = 'paddle train'
args = " --nics=" + PADDLE_NIC
args += " --port=" + str(PADDLE_PORT)
args += " --ports_num=" + str(PADDLE_PORTS_NUM)
args += " --comment=" + "paddle_process_by_paddle"
ip_string = ""
for ip in idMap.keys():
ip_string += (ip + ",")
ip_string = ip_string.rstrip(",")
args += " --pservers=" + ip_string
args_ext = ""
for key, value in train_args_dict.items():
args_ext += (' --' + key + '=' + value)
localIP = socket.gethostbyname(socket.gethostname())
trainerId = idMap[localIP]
args += " " + args_ext + " --trainer_id=" + \
str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
```
...@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include <queue>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include <queue>
namespace paddle {
namespace framework {
...@@ -147,52 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return;
}
auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op, ops_.erase(ops_.begin() + s, ops_.begin() + e);
std::vector<std::string> &v) {
auto in_names = (*op)->InputArgumentNames();
v.insert(v.end(), in_names.begin(), in_names.end());
auto out_names = (*op)->OutputArgumentNames();
v.insert(v.end(), out_names.begin(), out_names.end());
std::sort(v.begin(), v.end());
auto last = std::unique(v.begin(), v.end());
v.erase(last, v.end());
};
need_update_ = true;
for (size_t i = s; i < e; i++) {
// since remove op one by one, every time remove the first op.
auto op = ops_.begin() + s;
// collect input and output variables from current delete op
std::vector<std::string> cur_vars;
get_vars(op, cur_vars);
// remove current op
ops_.erase(ops_.begin() + s);
// collect input and output variables from other ops
std::vector<std::string> other_vars;
for (auto it = ops_.begin(); it != ops_.end(); it++) {
get_vars(it, other_vars);
}
// variables should be deleted
std::vector<std::string> delete_vars;
// delete_vars = cur_vars - cur_vars ^ other_input_vars
std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
other_vars.end(),
std::inserter(delete_vars, delete_vars.end()));
// remove variables
for (size_t i = 0; i < delete_vars.size(); i++) {
auto name = delete_vars[i];
auto it = vars_.find(name);
PADDLE_ENFORCE(it != vars_.end(),
"%s is not in variable list, it should not be deleted",
name);
vars_.erase(it);
VLOG(3) << "deleting variable " << name;
}
}
}
std::vector<OpDesc *> BlockDesc::AllOps() const {
......
...@@ -105,7 +105,7 @@ static void BuildVar(const std::string& param_name,
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
InitDevices(); InitDevices(true);
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
......
...@@ -2,21 +2,25 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
if(WITH_GPU) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
else()
set(multi_devices_graph_builder_deps)
endif()
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
......
...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/send_op_handle.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
...@@ -54,12 +55,37 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
const platform::Place &p,
const size_t &i) const {
auto *op_handle = result->ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) {
VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(result, op_handle, each_var_name, p, i);
}
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
SSAGraph &result = *graph;
std::unordered_set<std::string> og_has_been_broadcast;
result.vars_.resize(places_.size());
// We cannot invoke resize. It is a bug of GCC 4.8
result.vars_ = std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
places_.size());
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
...@@ -72,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
}
}
// append send op if program is distributed trainer main program.
// always use the first device
if (!is_forwarding && op->Type() == "send") {
auto &p = places_[0];
auto *s = local_scopes_[0];
// FIXME(wuyi): send op always copy from GPU 0
result.ops_.emplace_back(new SendOpHandle(*op, s, p));
// Create inputs for output on original place and no ssa output
// is created for send op.
CreateOpHandleIOs(&result, op, p, 0);
continue;
}
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
auto *s = local_scopes_[i];
result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
auto *op_handle = result.ops_.back().get();
op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>( CreateOpHandleIOs(&result, op, p, i);
platform::DeviceContextPool::Instance().Get(p));
auto var_names = op->InputArgumentNames();
for (auto &each_var_name : var_names) { auto var_names = op->OutputArgumentNames();
VarHandle *var =
CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
op_handle->AddInput(var);
}
var_names = op->OutputArgumentNames();
for (auto &each_var_name : var_names) {
CreateOpOutput(&result, op_handle, each_var_name, p, i);
}
if (is_forwarding) {
if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
...@@ -147,15 +174,16 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (vars.empty()) { // This device has no data. continue.
continue;
}
auto *prev_grad = &vars[vars.size() - 1]; auto &prev_grad = vars[vars.size() - 1];
op_handle->AddInput(prev_grad); op_handle->AddInput(prev_grad.get());
auto &var = vars[vars.size()]; vars.emplace_back(new VarHandle);
var.place_ = p; auto &var = vars.back();
var.name_ = og; var->place_ = p;
var.version_ = vars.size() - 1; var->name_ = og;
var->version_ = vars.size() - 1;
op_handle->AddOutput(&var); op_handle->AddOutput(var.get());
}
#else
PADDLE_ENFORCE("Not implemented");
......
...@@ -14,6 +14,9 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
...@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
const size_t &i) const;
private:
std::string loss_var_name_;
const std::vector<platform::Place> &places_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/send_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
const Scope *local_scope,
const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope),
place_(place) {}
void SendOpHandle::RunImpl() {
// Wait input done
for (auto *in : inputs_) {
auto &p = static_cast<VarHandle *>(in)->place_;
if (in->DebugString() == "dummy") { // HACK
continue;
}
in->generated_op_->Wait(dev_ctxes_[p]);
}
op_->Run(*local_scope_, place_);
}
std::string SendOpHandle::Name() const { return "send"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
namespace details {
struct SendOpHandle : public OpHandleBase {
std::unique_ptr<OperatorBase> op_;
const Scope* local_scope_;
const platform::Place& place_;
SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const platform::Place& place);
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return false; };
protected:
void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,8 @@
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
...@@ -24,7 +26,9 @@ namespace framework {
namespace details {
struct SSAGraph {
std::vector<std::unordered_map<std::string, std::map<int, VarHandle>>> vars_; std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
vars_;
// aux variables to represent dependency. Useful to resolve data hazard.
std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
std::vector<std::unique_ptr<OpHandleBase>> ops_;
......
...@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
auto it_old = name_pair.second.rbegin();
++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
auto *write_op = it_new->second.generated_op_; auto *write_op = (*it_new)->generated_op_;
auto &read_ops = it_old->second.pending_ops_; auto &read_ops = (*it_old)->pending_ops_;
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
...@@ -54,14 +54,15 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
auto &var_holder = var_holders[each_var_name];
VarHandle *var = nullptr;
if (var_holder.empty()) {
var_holder.emplace_back(new VarHandle);
auto &init_var = var_holder[0];
init_var.place_ = place; init_var->place_ = place;
init_var.name_ = each_var_name; init_var->name_ = each_var_name;
init_var.generated_op_ = nullptr; init_var->generated_op_ = nullptr;
init_var.version_ = 0; init_var->version_ = 0;
var = &init_var; var = init_var.get();
} else {
var = &var_holder.rbegin()->second; var = var_holder.rbegin()->get();
}
return var;
}
...@@ -72,11 +73,12 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
size_t place_offset) {
auto &vars = graph->vars_[place_offset][each_var_name];
size_t version = vars.size();
auto &var = vars[version]; vars.emplace_back(new VarHandle());
var.version_ = version; auto &var = vars.back();
var.name_ = each_var_name; var->version_ = version;
var.place_ = place; var->name_ = each_var_name;
op_handle->AddOutput(&var); var->place_ = place;
op_handle->AddOutput(var.get());
}
template <typename Callback>
...@@ -84,7 +86,7 @@ void IterAllVar(const SSAGraph &graph, Callback callback) {
for (auto &each : graph.vars_) {
for (auto &pair1 : each) {
for (auto &pair2 : pair1.second) {
callback(pair2.second); callback(*pair2);
}
}
}
......
...@@ -69,7 +69,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->vars_) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(version_pair.second); InsertPendingVar(*version_pair);
}
}
}
...@@ -95,7 +95,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto &var_map : graph_->vars_) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
}
}
}
......
...@@ -64,7 +64,7 @@ void InitP2P(int count) {
#endif
}
void InitDevices() { void InitDevices(bool init_p2p) {
/*Init all avaiable devices by default */
std::vector<platform::Place> places;
...@@ -85,7 +85,9 @@ void InitDevices() {
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
InitP2P(count); if (init_p2p) {
InitP2P(count);
}
platform::DeviceContextPool::Init(places);
}
......
...@@ -24,7 +24,7 @@ void InitGflags(std::vector<std::string> &argv);
void InitGLOG(const std::string &prog_name);
void InitDevices(); void InitDevices(bool init_p2p);
} // namespace framework
} // namespace paddle
...@@ -21,7 +21,7 @@ TEST(InitDevices, CPU) {
using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices(); InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
#endif
...@@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) {
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices(); InitDevices(true);
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count)); ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif #endif
......
...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iterator>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
...@@ -22,11 +27,6 @@ limitations under the License. */ ...@@ -22,11 +27,6 @@ limitations under the License. */
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iterator>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, ...@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx); TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
} }
void WriteToRecordIO(recordio::Writer &writer, void WriteToRecordIO(recordio::Writer *writer,
const std::vector<LoDTensor> &tensor, const std::vector<LoDTensor> &tensor,
const platform::DeviceContext &dev_ctx) { const platform::DeviceContext &dev_ctx) {
std::stringstream buffer; std::stringstream buffer;
...@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer, ...@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer,
for (auto &each : tensor) { for (auto &each : tensor) {
SerializeToStream(buffer, each, dev_ctx); SerializeToStream(buffer, each, dev_ctx);
} }
writer.Write(buffer.str()); writer->Write(buffer.str());
} }
std::vector<LoDTensor> ReadFromRecordIO( std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) { recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
std::istringstream sin(scanner.Next());
uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
std::vector<LoDTensor> result; std::vector<LoDTensor> result;
result.resize(sz); if (scanner->HasNext()) {
for (uint32_t i = 0; i < sz; ++i) { std::istringstream sin(scanner->Next());
DeserializeFromStream(sin, &result[i], dev_ctx); uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
result.resize(sz);
for (uint32_t i = 0; i < sz; ++i) {
DeserializeFromStream(sin, &result[i], dev_ctx);
}
} }
return result; return result;
} }
......
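The ReadFromRecordIO change above makes exhaustion observable from the result itself: when the scanner has no next chunk, an empty vector is returned. A hedged sketch of a read loop built on that contract (assumes the Paddle tree; DrainRecordIO is an illustrative helper, not part of this change):

#include <utility>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/recordio/scanner.h"

std::vector<std::vector<paddle::framework::LoDTensor>> DrainRecordIO(
    paddle::recordio::Scanner* scanner,
    const paddle::platform::DeviceContext& dev_ctx) {
  std::vector<std::vector<paddle::framework::LoDTensor>> batches;
  while (true) {
    auto batch = paddle::framework::ReadFromRecordIO(scanner, dev_ctx);
    if (batch.empty()) break;  // scanner exhausted
    batches.push_back(std::move(batch));
  }
  return batches;
}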
...@@ -15,6 +15,9 @@ limitations under the License. */ ...@@ -15,6 +15,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
...@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, ...@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
void DeserializeFromStream(std::istream& is, LoDTensor* tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
extern void WriteToRecordIO(recordio::Writer& writer, extern void WriteToRecordIO(recordio::Writer* writer,
const std::vector<LoDTensor>& tensor, const std::vector<LoDTensor>& tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
extern std::vector<LoDTensor> ReadFromRecordIO( extern std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx); recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -12,17 +12,17 @@ ...@@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) { ...@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) {
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
{ {
recordio::Writer writer(stream, recordio::Compressor::kSnappy); recordio::Writer writer(stream, recordio::Compressor::kSnappy);
WriteToRecordIO(writer, {tensor, tensor}, ctx); WriteToRecordIO(&writer, {tensor, tensor}, ctx);
WriteToRecordIO(writer, {tensor, tensor}, ctx); WriteToRecordIO(&writer, {tensor, tensor}, ctx);
writer.Flush(); writer.Flush();
} }
...@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) { ...@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) {
{ {
std::unique_ptr<std::istream> stream_ptr(stream); std::unique_ptr<std::istream> stream_ptr(stream);
recordio::Scanner scanner(std::move(stream_ptr)); recordio::Scanner scanner(std::move(stream_ptr));
auto tensors = ReadFromRecordIO(scanner, ctx); auto tensors = ReadFromRecordIO(&scanner, ctx);
ASSERT_EQ(tensors.size(), 2); ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]); assert_tensor_ok(tensors[1]);
tensors = ReadFromRecordIO(scanner, ctx); tensors = ReadFromRecordIO(&scanner, ctx);
ASSERT_EQ(tensors.size(), 2); ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]); assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]); assert_tensor_ok(tensors[1]);
......
...@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) { ...@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) {
} }
TEST(LoD, data) { TEST(LoD, data) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::LoD lod{{0, 1, 2}}; paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5}); lod.push_back({0, 2, 4, 5});
...@@ -46,7 +46,7 @@ TEST(LoD, data) { ...@@ -46,7 +46,7 @@ TEST(LoD, data) {
} }
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
......
...@@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator, ...@@ -72,7 +72,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
paddle::framework::OpWithoutKernelCheckerMaker); paddle::framework::OpWithoutKernelCheckerMaker);
TEST(OperatorBase, all) { TEST(OperatorBase, all) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("test_operator"); op_desc.set_type("test_operator");
BuildVar("input", {"IN1"}, op_desc.add_inputs()); BuildVar("input", {"IN1"}, op_desc.add_inputs());
...@@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, ...@@ -198,7 +198,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
// test with single input // test with single input
TEST(OpKernel, all) { TEST(OpKernel, all) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_kernel"); op_desc.set_type("op_with_kernel");
BuildVar("x", {"IN1"}, op_desc.add_inputs()); BuildVar("x", {"IN1"}, op_desc.add_inputs());
...@@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, ...@@ -228,7 +228,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
TEST(OpKernel, multi_inputs) { TEST(OpKernel, multi_inputs) {
using namespace paddle::framework; using namespace paddle::framework;
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
proto::OpDesc op_desc; proto::OpDesc op_desc;
op_desc.set_type("op_multi_inputs_with_kernel"); op_desc.set_type("op_multi_inputs_with_kernel");
...@@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase { ...@@ -269,7 +269,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
}; };
TEST(Operator, Clone) { TEST(Operator, Clone) {
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
OperatorClone a("ABC", paddle::framework::VariableNameMap{}, OperatorClone a("ABC", paddle::framework::VariableNameMap{},
paddle::framework::VariableNameMap{}, paddle::framework::VariableNameMap{},
paddle::framework::AttributeMap{}); paddle::framework::AttributeMap{});
......
...@@ -115,14 +115,12 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -115,14 +115,12 @@ void ParallelExecutor::BCastParamsToGPUs(
for (auto &var : vars) { for (auto &var : vars) {
auto *main_var = main_scope->FindVar(var); auto *main_var = main_scope->FindVar(var);
if (!main_var->IsType<LoDTensor>()) { if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
continue; continue;
} }
auto &main_tensor = main_var->Get<LoDTensor>(); auto &main_tensor = main_var->Get<LoDTensor>();
auto &dims = main_tensor.dims(); auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) { if (paddle::platform::is_gpu_place(main_tensor.place())) {
size_t numel = main_tensor.numel(); size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
...@@ -174,12 +172,17 @@ void ParallelExecutor::SplitTensorToPlaces( ...@@ -174,12 +172,17 @@ void ParallelExecutor::SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) { const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
for (auto it : feed_tensors) { for (auto it : feed_tensors) {
auto lod_tensors = it.second.SplitLoDTensor(member_->places_); auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
PADDLE_ENFORCE_EQ(
member_->places_.size(), lod_tensors.size(),
"The number of samples of current batch is less than the count of "
"devices, currently, it is not allowed. (%d vs %d)",
member_->places_.size(), lod_tensors.size());
for (size_t j = 0; j < member_->places_.size(); ++j) { for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var? // TODO(panxy0718): Do I need to delete this var?
member_->local_scopes_[j] auto t =
->Var(it.first) member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>();
->GetMutable<LoDTensor>() t->ShareDataWith(lod_tensors[j]);
->ShareDataWith(lod_tensors[j]); t->set_lod(lod_tensors[j].lod());
} }
} }
} }
......
...@@ -48,13 +48,13 @@ class ParallelExecutor { ...@@ -48,13 +48,13 @@ class ParallelExecutor {
const std::string& fetched_var_name, const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
private: private:
void SplitTensorToPlaces( void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors); const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
}; };
} // namespace framework } // namespace framework
......
...@@ -85,9 +85,9 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { ...@@ -85,9 +85,9 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
} }
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
BlockDesc *global_block = blocks_[0].get(); auto &global_block = Block(0);
std::vector<std::string> feed_target_names; std::vector<std::string> feed_target_names;
for (auto *op : global_block->AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFeedOpType) { if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]); feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
} }
...@@ -96,9 +96,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() { ...@@ -96,9 +96,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
} }
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
BlockDesc *global_block = blocks_[0].get(); auto &global_block = Block(0);
std::vector<std::string> fetch_target_names; std::vector<std::string> fetch_target_names;
for (auto *op : global_block->AllOps()) { for (auto *op : global_block.AllOps()) {
if (op->Type() == kFetchOpType) { if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]); fetch_target_names.push_back(op->Input("X")[0]);
} }
...@@ -106,5 +106,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() { ...@@ -106,5 +106,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
return fetch_target_names; return fetch_target_names;
} }
void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) {
auto *global_block = MutableBlock(0);
int index = 0;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
      // Unify the input names of all feed_ops to feed_holder_name
global_block->RemoveVar(op->Input("X")[0]);
op->SetInput("X", {feed_holder_name});
op->SetAttr("col", {index});
op->CheckAttrs();
index++;
}
}
auto *feed_holder = global_block->Var(feed_holder_name);
feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
feed_holder->SetPersistable(true);
}
void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) {
auto *global_block = MutableBlock(0);
int index = 0;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
      // Unify the output names of all fetch_ops to fetch_holder_name
global_block->RemoveVar(op->Output("Out")[0]);
op->SetOutput("Out", {fetch_holder_name});
op->SetAttr("col", {index});
op->CheckAttrs();
index++;
}
}
auto *fetch_holder = global_block->Var(fetch_holder_name);
fetch_holder->SetType(proto::VarType::FETCH_LIST);
fetch_holder->SetPersistable(true);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
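The two setters added above let several copies of one program coexist in a single scope by giving each copy private feed/fetch holder variables. A brief usage sketch (the cloned program and the names are illustrative, mirroring the multi-thread test helper later in this change):

// Sketch: rename the holders of a cloned program so two threads can
// feed/fetch through the same Scope without clashing.
paddle::framework::ProgramDesc copy(*inference_program);  // assumes an existing program
copy.SetFeedHolderName("feed_0");
copy.SetFetchHolderName("fetch_0");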
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
...@@ -52,9 +53,26 @@ class ProgramDesc { ...@@ -52,9 +53,26 @@ class ProgramDesc {
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
// The output variable of feed_op is referenced as feed_target.
  // This function is used to collect the output variable names of all
// feed_ops.
const std::vector<std::string> GetFeedTargetNames(); const std::vector<std::string> GetFeedTargetNames();
// The input variable of fetch_op is referenced as fetch_target.
  // This function is used to collect the input variable names of all
// fetch_ops.
const std::vector<std::string> GetFetchTargetNames(); const std::vector<std::string> GetFetchTargetNames();
// The input variable of feed_op that holds input Tensor provided by users is
// referenced as feed_holder.
// This function is used to change or unify the feed_holder variables' name.
void SetFeedHolderName(const std::string &feed_holder_name);
// The output variable of fetch_op that holds output Tensor needed by users is
// referenced as fetch_holder.
// This function is used to change or unify the fetch_holder variables' name.
void SetFetchHolderName(const std::string &fetch_holder_name);
private: private:
proto::ProgramDesc desc_; proto::ProgramDesc desc_;
......
...@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {} ...@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
void FileReader::ReadNext(std::vector<LoDTensor> *out) { void FileReader::ReadNext(std::vector<LoDTensor> *out) {
ReadNextImpl(out); ReadNextImpl(out);
PADDLE_ENFORCE_EQ(out->size(), dims_.size()); if (out->empty()) {
return;
}
for (size_t i = 0; i < dims_.size(); ++i) { for (size_t i = 0; i < dims_.size(); ++i) {
auto &actual = out->at(i).dims(); auto &actual = out->at(i).dims();
auto &expect = dims_[i]; auto &expect = dims_[i];
......
...@@ -14,14 +14,13 @@ ...@@ -14,14 +14,13 @@
#pragma once #pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include <memory>
#include <thread>
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -31,8 +30,6 @@ class ReaderBase { ...@@ -31,8 +30,6 @@ class ReaderBase {
virtual void ReInit() = 0; virtual void ReInit() = 0;
virtual bool HasNext() const = 0;
virtual ~ReaderBase(); virtual ~ReaderBase();
}; };
...@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase { ...@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase {
void ReInit() override { reader_->ReInit(); } void ReInit() override { reader_->ReInit(); }
bool HasNext() const override { return reader_->HasNext(); }
protected: protected:
ReaderBase* reader_; ReaderBase* reader_;
}; };
...@@ -80,8 +75,6 @@ class ReaderHolder { ...@@ -80,8 +75,6 @@ class ReaderHolder {
reader_->ReInit(); reader_->ReInit();
} }
bool HasNext() const { return reader_->HasNext(); }
private: private:
std::unique_ptr<ReaderBase> reader_; std::unique_ptr<ReaderBase> reader_;
}; };
......
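With HasNext() removed from the reader hierarchy above, end-of-data is now signaled by ReadNext() producing an empty output (see the FileReader::ReadNext early return earlier in this diff). A hedged sketch of the new consumption loop (DrainReader is illustrative, not part of this change):

#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/reader.h"

void DrainReader(paddle::framework::ReaderBase* reader) {
  std::vector<paddle::framework::LoDTensor> out;
  while (true) {
    reader->ReadNext(&out);
    if (out.empty()) break;  // no HasNext() any more; an empty output means done
    // ... consume `out` ...
  }
}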
...@@ -24,7 +24,8 @@ function(inference_test TARGET_NAME) ...@@ -24,7 +24,8 @@ function(inference_test TARGET_NAME)
endforeach() endforeach()
endfunction(inference_test) endfunction(inference_test)
inference_test(fit_a_line) # This unittest is buggy!
#inference_test(fit_a_line)
inference_test(image_classification ARGS vgg resnet) inference_test(image_classification ARGS vgg resnet)
inference_test(label_semantic_roles) inference_test(label_semantic_roles)
inference_test(recognize_digits ARGS mlp conv) inference_test(recognize_digits ARGS mlp conv)
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -26,32 +27,63 @@ TEST(inference, fit_a_line) { ...@@ -26,32 +27,63 @@ TEST(inference, fit_a_line) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input; for (int num_threads : {1, 2}) {
// The second dim of the input tensor should be 13 std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
// The input data should be >= 0 cpu_feeds.resize(num_threads);
int64_t batch_size = 10; for (int i = 0; i < num_threads; ++i) {
SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0), auto* input = new paddle::framework::LoDTensor();
static_cast<float>(10)); // The second dim of the input tensor should be 13
std::vector<paddle::framework::LoDTensor*> cpu_feeds; // The input data should be >= 0
cpu_feeds.push_back(&input); int64_t batch_size = 10;
SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
static_cast<float>(10));
cpu_feeds[i].push_back(input);
}
paddle::framework::LoDTensor output1; std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1; cpu_fetchs1.resize(num_threads);
cpu_fetchs1.push_back(&output1); for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
cpu_fetchs1[i].push_back(output);
}
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
LOG(INFO) << output1.dims(); if (num_threads == 1) {
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
cpu_fetchs1[0]);
} else {
TestMultiThreadInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, num_threads);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2; std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2; cpu_fetchs2.resize(num_threads);
cpu_fetchs2.push_back(&output2); for (int i = 0; i < num_threads; ++i) {
auto* output = new paddle::framework::LoDTensor();
cpu_fetchs2[i].push_back(output);
}
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
LOG(INFO) << output2.dims(); if (num_threads == 1) {
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
cpu_fetchs2[0]);
} else {
TestMultiThreadInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, num_threads);
}
CheckError<float>(output1, output2); for (int i = 0; i < num_threads; ++i) {
CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
delete cpu_fetchs2[i][0];
}
#endif #endif
for (int i = 0; i < num_threads; ++i) {
delete cpu_feeds[i][0];
delete cpu_fetchs1[i][0];
}
} // num_threads-loop
} }
...@@ -25,7 +25,8 @@ limitations under the License. */ ...@@ -25,7 +25,8 @@ limitations under the License. */
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input, void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) { paddle::framework::DDim dims, T lower, T upper) {
std::mt19937 rng(100); // An arbitrarily chosen but fixed seed. static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1); std::uniform_real_distribution<double> uniform_dist(0, 1);
T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
......
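The SetupTensor change above swaps the fixed seed for a static counter, so successive calls fill tensors with different values while a given test run remains deterministic. A minimal sketch of the same seeding pattern in isolation:

#include <random>

// Each call gets a fresh but deterministic seed (100, 101, ...), so repeated
// calls produce different values while the whole run stays reproducible.
float NextRandomValue() {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  return static_cast<float>(uniform_dist(rng));
}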
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
void ThreadedRunInference(
const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
paddle::framework::Executor* executor, paddle::framework::Scope* scope,
const int thread_id,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
new paddle::framework::ProgramDesc(*inference_program));
std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
std::string fetch_holder_name =
"fetch_" + paddle::string::to_string(thread_id);
copy_program->SetFeedHolderName(feed_holder_name);
copy_program->SetFetchHolderName(fetch_holder_name);
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
copy_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
copy_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
    // Please make sure that cpu_feeds[i] corresponds to feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
feed_holder_name, fetch_holder_name);
}
template <typename Place>
void TestMultiThreadInference(
const std::string& dirname,
const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
const int num_threads) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
paddle::inference::Load(executor, *scope, dirname);
std::vector<std::thread*> threads;
for (int i = 0; i < num_threads; ++i) {
threads.push_back(new std::thread(
ThreadedRunInference, std::ref(inference_program), &executor, scope, i,
std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
}
for (int i = 0; i < num_threads; ++i) {
threads[i]->join();
delete threads[i];
}
delete scope;
}
...@@ -662,14 +662,3 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ...@@ -662,14 +662,3 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
REGISTER_OP_CPU_KERNEL(relu,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CPUDeviceContext,
ops::ReluFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
ops::ReluGradFunctor<double>>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -17,31 +14,19 @@ limitations under the License. */ ...@@ -17,31 +14,19 @@ limitations under the License. */
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ #define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \ REGISTER_OP_CUDA_KERNEL( \
ops::functor<float>>, \ act_type, \
ops::ActivationKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>, \
ops::functor<double>>); \ ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
REGISTER_OP_CUDA_KERNEL( \ ops::ActivationKernel<plat::CUDADeviceContext, \
act_type##_grad, \ ops::functor<plat::float16>>); \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \ REGISTER_OP_CUDA_KERNEL( \
ops::grad_functor<float>>, \ act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \ ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>); ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
REGISTER_OP_CUDA_KERNEL(
relu, ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<float>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<double>>,
ops::ActivationKernel<paddle::platform::CUDADeviceContext,
ops::ReluFunctor<paddle::platform::float16>>);
REGISTER_OP_CUDA_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<float>>,
ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,
ops::ReluGradFunctor<double>>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,9 +10,13 @@ See the License for the specific language governing permissions and ...@@ -13,9 +10,13 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/float16.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -336,11 +337,25 @@ struct Sine { ...@@ -336,11 +337,25 @@ struct Sine {
HOSTDEVICE T operator()(const T& val) const { return sin(val); } HOSTDEVICE T operator()(const T& val) const { return sin(val); }
}; };
template <>
struct Sine<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(sin(static_cast<float>(val)));
}
};
template <typename T> template <typename T>
struct Cosine { struct Cosine {
HOSTDEVICE T operator()(const T& val) const { return cos(val); } HOSTDEVICE T operator()(const T& val) const { return cos(val); }
}; };
template <>
struct Cosine<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(cos(static_cast<float>(val)));
}
};
// cosine'(x) = -sin(x) // cosine'(x) = -sin(x)
template <typename T> template <typename T>
struct CosGradFunctor : public BaseActivationFunctor<T> { struct CosGradFunctor : public BaseActivationFunctor<T> {
...@@ -824,6 +839,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -824,6 +839,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/adagrad_op.h" #include "paddle/fluid/operators/adagrad_op.h"
#include <vector>
#include <cmath> #include <cmath>
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/auc_op.h" #include "paddle/fluid/operators/auc_op.h"
#include <string>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
std::vector<float> thresholds_list; std::vector<float> thresholds_list;
thresholds_list.reserve(num_thresholds); thresholds_list.reserve(num_thresholds);
for (int i = 1; i < num_thresholds - 1; i++) { for (int i = 1; i < num_thresholds - 1; i++) {
thresholds_list[i] = (float)i / (num_thresholds - 1); thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
} }
const float kEpsilon = 1e-7; const float kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon; thresholds_list[0] = 0.0f - kEpsilon;
...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> { ...@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace()); float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace()); float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) { for (int i = 0; i < num_thresholds; i++) {
tp_rate_data[i] = tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon); (tp_data[i] + fn_data[i] + epsilon);
fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon); fp_rate_data[i] =
rec_rate_data[i] = static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon); rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
(tp_data[i] + fp_data[i] + epsilon);
} }
*auc_data = 0.0f; *auc_data = 0.0f;
if (curve == "ROC") { if (curve == "ROC") {
......
...@@ -19,15 +19,15 @@ namespace operators { ...@@ -19,15 +19,15 @@ namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>( void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0]; *old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
num_accumulates_ = in_num_accumulates->data<int64_t>()[0]; *num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
num_updates_ = in_num_updates->data<int64_t>()[0]; *num_updates_ = in_num_updates->data<int64_t>()[0];
} }
template <> template <>
......
...@@ -19,18 +19,18 @@ namespace paddle { ...@@ -19,18 +19,18 @@ namespace paddle {
namespace operators { namespace operators {
template <> template <>
void GetAccumulators<paddle::platform::CUDADeviceContext>( void GetAccumulators<paddle::platform::CUDADeviceContext>(
const framework::ExecutionContext& ctx, int64_t& num_updates_, const framework::ExecutionContext& ctx, int64_t* num_updates_,
int64_t& num_accumulates_, int64_t& old_num_accumulates_) { int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates"); auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates"); auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates"); auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CPUPlace(), &old_num_accumulates_, memory::Copy(platform::CPUPlace(), old_num_accumulates_,
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(), platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
sizeof(int64_t), stream); sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream); in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
in_num_updates->data<int64_t>(), sizeof(int64_t), stream); in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
} }
......
...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>; ...@@ -29,8 +29,8 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext> template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx, void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t& num_updates, int64_t& num_accumulates, int64_t* num_updates, int64_t* num_accumulates,
int64_t& old_num_accumulates); int64_t* old_num_accumulates);
template <typename DeviceContext> template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx, void SetAccumulators(const framework::ExecutionContext& ctx,
...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -47,8 +47,8 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
int64_t num_updates = 0; int64_t num_updates = 0;
int64_t num_accumulates = 0; int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0; int64_t old_num_accumulates = 0;
GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates, GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
old_num_accumulates); &old_num_accumulates);
// Get attrs // Get attrs
float average_window = ctx.Attr<float>("average_window"); float average_window = ctx.Attr<float>("average_window");
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include <string>
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
namespace paddle { namespace paddle {
......
...@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/framework/data_layout.h"
#include <cfloat> #include <cfloat>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
......
...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/compare_op.h" #include "paddle/fluid/operators/compare_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/concat_op.h" #include "paddle/fluid/operators/concat_op.h"
#include <string>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_transpose_op.h" #include "paddle/fluid/operators/conv_transpose_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/im2col.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
......
...@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
} }
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
// const sendrecv::VariableMessage& ret_msg) {
const ::grpc::ByteBuffer& ret_msg) { const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = NULL; framework::Variable* outvar = nullptr;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
} }
......
...@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) { for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
} }
for (int64_t i = 0; i < rows2->size(); ++i) { for (size_t i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[i], i); EXPECT_EQ(rows_data2[i], i);
} }
EXPECT_EQ(slr2->height(), 1000); EXPECT_EQ(slr2->height(), 1000);
......
...@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
#ifdef __NVCC__ #ifdef __NVCC__
#include <cuda.h>
#include <thrust/iterator/iterator_adaptor.h> #include <thrust/iterator/iterator_adaptor.h>
#include "paddle/fluid/platform/cuda_helper.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif #endif
...@@ -43,35 +44,35 @@ namespace operators { ...@@ -43,35 +44,35 @@ namespace operators {
*/ */
inline void get_mid_dims(const framework::DDim& x_dims, inline void get_mid_dims(const framework::DDim& x_dims,
const framework::DDim& y_dims, const int axis, const framework::DDim& y_dims, const int axis,
int& pre, int& n, int& post) { int* pre, int* n, int* post) {
pre = 1; *pre = 1;
n = 1; *n = 1;
post = 1; *post = 1;
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
pre *= x_dims[i]; (*pre) *= x_dims[i];
} }
for (int i = 0; i < y_dims.size(); ++i) { for (int i = 0; i < y_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
"Broadcast dimension mismatch."); "Broadcast dimension mismatch.");
n *= y_dims[i]; (*n) *= y_dims[i];
} }
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
post *= x_dims[i]; (*post) *= x_dims[i];
} }
} }
inline void trim_trailing_singular_dims(framework::DDim& dims) { inline void trim_trailing_singular_dims(framework::DDim* dims) {
// Remove trailing dimensions of size 1 for y // Remove trailing dimensions of size 1 for y
auto actual_dims_size = dims.size(); auto actual_dims_size = dims->size();
for (; actual_dims_size != 0; --actual_dims_size) { for (; actual_dims_size != 0; --actual_dims_size) {
if (dims[actual_dims_size - 1] != 1) break; if ((*dims)[actual_dims_size - 1] != 1) break;
} }
if (actual_dims_size != dims.size()) { if (actual_dims_size != dims->size()) {
auto actual_dims = framework::vectorize(dims); auto actual_dims = framework::vectorize(*dims);
actual_dims.resize(actual_dims_size); actual_dims.resize(actual_dims_size);
dims = framework::make_ddim(actual_dims); *dims = framework::make_ddim(actual_dims);
} }
} }
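For reference, get_mid_dims (now taking output pointers) factors x_dims into the product pre * n * post around the broadcast axis: pre covers the dimensions before axis, n the dimensions matched by y_dims, and post the rest. A toy re-implementation over plain vectors (not the framework::DDim API) that shows the decomposition:

#include <cassert>
#include <vector>

void GetMidDims(const std::vector<int>& x_dims, const std::vector<int>& y_dims,
                int axis, int* pre, int* n, int* post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) *n *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i) *post *= x_dims[i];
}

int main() {
  int pre, n, post;
  // x = {2, 3, 4, 5}, y = {3, 4}, axis = 1  ->  pre = 2, n = 12, post = 5
  GetMidDims({2, 3, 4, 5}, {3, 4}, /*axis=*/1, &pre, &n, &post);
  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}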
...@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext> ...@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext>
RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*> RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
super_t; super_t;
HOSTDEVICE RowwiseTransformIterator(const T* x, int n) HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
: super_t(x), begin_(x), n_(n){}; : super_t(x), begin_(x), n_(n) {}
friend class thrust::iterator_core_access; friend class thrust::iterator_core_access;
private: private:
...@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext> ...@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*> MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
super_t; super_t;
HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
: super_t(x), begin_(x), n_(n), post_(post){}; : super_t(x), begin_(x), n_(n), post_(post) {}
friend class thrust::iterator_core_access; friend class thrust::iterator_core_access;
private: private:
...@@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out, ...@@ -333,6 +334,55 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
} }
} }
#ifdef __NVCC__ #ifdef __NVCC__
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
  // TODO(zcd): The warp size should be queried from the GPU
  // rather than hard-coded as 32.
  // To make reduceSum more efficient, warp-level parallelism is
  // used here with an assumed warp size of 32; most cards use a
  // warp size of 32, though it may differ on some GPUs.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
template <typename T, typename DX_OP, typename DY_OP> template <typename T, typename DX_OP, typename DY_OP>
static __global__ void ElemwiseGradBroadcast1CUDAKernel( static __global__ void ElemwiseGradBroadcast1CUDAKernel(
const T* x, const T* y, const T* out, const T* dout, int h, int w, const T* x, const T* y, const T* out, const T* dout, int h, int w,
...@@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( ...@@ -355,7 +405,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
if (dy) { if (dy) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = platform::reduceSum(val, tid, h); val = reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( ...@@ -432,7 +482,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
if (dy) { if (dy) {
int h = pre * post; int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = platform::reduceSum(val, tid, h); val = reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
} }
...@@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -472,11 +522,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
auto y_dim = y.dims(); auto y_dim = y.dims();
axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis); axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
trim_trailing_singular_dims(y_dim); trim_trailing_singular_dims(&y_dim);
axis = (y_dim.size() == 0) ? x_dim.size() : axis; axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dim, y_dim, axis, pre, n, post); get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
int h = pre; int h = pre;
int w = n; int w = n;
...@@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -514,7 +564,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
} }
} }
} }
}; }
template <typename DeviceContext, typename T, typename functor, template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor> typename broadcastfunctor, typename broadcast2functor>
...@@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx, ...@@ -543,11 +593,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
} }
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(y_dims); trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post); get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
broadcastfunctor f; broadcastfunctor f;
...@@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx, ...@@ -582,11 +632,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
"Axis should be in range [0, x_dims)"); "Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(y_dims); trim_trailing_singular_dims(&y_dims);
axis = (y_dims.size() == 0) ? x_dims.size() : axis; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
int pre, n, post; int pre, n, post;
get_mid_dims(x_dims, y_dims, axis, pre, n, post); get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) { if (post == 1) {
functor.RunRowWise(n, pre); functor.RunRowWise(n, pre);
return; return;
......
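The reduceSum helper added above first folds each warp's values into its lane 0 with successive __shfl_down_sync steps before combining warps through shared memory. The following is a minimal host-side simulation of that per-warp shuffle-down tree for a single 32-lane warp (plain C++, not part of the Paddle sources), just to make the offset halving explicit:

#include <array>
#include <cstdio>

int main() {
  // Each of the 32 lanes starts with the value 1.0f.
  std::array<float, 32> lane;
  lane.fill(1.0f);
  // Shuffle-down tree: lane i accumulates lane i + offset, offset = 16, 8, 4, 2, 1.
  for (int offset = 16; offset > 0; offset /= 2) {
    std::array<float, 32> shifted = lane;
    for (int i = 0; i + offset < 32; ++i) shifted[i] = lane[i] + lane[i + offset];
    lane = shifted;  // analogue of val += __shfl_down_sync(mask, val, offset)
  }
  // After log2(32) = 5 steps, lane 0 holds the warp-wide sum.
  std::printf("warp sum = %.1f\n", lane[0]);  // prints 32.0
  return 0;
}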
...@@ -39,13 +39,14 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -39,13 +39,14 @@ void gemm<platform::CUDADeviceContext, float16>(
cublasOperation_t cuTransB = cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas fp16 gemm requires GPU compute capability >= 53"); "cublas fp16 gemm requires GPU compute capability >= 53");
#if CUDA_VERSION >= 8000
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) { if (context.GetComputeCapability() >= 70) {
...@@ -56,7 +57,7 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -56,7 +57,7 @@ void gemm<platform::CUDADeviceContext, float16>(
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(), PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH)); CUBLAS_DEFAULT_MATH));
} }
#endif #endif // CUDA_VERSION >= 9000
// cublasHgemm does true FP16 computation which is slow for non-Volta // cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pseudo FP16 computation: // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
...@@ -66,6 +67,18 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -66,6 +67,18 @@ void gemm<platform::CUDADeviceContext, float16>(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo)); CUDA_R_32F, algo));
#else
// CUDA 7.5 does not support cublasGemmEx, so fall back to cublasHgemm
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
#endif // CUDA_VERSION >= 8000
} }
template <> template <>
......
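Both the cublasGemmEx call and the cublasHgemm fallback above pass (cuTransB, cuTransA, N, M, K) with B before A: cuBLAS works in column-major order, so computing C^T = B^T * A^T produces exactly the memory layout of the row-major product C = A * B, with no explicit transposition. A small self-contained check of that identity with a naive column-major GEMM (plain C++, not Paddle or cuBLAS code):

#include <cstdio>
#include <vector>

// Naive column-major GEMM: C(m x n) = A(m x k) * B(k x n), leading dims = rows.
void gemm_colmajor(int m, int n, int k, const std::vector<float>& A,
                   const std::vector<float>& B, std::vector<float>* C) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float s = 0.f;
      for (int p = 0; p < k; ++p) s += A[i + p * m] * B[p + j * k];
      (*C)[i + j * m] = s;
    }
}

int main() {
  const int M = 2, K = 3, N = 2;
  // Row-major A (M x K) and B (K x N).
  std::vector<float> A = {1, 2, 3, 4, 5, 6};  // [[1,2,3],[4,5,6]]
  std::vector<float> B = {1, 0, 0, 1, 1, 1};  // [[1,0],[0,1],[1,1]]
  std::vector<float> C(M * N);
  // Swap operands and dimensions: the column-major result is C^T (N x M),
  // which occupies memory exactly like the row-major C (M x N).
  gemm_colmajor(N, M, K, B, A, &C);
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) std::printf("%g ", C[i * N + j]);
    std::printf("\n");
  }
  // Prints the row-major C = A*B = [[4,5],[10,11]].
  return 0;
}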
...@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase { ...@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase {
std::vector<std::string> out_arg_names = Outputs("Out"); std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) { PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE(
!ins.empty(),
"Reader can not read the next data even it has been re-initialized.");
}
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
auto* out = auto* out =
......
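The PADDLE_ENFORCE above relies on the end-of-data convention adopted throughout this change: a reader reports exhaustion by leaving its output vector empty instead of exposing HasNext(). A minimal sketch of a consumer written against that convention, using a hypothetical VectorReader stand-in (plain C++, not a Paddle class):

#include <cstdio>
#include <vector>

struct VectorReader {
  std::vector<std::vector<float>> data;
  size_t pos = 0;
  void ReadNext(std::vector<float>* out) {
    out->clear();                        // an empty output means "no next data"
    if (pos < data.size()) *out = data[pos++];
  }
};

int main() {
  VectorReader reader;
  reader.data = {{1, 2}, {3, 4}, {5, 6}};
  std::vector<float> batch;
  while (true) {
    reader.ReadNext(&batch);
    if (batch.empty()) break;            // reader exhausted, stop the epoch
    std::printf("got a batch of %zu values\n", batch.size());
  }
  return 0;
}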
...@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) ...@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc) reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc) reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc) reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
# Export local libraries to parent # Export local libraries to parent
set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE) set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
...@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader {
StartPrefetcher(); StartPrefetcher();
} }
bool HasNext() const override;
void ReadNext(std::vector<framework::LoDTensor>* out) override; void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override; void ReInit() override;
~DoubleBufferReader() { EndPrefetcher(); } ~DoubleBufferReader() { EndPrefetcher(); }
private: private:
bool HasNext() const;
void StartPrefetcher() { void StartPrefetcher() {
channel_ = framework::MakeChannel<Item>(kChannelSize); channel_ = framework::MakeChannel<Item>(kChannelSize);
prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
...@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
auto place_str = Attr<std::string>("place"); auto place_str = Attr<std::string>("place");
platform::Place place; platform::Place place;
if (place_str == "CPU") { if (place_str == "AUTO") {
place = dev_place;
} else if (place_str == "CPU") {
place = platform::CPUPlace(); place = platform::CPUPlace();
} else { } else {
std::istringstream sin(place_str); std::istringstream sin(place_str);
...@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
enum_range.insert(string::Sprintf("CUDA:%d", i)); enum_range.insert(string::Sprintf("CUDA:%d", i));
} }
enum_range.insert("CPU"); enum_range.insert("CPU");
AddAttr<std::string>("place", "The double buffer place, default is CPU") enum_range.insert("AUTO");
.SetDefault("CPU") AddAttr<std::string>("place", "The double buffer place")
.SetDefault("AUTO")
.InEnum({enum_range}); .InEnum({enum_range});
} }
}; };
bool DoubleBufferReader::HasNext() const {
while (!channel_->IsClosed() && !channel_->CanReceive()) {
}
return channel_->CanReceive();
}
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!"); if (HasNext()) {
} Item batch;
channel_->Receive(&batch);
Item batch; *out = batch.payloads_;
channel_->Receive(&batch); if (batch.ctx_) {
*out = batch.payloads_; batch.ctx_->Wait();
if (batch.ctx_) { }
batch.ctx_->Wait();
} }
} }
...@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() { ...@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() {
StartPrefetcher(); StartPrefetcher();
} }
bool DoubleBufferReader::HasNext() const {
while (!channel_->IsClosed() && !channel_->CanReceive()) {
}
return channel_->CanReceive();
}
void DoubleBufferReader::PrefetchThreadFunc() { void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts."; VLOG(5) << "A new prefetch thread starts.";
std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize); std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize); std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
size_t cached_tensor_id = 0; size_t cached_tensor_id = 0;
while (reader_->HasNext()) { while (true) {
Item batch; Item batch;
auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
reader_->ReadNext(&cpu_batch); reader_->ReadNext(&cpu_batch);
if (cpu_batch.empty()) {
// The underlying reader has no more data.
break;
}
if (platform::is_gpu_place(place_)) { if (platform::is_gpu_place(place_)) {
auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
auto* gpu_ctx = ctxs_[cached_tensor_id].get(); auto* gpu_ctx = ctxs_[cached_tensor_id].get();
......
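The DoubleBufferReader above keeps one prefetch thread pushing batches into a bounded channel while ReadNext drains it, and stops when the underlying reader returns an empty batch. A condensed sketch of that producer/consumer shape using a plain mutex/condition-variable queue in place of framework::Channel (self-contained C++; the batch contents and sizes are made up for illustration):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

int main() {
  const size_t kChannelSize = 2;              // at most two batches in flight
  std::queue<std::vector<int>> channel;
  std::mutex mu;
  std::condition_variable not_full, not_empty;

  std::thread prefetcher([&] {
    for (int i = 0; i < 5; ++i) {
      std::vector<int> batch(16, i);          // pretend this is an expensive read
      std::unique_lock<std::mutex> lock(mu);
      not_full.wait(lock, [&] { return channel.size() < kChannelSize; });
      channel.push(std::move(batch));
      not_empty.notify_one();
    }
    std::unique_lock<std::mutex> lock(mu);
    not_full.wait(lock, [&] { return channel.size() < kChannelSize; });
    channel.push({});                         // empty batch == no more data
    not_empty.notify_one();
  });

  while (true) {
    std::vector<int> batch;
    {
      std::unique_lock<std::mutex> lock(mu);
      not_empty.wait(lock, [&] { return !channel.empty(); });
      batch = std::move(channel.front());
      channel.pop();
      not_full.notify_one();
    }
    if (batch.empty()) break;
    std::printf("consumed a batch of %zu ints\n", batch.size());
  }
  prefetcher.join();
  return 0;
}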
...@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader { ...@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader {
: DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
reader_->ReadNext(out); reader_->ReadNext(out);
} if (out->empty()) {
bool HasNext() const override {
if (reader_->HasNext()) {
return true;
} else {
++pass_count_; ++pass_count_;
if (pass_count_ >= pass_num_) { if (pass_count_ < pass_num_) {
return false;
} else {
reader_->ReInit(); reader_->ReInit();
return true; reader_->ReadNext(out);
} }
} }
} }
......
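After this change MultiPassReader no longer implements HasNext(); whenever the underlying reader runs dry it re-initialises it and reads again, until pass_num passes have been delivered, and only then returns an empty batch. A toy sketch of that control flow (plain C++ with a hypothetical ToyReader, not Paddle code):

#include <cstdio>
#include <vector>

struct ToyReader {
  int pos = 0;
  void ReInit() { pos = 0; }
  void ReadNext(std::vector<int>* out) {
    out->clear();
    if (pos < 3) out->assign(1, pos++);  // three one-element "batches" per pass
  }
};

struct MultiPass {
  ToyReader* reader;
  int pass_num;
  int pass_count = 0;
  void ReadNext(std::vector<int>* out) {
    reader->ReadNext(out);
    if (out->empty()) {
      ++pass_count;
      if (pass_count < pass_num) {       // start the next pass transparently
        reader->ReInit();
        reader->ReadNext(out);
      }
    }
  }
};

int main() {
  ToyReader toy;
  MultiPass mp{&toy, 2};                 // two passes over the same data
  std::vector<int> batch;
  int batches = 0;
  while (true) {
    mp.ReadNext(&batch);
    if (batch.empty()) break;
    ++batches;
  }
  std::printf("read %d batches in total\n", batches);  // prints 6
  return 0;
}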
...@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase { ...@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase {
void ReInit() override { return; } void ReInit() override { return; }
bool HasNext() const override { return true; }
private: private:
float min_; float min_;
float max_; float max_;
...@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase { ...@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
const auto& ranks = Attr<std::vector<int>>("ranks"); const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()), static_cast<int>(shape_concat.size()),
"The accumulate of all ranks should be equal to the " "The accumulate of all ranks should be equal to the "
"shape concat's length."); "shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks); std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <mutex>
#include <thread>
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
...@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader { ...@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader {
LOG(INFO) << "Creating file reader" << filename; LOG(INFO) << "Creating file reader" << filename;
} }
bool HasNext() const override { return scanner_.HasNext(); }
void ReInit() override { scanner_.Reset(); } void ReInit() override { scanner_.Reset(); }
protected: protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override { void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
if (ThreadSafe) { if (ThreadSafe) {
std::lock_guard<std::mutex> guard(*mutex_); std::lock_guard<std::mutex> guard(*mutex_);
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_); *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
} else { } else {
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_); *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
} }
} }
...@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { ...@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
const auto& ranks = Attr<std::vector<int>>("ranks"); const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()), static_cast<int>(shape_concat.size()),
"The accumulate of all ranks should be equal to the " "The accumulate of all ranks should be equal to the "
"shape concat's length."); "shape concat's length.");
std::string filename = Attr<std::string>("filename"); std::string filename = Attr<std::string>("filename");
......
...@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader { ...@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader {
std::random_device device; std::random_device device;
seed_ = device(); seed_ = device();
} }
ReadIntoBuffers(); ReloadBuffer();
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override { void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!");
}
if (iteration_pos_ >= buffer_.size()) { if (iteration_pos_ >= buffer_.size()) {
VLOG(10) << "Resetting shuffle buffer"; VLOG(10) << "Resetting shuffle buffer";
ReadIntoBuffers(); ReloadBuffer();
if (buffer_.empty()) {
return;
}
} }
*out = buffer_[iteration_pos_++]; *out = buffer_[iteration_pos_++];
} }
bool HasNext() const override {
return iteration_pos_ < buffer_.size() || reader_->HasNext();
}
private: private:
void ReadIntoBuffers() { void ReloadBuffer() {
buffer_.clear(); buffer_.clear();
buffer_.reserve(buffer_size_); buffer_.reserve(buffer_size_);
iteration_pos_ = 0; iteration_pos_ = 0;
for (size_t i = 0; i < buffer_size_; ++i) { for (size_t i = 0; i < buffer_size_; ++i) {
if (!reader_->HasNext()) { std::vector<framework::LoDTensor> ins;
reader_->ReadNext(&ins);
if (ins.empty()) {
break; break;
} }
buffer_.emplace_back(); buffer_.emplace_back(ins);
reader_->ReadNext(&buffer_.back());
} }
std::mt19937 g(seed_); std::mt19937 g(seed_);
std::shuffle(buffer_.begin(), buffer_.end(), g); std::shuffle(buffer_.begin(), buffer_.end(), g);
......
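The ShuffleReader above now treats an empty batch from the underlying reader as the end of data and stops refilling. A self-contained sketch of the same buffered-shuffle loop over a plain vector source (the seed and buffer size are chosen only for illustration):

#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::vector<int> source(10);
  for (int i = 0; i < 10; ++i) source[i] = i;
  size_t read_pos = 0;
  const size_t buffer_size = 4;

  std::vector<int> buffer;
  size_t iteration_pos = 0;
  std::mt19937 g(12345);                 // fixed seed, like the op's "seed" attribute

  auto reload_buffer = [&] {
    buffer.clear();
    iteration_pos = 0;
    while (buffer.size() < buffer_size && read_pos < source.size())
      buffer.push_back(source[read_pos++]);  // an empty read would stop the refill
    std::shuffle(buffer.begin(), buffer.end(), g);
  };

  reload_buffer();
  while (true) {
    if (iteration_pos >= buffer.size()) {
      reload_buffer();
      if (buffer.empty()) break;         // the underlying source is exhausted
    }
    std::printf("%d ", buffer[iteration_pos++]);
  }
  std::printf("\n");
  return 0;
}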
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class ThreadedReader : public framework::DecoratedReader {
public:
ThreadedReader(ReaderBase* reader, bool safe_mode)
: DecoratedReader(reader), safe_mode_(safe_mode) {}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
std::lock_guard<std::mutex> lock(mutex_);
reader_->ReadNext(out);
}
void ReInit() override {
if (safe_mode_) {
PADDLE_THROW(
"ThreadedReader::ReInit() is disabled when 'safe_mode' is true.");
}
VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in "
"multi-thread environment.";
reader_->ReInit();
}
private:
bool safe_mode_;
std::mutex mutex_;
};
class CreateThreadedReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
bool safe_mode = Attr<bool>("safe_mode");
out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode));
}
};
class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<bool>("safe_mode",
"When 'safe_mode' is true, 'ReInit()' is disabled to avoid "
"unexpected bugs in multi-thread environment.")
.SetDefault(true);
AddComment(R"DOC(
CreateThreadedReader Operator
This operator creates a threaded reader. A threaded reader's
'ReadNext()' can be invoked by several threads at the same
time.
When the attribute 'safe_mode' is true, the threaded reader's
      'ReInit()' is disabled to avoid unexpected bugs in a multi-threaded
      environment.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
reader::CreateThreadedReaderOp,
reader::CreateThreadedReaderOpMaker);
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <thread> // NOLINT
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/operators/reader/reader_op_registry.h"
...@@ -19,38 +21,23 @@ namespace paddle { ...@@ -19,38 +21,23 @@ namespace paddle {
namespace operators { namespace operators {
namespace reader { namespace reader {
class MultipleReader : public framework::ReaderBase { class MultiFileReader : public framework::ReaderBase {
public: public:
class ThreadBufferMap { MultiFileReader(const std::vector<std::string>& file_names,
public: const std::vector<framework::DDim>& dims, size_t thread_num,
std::vector<framework::LoDTensor>& operator[]( size_t buffer_size)
const std::thread::id& thread_id) { : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
std::lock_guard<std::mutex> lock(mutex_);
return buffer_[thread_id];
}
void Clear() { buffer_.clear(); }
private:
std::mutex mutex_;
std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
buffer_;
};
MultipleReader(const std::vector<std::string>& file_names,
const std::vector<framework::DDim>& dims, size_t thread_num)
: file_names_(file_names), dims_(dims) {
prefetchers_.resize(thread_num); prefetchers_.resize(thread_num);
StartNewScheduler(); StartNewScheduler();
} }
void ReadNext(std::vector<framework::LoDTensor>* out) override; void ReadNext(std::vector<framework::LoDTensor>* out) override;
bool HasNext() const override;
void ReInit() override; void ReInit() override;
~MultipleReader() { EndScheduler(); } ~MultiFileReader() { EndScheduler(); }
private: private:
bool HasNext();
void StartNewScheduler(); void StartNewScheduler();
void EndScheduler(); void EndScheduler();
void ScheduleThreadFunc(); void ScheduleThreadFunc();
...@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase { ...@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase {
std::vector<framework::DDim> dims_; std::vector<framework::DDim> dims_;
std::thread scheduler_; std::thread scheduler_;
std::vector<std::thread> prefetchers_; std::vector<std::thread> prefetchers_;
size_t buffer_size_;
framework::Channel<size_t>* waiting_file_idx_; framework::Channel<size_t>* waiting_file_idx_;
framework::Channel<size_t>* available_thread_idx_; framework::Channel<size_t>* available_thread_idx_;
framework::Channel<std::vector<framework::LoDTensor>>* buffer_; framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
mutable ThreadBufferMap thread_buffer_map_;
}; };
void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) { void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) { out->clear();
PADDLE_THROW("There is no next data!"); if (HasNext()) {
buffer_->Receive(out);
} }
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
*out = thread_local_buffer;
thread_local_buffer.clear();
}
bool MultipleReader::HasNext() const {
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
return thread_local_buffer.empty() ? buffer_->Receive(&thread_local_buffer)
: true;
} }
void MultipleReader::ReInit() { void MultiFileReader::ReInit() {
EndScheduler(); EndScheduler();
thread_buffer_map_.Clear();
StartNewScheduler(); StartNewScheduler();
} }
void MultipleReader::StartNewScheduler() { bool MultiFileReader::HasNext() {
while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
}
return buffer_->CanReceive();
}
void MultiFileReader::StartNewScheduler() {
size_t thread_num = prefetchers_.size(); size_t thread_num = prefetchers_.size();
waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size()); waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
available_thread_idx_ = framework::MakeChannel<size_t>(thread_num); available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
buffer_ = buffer_ =
framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num); framework::MakeChannel<std::vector<framework::LoDTensor>>(buffer_size_);
for (size_t i = 0; i < file_names_.size(); ++i) { for (size_t i = 0; i < file_names_.size(); ++i) {
waiting_file_idx_->Send(&i); waiting_file_idx_->Send(&i);
...@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() { ...@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() {
scheduler_ = std::thread([this] { ScheduleThreadFunc(); }); scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
} }
void MultipleReader::EndScheduler() { void MultiFileReader::EndScheduler() {
available_thread_idx_->Close(); available_thread_idx_->Close();
buffer_->Close(); buffer_->Close();
waiting_file_idx_->Close(); waiting_file_idx_->Close();
...@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() { ...@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() {
delete waiting_file_idx_; delete waiting_file_idx_;
} }
void MultipleReader::ScheduleThreadFunc() { void MultiFileReader::ScheduleThreadFunc() {
VLOG(5) << "MultipleReader schedule thread starts."; VLOG(5) << "MultiFileReader schedule thread starts.";
size_t completed_thread_num = 0; size_t completed_thread_num = 0;
size_t thread_idx; size_t thread_idx;
while (available_thread_idx_->Receive(&thread_idx)) { while (available_thread_idx_->Receive(&thread_idx)) {
...@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() { ...@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() {
p.join(); p.join();
} }
} }
VLOG(5) << "MultipleReader schedule thread terminates."; VLOG(5) << "MultiFileReader schedule thread terminates.";
} }
void MultipleReader::PrefetchThreadFunc(std::string file_name, void MultiFileReader::PrefetchThreadFunc(std::string file_name,
size_t thread_idx) { size_t thread_idx) {
VLOG(5) << "The prefetch thread of file '" << file_name << "' starts."; VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
std::unique_ptr<framework::ReaderBase> reader = std::unique_ptr<framework::ReaderBase> reader =
CreateReaderByFileName(file_name, dims_); CreateReaderByFileName(file_name, dims_);
while (reader->HasNext()) { while (true) {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) {
break;
}
try { try {
buffer_->Send(&ins); buffer_->Send(&ins);
} catch (paddle::platform::EnforceNotMet e) { } catch (paddle::platform::EnforceNotMet e) {
...@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase { ...@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase {
const auto& file_names = Attr<std::vector<std::string>>("file_names"); const auto& file_names = Attr<std::vector<std::string>>("file_names");
PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
const size_t thread_num = Attr<int>("thread_num"); const size_t thread_num = Attr<int>("thread_num");
const size_t buffer_size = Attr<int>("buffer_size");
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
out->Reset(new MultipleReader( out->Reset(new MultiFileReader(file_names,
file_names, RestoreShapes(shape_concat, ranks), thread_num)); RestoreShapes(shape_concat, ranks),
thread_num, buffer_size));
} }
}; };
...@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase { ...@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
AddAttr<std::vector<std::string>>("file_names", "Files to be read."); AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.") AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
.GreaterThan(0); .GreaterThan(0);
AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
AddComment(R"DOC( AddComment(R"DOC(
OpenFiles Operator OpenFiles Operator
An OpenFilesOp creates a MultipleReader, which is able to An OpenFilesOp creates a MultiFileReader, which is able to
read data multi-threaded from multiple files. read data multi-threaded from multiple files.
)DOC"); )DOC");
} }
......
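The MultiFileReader above hands file indices to a fixed pool of prefetch threads and merges their output through one channel, now bounded by the new buffer_size attribute. A stripped-down sketch of that fan-in, with hypothetical file names, an atomic counter in place of the scheduling channel, and an unbounded deque in place of the bounded buffer:

#include <atomic>
#include <cstdio>
#include <deque>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

int main() {
  // Hypothetical file list; pretend every "file" holds two batches.
  std::vector<std::string> file_names = {"part-0.recordio", "part-1.recordio",
                                         "part-2.recordio"};
  std::atomic<size_t> next_file{0};
  std::deque<std::string> buffer;        // merged stream seen by the consumer
  std::mutex mu;

  auto prefetch = [&] {
    size_t idx;
    while ((idx = next_file.fetch_add(1)) < file_names.size()) {
      for (int b = 0; b < 2; ++b) {
        std::lock_guard<std::mutex> lock(mu);
        buffer.push_back(file_names[idx] + "#batch" + std::to_string(b));
      }
    }
  };

  std::vector<std::thread> prefetchers;
  for (int t = 0; t < 2; ++t) prefetchers.emplace_back(prefetch);
  for (auto& p : prefetchers) p.join();

  for (const auto& batch : buffer) std::printf("%s\n", batch.c_str());
  return 0;
}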
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/spp_op.h" #include "paddle/fluid/operators/spp_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/math/pooling.h"
......
...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/operators/sum_op.h"
#include <algorithm>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_inference.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/transpose_op.h"
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/unpool_op.h" #include "paddle/fluid/operators/unpool_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/unpooling.h" #include "paddle/fluid/operators/math/unpooling.h"
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_padding.h"
......
...@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512; ...@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
USE_CUDA_ATOMIC(Add, float); USE_CUDA_ATOMIC(Add, float);
USE_CUDA_ATOMIC(Add, int); USE_CUDA_ATOMIC(Add, int);
USE_CUDA_ATOMIC(Add, unsigned int); USE_CUDA_ATOMIC(Add, unsigned int);
USE_CUDA_ATOMIC(Add, unsigned long long int); // CUDA API uses unsigned long long int, we cannot use uint64_t here.
// This is because unsigned long long int is not necessarily uint64_t.
USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT
CUDA_ATOMIC_WRAPPER(Add, int64_t) { CUDA_ATOMIC_WRAPPER(Add, int64_t) {
static_assert(sizeof(int64_t) == sizeof(long long int), // Here, we check that long long int is in fact int64_t.
static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT
"long long should be int64"); "long long should be int64");
return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address), return CudaAtomicAdd(
static_cast<unsigned long long int>(val)); reinterpret_cast<unsigned long long int*>(address), // NOLINT
static_cast<unsigned long long int>(val)); // NOLINT
} }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
USE_CUDA_ATOMIC(Add, double); USE_CUDA_ATOMIC(Add, double);
#else #else
CUDA_ATOMIC_WRAPPER(Add, double) { CUDA_ATOMIC_WRAPPER(Add, double) {
unsigned long long int* address_as_ull = unsigned long long int* address_as_ull = // NOLINT
reinterpret_cast<unsigned long long int*>(address); reinterpret_cast<unsigned long long int*>(address); // NOLINT
unsigned long long int old = *address_as_ull, assumed; unsigned long long int old = *address_as_ull, assumed; // NOLINT
do { do {
assumed = old; assumed = old;
...@@ -62,53 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) { ...@@ -62,53 +66,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
} }
#endif #endif
// __shfl_down has been deprecated as of CUDA 9.0.
#if CUDA_VERSION < 9000
template <typename T>
__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
return __shfl_down(val, delta);
}
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// TODO(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
__shared__ T shm[32];
const int warpSize = 32;
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
if (tid < warpSize) shm[tid] = 0;
__syncthreads();
if (tid % warpSize == 0) {
shm[tid / warpSize] = val;
}
CREATE_SHFL_MASK(mask, tid < warpSize);
if (tid < warpSize) {
val = shm[tid];
for (int offset = warpSize / 2; offset > 0; offset /= 2)
val += __shfl_down_sync(mask, val, offset);
}
return val;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
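The pre-sm_60 double atomicAdd above is the classic compare-and-swap retry loop. The same pattern on the host with std::atomic<double> (a sketch, not Paddle code) shows why the loop terminates: the exchange only succeeds when no other thread has changed the value between the read and the write, otherwise the loop re-reads and retries.

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

double atomic_add(std::atomic<double>* address, double val) {
  double assumed = address->load();
  double desired;
  do {
    desired = assumed + val;
    // On failure, compare_exchange_weak reloads 'assumed' with the current value.
  } while (!address->compare_exchange_weak(assumed, desired));
  return assumed;  // like atomicCAS, return the value observed before the add
}

int main() {
  std::atomic<double> sum{0.0};
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&] {
      for (int i = 0; i < 1000; ++i) atomic_add(&sum, 0.5);
    });
  for (auto& w : workers) w.join();
  std::printf("sum = %f\n", sum.load());  // 4 * 1000 * 0.5 = 2000
  return 0;
}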
...@@ -8,10 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,10 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <string>
#include <unordered_set> #include <unordered_set>
#include <vector>
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -8,13 +8,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -8,13 +8,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
......
...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h"
TEST(Device, Init) { TEST(Device, Init) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
......
...@@ -11,15 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,15 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include <google/protobuf/text_format.h>
#include <deque>
#include <fstream> #include <fstream>
#include <map> #include <map>
#include <mutex> #include <mutex> // NOLINT
#include <numeric> #include <numeric>
#include <thread> #include <string>
#include <thread> // NOLINT
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -123,7 +127,7 @@ void DisableActivity() { ...@@ -123,7 +127,7 @@ void DisableActivity() {
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) { size_t *maxNumRecords) {
uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize); uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
*size = kBufSize; *size = kBufSize;
*buffer = ALIGN_BUFFER(buf, kAlignSize); *buffer = ALIGN_BUFFER(buf, kAlignSize);
*maxNumRecords = 0; *maxNumRecords = 0;
......
...@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
......
...@@ -28,6 +28,10 @@ CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); ...@@ -28,6 +28,10 @@ CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif #endif
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -71,7 +71,6 @@ extern void *cublas_dso_handle; ...@@ -71,7 +71,6 @@ extern void *cublas_dso_handle;
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \ __macro(cublasHgemm); \
__macro(cublasSgemmEx); \ __macro(cublasSgemmEx); \
__macro(cublasGemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -83,11 +82,6 @@ extern void *cublas_dso_handle; ...@@ -83,11 +82,6 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmBatched); \ __macro(cublasDgemmBatched); \
__macro(cublasCgemmBatched); \ __macro(cublasCgemmBatched); \
__macro(cublasZgemmBatched); \ __macro(cublasZgemmBatched); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
...@@ -95,10 +89,24 @@ extern void *cublas_dso_handle; ...@@ -95,10 +89,24 @@ extern void *cublas_dso_handle;
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 8.0
#if CUDA_VERSION >= 8000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(cublasGemmEx); \
__macro(cublasSgemmStridedBatched); \
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
// APIs available after CUDA 9.0 // APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000 #if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode); #define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif #endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
......
...@@ -1003,6 +1003,46 @@ HOSTDEVICE inline float16 exp(const float16& a) { ...@@ -1003,6 +1003,46 @@ HOSTDEVICE inline float16 exp(const float16& a) {
return float16(::expf(static_cast<float>(a))); return float16(::expf(static_cast<float>(a)));
} }
template <>
HOSTDEVICE inline float16 log(const float16& a) {
return float16(::logf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 tanh(const float16& a) {
return float16(::tanhf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 sqrt(const float16& a) {
return float16(::sqrtf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 ceil(const float16& a) {
return float16(::ceilf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 floor(const float16& a) {
return float16(::floorf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 round(const float16& a) {
return float16(::roundf(static_cast<float>(a)));
}
template <>
HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
}
template <>
HOSTDEVICE inline float16 abs(const float16& a) {
return float16(::fabs(static_cast<float>(a)));
}
} // namespace numext } // namespace numext
} // namespace Eigen } // namespace Eigen
...@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <mkldnn.hpp> #include <vector>
#include "mkldnn/include/mkldnn.hpp"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
......
...@@ -14,8 +14,9 @@ ...@@ -14,8 +14,9 @@
#pragma once #pragma once
#include <thread> #include <thread> // NOLINT
#include <typeindex> #include <typeindex>
#include <vector>
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { ...@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
return ncclDouble; return ncclDouble;
} else if (type == typeid(int)) { // NOLINT } else if (type == typeid(int)) { // NOLINT
return ncclInt; return ncclInt;
} else if (type == typeid(int64_t)) { // NOLINT
return ncclInt64;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
...@@ -58,7 +61,7 @@ struct NCCLContext { ...@@ -58,7 +61,7 @@ struct NCCLContext {
ncclComm_t comm_; ncclComm_t comm_;
explicit NCCLContext(int dev_id) explicit NCCLContext(int dev_id)
: ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
cudaStream_t stream() const { return ctx_->stream(); } cudaStream_t stream() const { return ctx_->stream(); }
...@@ -66,23 +69,23 @@ struct NCCLContext { ...@@ -66,23 +69,23 @@ struct NCCLContext {
return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device; return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
} }
static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts, static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
const std::vector<platform::Place> &places) { const std::vector<platform::Place> &places) {
std::vector<ncclComm_t> comms; std::vector<ncclComm_t> comms;
std::vector<int> devs; std::vector<int> devs;
comms.resize(contexts.size()); comms.resize(contexts->size());
devs.reserve(contexts.size()); devs.reserve(contexts->size());
for (auto &p : places) { for (auto &p : places) {
devs.push_back(boost::get<platform::CUDAPlace>(p).device); devs.push_back(boost::get<platform::CUDAPlace>(p).device);
} }
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(contexts.size()), &devs[0])); &comms[0], static_cast<int>(contexts->size()), &devs[0]));
int i = 0; int i = 0;
for (auto &dev_id : devs) { for (auto &dev_id : devs) {
contexts.at(dev_id).comm_ = comms[i++]; contexts->at(dev_id).comm_ = comms[i++];
} }
} }
}; };
...@@ -91,7 +94,8 @@ struct NCCLContextMap { ...@@ -91,7 +94,8 @@ struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_; std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_; std::vector<int> order_;
NCCLContextMap(const std::vector<platform::Place> &places) { explicit NCCLContextMap(const std::vector<platform::Place> &places) {
PADDLE_ENFORCE(!places.empty());
order_.reserve(places.size()); order_.reserve(places.size());
for (auto &p : places) { for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device; int dev_id = boost::get<CUDAPlace>(p).device;
...@@ -102,15 +106,17 @@ struct NCCLContextMap { ...@@ -102,15 +106,17 @@ struct NCCLContextMap {
order_.size(), contexts_.size(), order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
std::vector<ncclComm_t> comms; if (places.size() > 1) {
comms.resize(order_.size()); std::vector<ncclComm_t> comms;
comms.resize(order_.size());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
&comms[0], static_cast<int>(order_.size()), &order_[0])); &comms[0], static_cast<int>(order_.size()), &order_[0]));
int i = 0; int i = 0;
for (auto &dev_id : order_) { for (auto &dev_id : order_) {
contexts_.at(dev_id).comm_ = comms[i++]; contexts_.at(dev_id).comm_ = comms[i++];
}
} }
} }
......
...@@ -15,8 +15,11 @@ limitations under the License. */ ...@@ -15,8 +15,11 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <sys/time.h> #include <sys/time.h>
#include <time.h> #include <time.h>
#include <algorithm>
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#include <mutex> // NOLINT
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -28,10 +31,10 @@ limitations under the License. */ ...@@ -28,10 +31,10 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler used, CUDA or CPU.
static std::string g_profiler_place = "";
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
// The thread index of each thread // The thread index of each thread
static thread_local int32_t g_thread_id; static thread_local int32_t g_thread_id;
...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists; ...@@ -45,6 +48,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock, std::chrono::high_resolution_clock,
...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() { ...@@ -60,9 +96,9 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) { if (has_cuda_) {
...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id, ...@@ -76,17 +112,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
std::string Event::kind() const { const EventType& Event::type() const { return type_; }
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
...@@ -129,15 +155,15 @@ inline EventList& GetEventList() { ...@@ -129,15 +155,15 @@ inline EventList& GetEventList() {
} }
void Mark(const std::string& name, const DeviceContext* dev_ctx) { void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
} }
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
} }
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -197,12 +223,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
g_state = state; g_state = state;
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kAll) {
g_profiler_place = "CUDA";
} else if (g_state == ProfilerState::kCPU) {
g_profiler_place = "CPU";
} else {
g_profiler_place = "All";
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -240,27 +261,63 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
void DisableProfiler(EventSortingKey sorted_key, // The information of each event given in the profiling report
const std::string& profile_path) { struct EventItem {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, std::string name;
"Can't disable profiling, since it's not starting."); int calls;
// Mark the profiling stop. double total_time;
Mark("_stop_profiler_", nullptr); double min_time;
g_state = ProfilerState::kDisabled; double max_time;
double ave_time;
};
// Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::string place;
if (g_state == ProfilerState::kCPU) {
place = "CPU";
} else if (g_state == ProfilerState::kCUDA) {
place = "CUDA";
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW("Invalid profiler state");
}
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::cout << "Place: " << place << std::endl;
ParseEvents(all_events, sorted_key); std::cout << "Time unit: ms" << std::endl;
ResetProfiler(); std::cout << "Sorted by " << sorted_domain
DeviceTracer* tracer = GetDeviceTracer(); << " in descending order in the same thread\n\n";
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { // Output events table
tracer->Disable(); std::cout.setf(std::ios::left);
tracer->GenProfile(profile_path); std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl;
} }
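From the Python side this report is normally produced by wrapping a run in the profiler context manager; the sketch below assumes the fluid.profiler.profiler(state, sorted_key, profile_path) signature and a CPU-only run:

import numpy
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# A tiny program so the example stays self-contained.
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y = fluid.layers.fc(input=x, size=1)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

# 'CPU' selects the CPU timers; 'total' sorts the report by total time.
with profiler.profiler('CPU', 'total', '/tmp/profile'):
    for _ in range(10):
        exe.run(feed={'x': numpy.ones((8, 1), dtype='float32')},
                fetch_list=[y])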
void ParseEvents(std::vector<std::vector<Event>>& events, // Parse the event list and output the profiling report
EventSortingKey sorted_by) { void ParseEvents(const std::vector<std::vector<Event>>& events,
if (g_profiler_place == "") return; EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem&, const EventItem&)> sorted_func;
...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -307,9 +364,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
std::unordered_map<std::string, int> event_idx; std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) { for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") { if (events[i][j].type() == EventType::kPushRange) {
pushed_events.push_back(events[i][j]); pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") { } else if (events[i][j].type() == EventType::kPopRange) {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin(); std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() && while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) { rit->name() != events[i][j].name()) {
...@@ -317,10 +374,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -317,10 +374,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = double event_time = (g_state == ProfilerState::kCUDA ||
(g_profiler_place == "CUDA" || g_profiler_place == "All") g_state == ProfilerState::kAll)
? rit->CudaElapsedMs(events[i][j]) ? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]); : rit->CpuElapsedMs(events[i][j]);
std::string event_name = std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -376,35 +433,22 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
} }
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table, void DisableProfiler(EventSortingKey sorted_key,
std::string& sorted_domain, const size_t name_width, const std::string& profile_path) {
const size_t data_width) { PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
// Output header information "Can't disable profiling, since it's not starting.");
std::cout << "\n------------------------->" // Mark the profiling stop.
<< " Profiling Report " Mark("_stop_profiler_", nullptr);
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl; std::vector<std::vector<Event>> all_events = GetAllEvents();
std::cout << "Time unit: ms" << std::endl; ParseEvents(all_events, sorted_key);
std::cout << "Sorted by " << sorted_domain ResetProfiler();
<< " in descending order in the same thread\n\n"; DeviceTracer* tracer = GetDeviceTracer();
// Output events table if (g_state == ProfilerState::kAll && tracer && tracer->IsEnabled()) {
std::cout.setf(std::ios::left); tracer->Disable();
std::cout << std::setw(name_width) << "Event" << std::setw(data_width) tracer->GenProfile(profile_path);
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
} }
std::cout << std::endl; g_state = ProfilerState::kDisabled;
} }
} // namespace platform } // namespace platform
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <mutex> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -23,16 +23,16 @@ limitations under the License. */ ...@@ -23,16 +23,16 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
enum EventKind { kMark, kPushRange, kPopRange }; enum EventType { kMark, kPushRange, kPopRange };
class Event { class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx); const DeviceContext* dev_ctx);
std::string kind() const; const EventType& type() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
...@@ -46,7 +46,7 @@ class Event { ...@@ -46,7 +46,7 @@ class Event {
double CudaElapsedMs(const Event& e) const; double CudaElapsedMs(const Event& e) const;
private: private:
EventKind kind_; EventType type_;
std::string name_; std::string name_;
uint32_t thread_id_; uint32_t thread_id_;
int64_t cpu_ns_; int64_t cpu_ns_;
...@@ -57,39 +57,6 @@ class Event { ...@@ -57,39 +57,6 @@ class Event {
#endif #endif
}; };
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks;
};
enum ProfilerState { enum ProfilerState {
kDisabled, // disabled state kDisabled, // disabled state
kCPU, // CPU profiling state kCPU, // CPU profiling state
...@@ -136,16 +103,6 @@ struct RecordThread { ...@@ -136,16 +103,6 @@ struct RecordThread {
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
...@@ -158,14 +115,5 @@ void ResetProfiler(); ...@@ -158,14 +115,5 @@ void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path); const std::string& profile_path);
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and ...@@ -13,22 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "cuda_runtime.h" #include <cuda_runtime.h>
#endif #endif
#include "gtest/gtest.h" #include "gtest/gtest.h"
TEST(Event, CpuElapsedTime) { TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
Event start_event(EventKind::kPushRange, "test", 0, nullptr); Event start_event(EventType::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false); EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, nullptr); Event stop_event(EventType::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) { ...@@ -38,16 +39,16 @@ TEST(Event, CudaElapsedTime) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true); EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
} }
#endif #endif
...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) { ...@@ -55,7 +56,7 @@ TEST(Event, CudaElapsedTime) {
TEST(RecordEvent, RecordEvent) { TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventType;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
......
...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -252,7 +252,6 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "") py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("has_next", &framework::ReaderHolder::HasNext)
.def("reset", &framework::ReaderHolder::ReInit); .def("reset", &framework::ReaderHolder::ReInit);
py::class_<Scope>(m, "Scope", "") py::class_<Scope>(m, "Scope", "")
...@@ -465,7 +464,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -465,7 +464,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG); m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices); m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -553,6 +553,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -553,6 +553,7 @@ All parameter, weight, gradient are variables in Paddle.
bcast_vars, main_program, loss_var_name, bcast_vars, main_program, loss_var_name,
scope, local_scopes, allow_op_delay); scope, local_scopes, allow_op_delay);
}) })
.def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
.def("local_scopes", .def("local_scopes",
[](ParallelExecutor &self) -> std::vector<Scope *> * { [](ParallelExecutor &self) -> std::vector<Scope *> * {
return &self.GetLocalScopes(); return &self.GetLocalScopes();
......
...@@ -39,7 +39,7 @@ class RecordIOWriter { ...@@ -39,7 +39,7 @@ class RecordIOWriter {
void CompleteAppendTensor() { void CompleteAppendTensor() {
auto& ctx = auto& ctx =
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
framework::WriteToRecordIO(writer_, tensors_, ctx); framework::WriteToRecordIO(&writer_, tensors_, ctx);
tensors_.clear(); tensors_.clear();
} }
......
...@@ -41,6 +41,6 @@ int main(int argc, char** argv) { ...@@ -41,6 +41,6 @@ int main(int argc, char** argv) {
paddle::memory::Used(paddle::platform::CUDAPlace(0)); paddle::memory::Used(paddle::platform::CUDAPlace(0));
#endif #endif
paddle::framework::InitDevices(); paddle::framework::InitDevices(true);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }
...@@ -29,6 +29,7 @@ import optimizer ...@@ -29,6 +29,7 @@ import optimizer
import backward import backward
import regularizer import regularizer
import average import average
import metrics
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
...@@ -85,6 +86,8 @@ def __bootstrap__(): ...@@ -85,6 +86,8 @@ def __bootstrap__():
import core import core
import os import os
in_test = 'unittest' in sys.modules
try: try:
num_threads = int(os.getenv('OMP_NUM_THREADS', '1')) num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
except ValueError: except ValueError:
...@@ -109,8 +112,11 @@ def __bootstrap__(): ...@@ -109,8 +112,11 @@ def __bootstrap__():
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
core.init_devices() # don't init_p2p when in unittest to save time.
core.init_devices(not in_test)
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
# Consider paddle.init(args) or paddle.main(args)
layers.monkey_patch_variable() layers.monkey_patch_variable()
__bootstrap__() __bootstrap__()
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import warnings
""" """
Class of all kinds of Average. Class of all kinds of Average.
...@@ -22,6 +23,8 @@ import numpy as np ...@@ -22,6 +23,8 @@ import numpy as np
wrappers of Python functions. wrappers of Python functions.
""" """
__all__ = ["WeightedAverage"]
def _is_number_(var): def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance( return isinstance(var, int) or isinstance(var, float) or (isinstance(
...@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var): ...@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var):
class WeightedAverage(object): class WeightedAverage(object):
def __init__(self): def __init__(self):
warnings.warn(
"The %s is deprecated, please use fluid.metrics.Accuracy instead." %
(self.__class__.__name__), Warning)
self.reset() self.reset()
def reset(self): def reset(self):
......
...@@ -102,6 +102,8 @@ def split_dense_variable(var_list, ...@@ -102,6 +102,8 @@ def split_dense_variable(var_list,
the parameter server side can gain better performance. By default the parameter server side can gain better performance. By default
minimum block size is 1024. The max block size is used to prevent minimum block size is 1024. The max block size is used to prevent
very large blocks that may cause send error. very large blocks that may cause send error.
:return: A list of VarBlocks. Each VarBlock specifies a shard of
the var.
""" """
blocks = [] blocks = []
for var in var_list: for var in var_list:
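The VarBlocks returned here are later consumed as colon-separated strings (see the b.split(":") calls in the transpile steps below); a toy illustration, assuming the third field is the block size:

# Hypothetical VarBlock string of the form "varname:block_id:block_size".
block_str = "fc_0.w_0:1:262144"
varname, block_id, size = block_str.split(":")
assert varname == "fc_0.w_0" and int(block_id) == 1 and int(size) == 262144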
...@@ -192,22 +194,24 @@ class DistributeTranspiler: ...@@ -192,22 +194,24 @@ class DistributeTranspiler:
self.trainer_id = trainer_id self.trainer_id = trainer_id
pserver_endpoints = pservers.split(",") pserver_endpoints = pservers.split(",")
# step1 # step1: For large parameters and gradients, split them into smaller
# blocks.
param_list = [pg[0] for pg in params_grads] param_list = [pg[0] for pg in params_grads]
grad_list = [pg[1] for pg in params_grads] grad_list = [pg[1] for pg in params_grads]
grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints)) grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
param_blocks = split_dense_variable(param_list, len(pserver_endpoints)) param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
# step2 # step2: Create new vars for the parameters and gradients blocks and
# add ops to do the split.
grad_var_mapping = self._append_split_op(program, grad_blocks) grad_var_mapping = self._append_split_op(program, grad_blocks)
# step3 param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
# step3: Add gradients as send op inputs and parameters as send
# op outputs.
send_inputs = [] send_inputs = []
send_outputs = [] send_outputs = []
for b in grad_blocks: # append by order for b in grad_blocks: # append by order
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_inputs.append(grad_var_mapping[varname][int(block_id)]) send_inputs.append(grad_var_mapping[varname][int(block_id)])
param_var_mapping = self._create_vars_from_blocklist(program,
param_blocks)
for b in param_blocks: for b in param_blocks:
varname, block_id, _ = b.split(":") varname, block_id, _ = b.split(":")
send_outputs.append(param_var_mapping[varname][int(block_id)]) send_outputs.append(param_var_mapping[varname][int(block_id)])
...@@ -237,7 +241,7 @@ class DistributeTranspiler: ...@@ -237,7 +241,7 @@ class DistributeTranspiler:
"RPCClient": rpc_client_var}, "RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4: Concat the parameters splits together after recv.
for varname, splited_var in param_var_mapping.iteritems(): for varname, splited_var in param_var_mapping.iteritems():
if len(splited_var) <= 1: if len(splited_var) <= 1:
continue continue
...@@ -251,6 +255,7 @@ class DistributeTranspiler: ...@@ -251,6 +255,7 @@ class DistributeTranspiler:
def get_trainer_program(self): def get_trainer_program(self):
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
self.program.global_block().delete_ops(self.optimize_ops) self.program.global_block().delete_ops(self.optimize_ops)
self.program.sync_with_cpp()
# FIXME(typhoonzero): serialize once will fix error occurs when clone. # FIXME(typhoonzero): serialize once will fix error occurs when clone.
self.program.__str__() self.program.__str__()
return self.program return self.program
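For orientation, get_trainer_program and get_pserver_program are used on the two sides of a distributed job roughly as sketched below; the transpile arguments and environment variables are assumptions for illustration, not the exact API:

import os
import paddle.fluid as fluid

t = fluid.DistributeTranspiler()
# Argument names here are assumed; see DistributeTranspiler.transpile above.
t.transpile(trainer_id=int(os.getenv("TRAINER_ID", "0")),
            pservers="127.0.0.1:6170,127.0.0.1:6171",
            trainers=2)

if os.getenv("TRAINING_ROLE") == "PSERVER":
    pserver_prog = t.get_pserver_program(os.getenv("CURRENT_ENDPOINT"))
else:
    trainer_prog = t.get_trainer_program()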
...@@ -258,13 +263,14 @@ class DistributeTranspiler: ...@@ -258,13 +263,14 @@ class DistributeTranspiler:
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
""" """
Get pserver side program using the endpoint. Get pserver side program using the endpoint.
TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
NOTE: assume blocks of the same variable is not distributed NOTE: assume blocks of the same variable is not distributed
on the same pserver, only change param/grad varnames for on the same pserver, only change param/grad varnames for
trainers to fetch. trainers to fetch.
""" """
# step1 # step1
pserver_program = Program() pserver_program = Program()
# step2 # step2: Create vars to receive vars at parameter servers.
recv_inputs = [] recv_inputs = []
for v in self.param_grad_ep_mapping[endpoint]["params"]: for v in self.param_grad_ep_mapping[endpoint]["params"]:
self._clone_var(pserver_program.global_block(), v) self._clone_var(pserver_program.global_block(), v)
...@@ -273,17 +279,21 @@ class DistributeTranspiler: ...@@ -273,17 +279,21 @@ class DistributeTranspiler:
# we don't need to create them when grad arrives. # we don't need to create them when grad arrives.
# change client side var name to origin name by # change client side var name to origin name by
# removing ".trainer_%d" suffix # removing ".trainer_%d" suffix
suff_idx = v.name.find(".trainer_") suff_idx = v.name.find(".trainer_")
if suff_idx >= 0: if suff_idx >= 0:
orig_var_name = v.name[:suff_idx] orig_var_name = v.name[:suff_idx]
else: else:
orig_var_name = v.name orig_var_name = v.name
single_trainer_var = pserver_program.global_block().create_var( # NOTE: single_trainer_var must be created for multi-trainer
name=orig_var_name, # case to merge grads from multiple trainers
persistable=True, single_trainer_var = \
type=v.type, pserver_program.global_block().create_var(
dtype=v.dtype, name=orig_var_name,
shape=v.shape) persistable=True,
type=v.type,
dtype=v.dtype,
shape=v.shape)
if self.trainers > 1: if self.trainers > 1:
for trainer_id in xrange(self.trainers): for trainer_id in xrange(self.trainers):
var = pserver_program.global_block().create_var( var = pserver_program.global_block().create_var(
...@@ -344,7 +354,7 @@ class DistributeTranspiler: ...@@ -344,7 +354,7 @@ class DistributeTranspiler:
self._append_pserver_non_opt_ops(block, op) self._append_pserver_non_opt_ops(block, op)
append_block = optimize_block append_block = optimize_block
# append lr decay ops to the child block if exits # append lr decay ops to the child block if exists
lr_ops = self._get_lr_ops() lr_ops = self._get_lr_ops()
if len(lr_ops) > 0: if len(lr_ops) > 0:
for _, op in enumerate(lr_ops): for _, op in enumerate(lr_ops):
...@@ -447,8 +457,10 @@ class DistributeTranspiler: ...@@ -447,8 +457,10 @@ class DistributeTranspiler:
block_list, block_list,
add_trainer_suffix=False): add_trainer_suffix=False):
""" """
Create vars for each split.
NOTE: only grads need to be named for different trainers, use NOTE: only grads need to be named for different trainers, use
add_trainer_suffix to rename the grad vars. add_trainer_suffix to rename the grad vars.
:return: A dict mapping from original var name to each var split.
""" """
block_map = dict() block_map = dict()
var_mapping = dict() var_mapping = dict()
...@@ -615,6 +627,7 @@ class DistributeTranspiler: ...@@ -615,6 +627,7 @@ class DistributeTranspiler:
type="sum", type="sum",
inputs={"X": vars2merge}, inputs={"X": vars2merge},
outputs={"Out": merged_var}) outputs={"Out": merged_var})
# TODO(panyx0718): What if it's SELECTED_ROWS.
if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS: if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
optimize_block.append_op( optimize_block.append_op(
type="scale", type="scale",
...@@ -638,7 +651,7 @@ class DistributeTranspiler: ...@@ -638,7 +651,7 @@ class DistributeTranspiler:
shape=param_block.shape) shape=param_block.shape)
new_inputs[key] = tmpvar new_inputs[key] = tmpvar
elif key == "LearningRate": elif key == "LearningRate":
# leraning rate variable has already be created by non-optimize op, # learning rate variable has already be created by non-optimize op,
# don't create it once again. # don't create it once again.
lr_varname = opt_op.input(key)[0] lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname): if pserver_block.vars.has_key(lr_varname):
...@@ -773,6 +786,7 @@ class DistributeTranspiler: ...@@ -773,6 +786,7 @@ class DistributeTranspiler:
return False return False
def _get_input_map_from_op(self, varmap, op): def _get_input_map_from_op(self, varmap, op):
"""Returns a dict from op input name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.input_names: for key in op.input_names:
vars = [] vars = []
...@@ -785,6 +799,7 @@ class DistributeTranspiler: ...@@ -785,6 +799,7 @@ class DistributeTranspiler:
return iomap return iomap
def _get_output_map_from_op(self, varmap, op): def _get_output_map_from_op(self, varmap, op):
"""Returns a dict from op output name to the vars in varmap."""
iomap = dict() iomap = dict()
for key in op.output_names: for key in op.output_names:
vars = [] vars = []
...@@ -812,6 +827,7 @@ class DistributeTranspiler: ...@@ -812,6 +827,7 @@ class DistributeTranspiler:
find_ops.append(op) find_ops.append(op)
# make a union find struct by the ops in default_main_program # make a union find struct by the ops in default_main_program
ufind = UnionFind(block.ops) ufind = UnionFind(block.ops)
for op1 in block.ops: for op1 in block.ops:
for op2 in block.ops: for op2 in block.ops:
# NOTE: we need to skip all optimize ops, since it is connected # NOTE: we need to skip all optimize ops, since it is connected
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import warnings
import numpy as np import numpy as np
import layers import layers
...@@ -59,6 +60,9 @@ class Evaluator(object): ...@@ -59,6 +60,9 @@ class Evaluator(object):
""" """
def __init__(self, name, **kwargs): def __init__(self, name, **kwargs):
warnings.warn(
"The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
% (self.__class__.__name__, self.__class__.__name__), Warning)
self.states = [] self.states = []
self.metrics = [] self.metrics = []
self.helper = LayerHelper(name, **kwargs) self.helper = LayerHelper(name, **kwargs)
......
...@@ -818,6 +818,11 @@ class Block(object): ...@@ -818,6 +818,11 @@ class Block(object):
del self.vars[name] del self.vars[name]
self.sync_with_cpp() self.sync_with_cpp()
def remove_var(self, name):
self.sync_with_cpp()
self.desc.remove_var(name)
del self.vars[name]
def create_parameter(self, *args, **kwargs): def create_parameter(self, *args, **kwargs):
global_block = self.program.global_block() global_block = self.program.global_block()
param = Parameter(global_block, *args, **kwargs) param = Parameter(global_block, *args, **kwargs)
...@@ -838,6 +843,11 @@ class Block(object): ...@@ -838,6 +843,11 @@ class Block(object):
self.ops.insert(index, op) self.ops.insert(index, op)
return op return op
def remove_op(self, index):
self.sync_with_cpp()
self.desc.remove_op(index, index + 1)
del self.ops[index]
def delete_ops(self, ops): def delete_ops(self, ops):
# remove from cpp # remove from cpp
# FIXME(typhoonzero): remove only the first occurrence. # FIXME(typhoonzero): remove only the first occurrence.
...@@ -846,6 +856,7 @@ class Block(object): ...@@ -846,6 +856,7 @@ class Block(object):
end = list(self.ops).index(ops[-1]) end = list(self.ops).index(ops[-1])
except Exception, e: except Exception, e:
raise e raise e
self.desc.remove_op(start, end + 1) self.desc.remove_op(start, end + 1)
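A small sketch of how the new remove_var and remove_op helpers can be exercised on a block (the variable name and the removed op index are arbitrary):

import paddle.fluid as fluid

block = fluid.default_main_program().global_block()
block.create_var(name="tmp_var_to_drop", dtype="float32", shape=[1])

# Both helpers first sync with the C++ desc, then drop the Python-side entry.
block.remove_var("tmp_var_to_drop")
if len(block.ops) > 0:
    block.remove_op(len(block.ops) - 1)  # remove the last op by index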
def slice_ops(self, start, end): def slice_ops(self, start, end):
......
...@@ -18,7 +18,8 @@ import contextlib ...@@ -18,7 +18,8 @@ import contextlib
__all__ = [ __all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
'init_on_cpu' 'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
'NormalInitializer', 'XavierInitializer'
] ]
_force_init_on_cpu_ = False _force_init_on_cpu_ = False
......
...@@ -21,8 +21,7 @@ from ..executor import global_scope ...@@ -21,8 +21,7 @@ from ..executor import global_scope
__all__ = [ __all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'open_files', 'read_file', 'create_shuffle_reader', 'open_files', 'read_file', 'shuffle', 'double_buffer'
'create_double_buffer_reader', 'create_multi_pass_reader'
] ]
...@@ -237,13 +236,9 @@ def monkey_patch_reader_methods(reader): ...@@ -237,13 +236,9 @@ def monkey_patch_reader_methods(reader):
var = scope.find_var(reader.name) var = scope.find_var(reader.name)
return var.get_reader() return var.get_reader()
def eof():
return not __get_reader__().has_next()
def reset(): def reset():
return __get_reader__().reset() return __get_reader__().reset()
reader.eof = eof
reader.reset = reset reader.reset = reset
reader.stop_gradient = True reader.stop_gradient = True
reader.persistable = True reader.persistable = True
...@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op): ...@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op):
return new_op return new_op
def open_recordio_file(filename, shapes, lod_levels, dtypes): def open_recordio_file(filename,
shapes,
lod_levels,
dtypes,
pass_num=1,
for_parallel=False):
"""
Open a RecordIO file
This layer takes a RecordIO file to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from the given RecordIO file.
Args:
filename(str): The RecordIO file's name.
        shapes(list): List of tuples declaring the data shapes.
        lod_levels(list): List of ints declaring the data lod_level.
        dtypes(list): List of strs declaring the data types.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
Variable: A Reader Variable via which we can get RecordIO file data.
Examples:
.. code-block:: python
reader = fluid.layers.io.open_recordio_file(
filename='./data.recordio',
shapes=[(3,224,224), (1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.read_file(reader)
"""
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = [] shape_concat = []
ranks = [] ranks = []
...@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var.persistable = True startup_var.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
if pass_num > 1:
main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
if for_parallel:
main_prog_var = parallel(reader=main_prog_var)
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def open_files(filenames, thread_num, shapes, lod_levels, dtypes): def open_files(filenames,
shapes,
lod_levels,
dtypes,
thread_num,
buffer_size=None,
pass_num=1,
for_parallel=False):
"""
Open files
This layer takes a list of files to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from given files. All files must
    have name suffixes to indicate their formats, e.g., '*.recordio'.
Args:
filenames(list): The list of file names.
        shapes(list): List of tuples declaring the data shapes.
        lod_levels(list): List of ints declaring the data lod_level.
        dtypes(list): List of strs declaring the data types.
thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int): The size of prefetch buffer.
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Returns:
Variable: A Reader Variable via which we can get file data.
Examples:
.. code-block:: python
reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
'./data2.recordio'],
shapes=[(3,224,224), (1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=2,
buffer_size=2)
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.io.read_file(reader)
"""
if buffer_size is None:
buffer_size = thread_num
if isinstance(filenames, basestring):
filenames = [filenames]
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = [] shape_concat = []
ranks = [] ranks = []
...@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes): ...@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
shape_concat.extend(shape) shape_concat.extend(shape)
ranks.append(len(shape)) ranks.append(len(shape))
var_name = unique_name('multiple_reader') multi_file_reader_name = unique_name('multi_file_reader')
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_reader = startup_blk.create_var(name=multi_file_reader_name)
startup_blk.append_op( startup_blk.append_op(
type='open_files', type='open_files',
outputs={'Out': [startup_var]}, outputs={'Out': [startup_reader]},
attrs={ attrs={
'shape_concat': shape_concat, 'shape_concat': shape_concat,
'lod_levels': lod_levels, 'lod_levels': lod_levels,
'ranks': ranks, 'ranks': ranks,
'file_names': filenames, 'file_names': filenames,
'thread_num': thread_num 'thread_num': thread_num,
'buffer_size': buffer_size
}) })
startup_var.desc.set_dtypes(dtypes) startup_reader.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_reader.persistable = True
main_prog_var = _copy_reader_var_(default_main_program().current_block(), main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_reader)
return monkey_patch_reader_methods(main_prog_var) if pass_num > 1:
main_prog_reader = multi_pass(
reader=main_prog_reader, pass_num=pass_num)
if for_parallel:
main_prog_reader = parallel(reader=main_prog_reader)
return monkey_patch_reader_methods(main_prog_reader)
def __create_decorated_reader__(op_type, reader, attrs): def __create_shared_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
...@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs): ...@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs):
return monkey_patch_reader_methods(main_prog_var) return monkey_patch_reader_methods(main_prog_var)
def create_shuffle_reader(reader, buffer_size): def __create_unshared_decorated_reader__(op_type, reader, attrs):
return __create_decorated_reader__('create_shuffle_reader', reader, new_reader_name = unique_name(op_type)
{'buffer_size': int(buffer_size)}) main_blk = default_main_program().current_block()
new_reader = main_blk.create_var(name=new_reader_name)
main_blk.append_op(
type=op_type,
inputs={'UnderlyingReader': reader},
outputs={'Out': [new_reader]},
attrs=attrs)
new_reader.persistable = True
new_reader.stop_gradient = True
return monkey_patch_reader_methods(new_reader)
def shuffle(reader, buffer_size):
return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
def create_double_buffer_reader(reader, place=None): def double_buffer(reader, place=None):
attrs = dict() attrs = dict()
if place is not None: if place is not None:
attrs['place'] = str(place).upper() attrs['place'] = str(place).upper()
return __create_decorated_reader__('create_double_buffer_reader', reader, return __create_unshared_decorated_reader__('create_double_buffer_reader',
attrs) reader, attrs)
def multi_pass(reader, pass_num):
return __create_shared_decorated_reader__(
'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
def create_multi_pass_reader(reader, pass_num): def parallel(reader):
return __create_decorated_reader__('create_multi_pass_reader', reader, return __create_shared_decorated_reader__('create_threaded_reader', reader,
{'pass_num': int(pass_num)}) {})
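Putting the renamed decorators together, a reader pipeline can be assembled as below; the file names and shapes are placeholders:

import paddle.fluid as fluid

reader = fluid.layers.io.open_files(
    filenames=['./data1.recordio', './data2.recordio'],
    shapes=[(3, 224, 224), (1, )],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'],
    thread_num=2)

# Each decorator wraps the previous reader and returns a new reader variable.
reader = fluid.layers.io.shuffle(reader, buffer_size=64)
reader = fluid.layers.io.double_buffer(reader)
image, label = fluid.layers.io.read_file(reader)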
def read_file(file_obj): def read_file(file_obj):
......
...@@ -15,12 +15,13 @@ ...@@ -15,12 +15,13 @@
All layers just related to metric. All layers just related to metric.
""" """
import warnings
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant from ..initializer import Normal, Constant
from ..framework import Variable from ..framework import Variable
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
__all__ = ['accuracy'] __all__ = ['accuracy', 'auc']
def accuracy(input, label, k=1, correct=None, total=None): def accuracy(input, label, k=1, correct=None, total=None):
...@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None): ...@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None):
"Total": [total], "Total": [total],
}) })
return acc_out return acc_out
def auc(input, label, curve='ROC', num_thresholds=200):
warnings.warn(
"This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
but can not aggregate them and get the pass AUC, because pass \
auc can not be averaged with weighted from the minibatch auc value. \
Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
which can get every minibatch and every pass auc value.", Warning)
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
auc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
attrs={"curve": curve,
"num_thresholds": num_thresholds},
outputs={"AUC": [auc_out], })
return auc_out
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fluid Metrics
The metrics defined here are computed natively in Python.
"""
import numpy as np
import copy
import warnings
__all__ = [
'MetricBase',
'CompositeMetric',
'Accuracy',
'ChunkEvaluator',
'EditDistance',
'DetectionMAP',
'Auc',
]
def _is_numpy_(var):
return isinstance(var, (np.ndarray, np.generic))
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
var, np.ndarray) and var.shape == (1, ))
def _is_number_or_matrix_(var):
return _is_number_(var) or isinstance(var, np.ndarray)
class MetricBase(object):
"""
    Base class for all evaluators.
    Args:
        name(str): The name of the evaluator, such as "accuracy". Used to
        generate temporary variable names.
    Interface:
        Note(*): the states are the attributes without the "_" prefix.
        get_config(): return the current states and configuration.
        reset(): clear the states. If a state's type is not (int, float, np.ndarray),
        please override this method.
        update(): update the states at every minibatch.
        eval(): get the metric evaluation as a numpy value.
"""
def __init__(self, name, **kwargs):
        self._name = str(name) if name is not None else self.__class__.__name__
        self._kwargs = kwargs if kwargs is not None else dict()
self.reset()
def __str__(self):
return self._name
def reset(self):
"""
        Reset the states of the metric. The states are the attributes
        without the "_" prefix.
"""
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
for attr, value in states.iteritems():
if isinstance(value, int):
setattr(self, attr, 0)
elif isinstance(value, float):
setattr(self, attr, .0)
elif isinstance(value, (np.ndarray, np.generic)):
setattr(self, attr, np.zeros_like(value))
else:
setattr(self, attr, None)
def get_config(self):
states = {
attr: value
for attr, value in self.__dict__.iteritems()
if not attr.startswith("_")
}
config = copy.deepcopy(self._kwargs)
config.update({"name": self._name, "states": copy.deepcopy(states)})
return config
def update(self):
raise NotImplementedError()
def eval(self):
raise NotImplementedError()
class CompositeMetric(MetricBase):
"""
Compute multiple metrics in each minibatch.
    For example, merge F1, accuracy and recall into one metric.
"""
def __init__(self, name=None, **kwargs):
        super(CompositeMetric, self).__init__(name, **kwargs)
self._metrics = []
def add_metric(self, metric):
if not isinstance(metric, MetricBase):
raise ValueError("SubMetric should be inherit from MetricBase.")
self._metrics.append(metric)
def eval(self):
ans = []
for m in self._metrics:
ans.append(m.eval())
return ans
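A short usage sketch for CompositeMetric with one of the concrete metrics defined below (the update values are made up):

import paddle.fluid.metrics as metrics

comp = metrics.CompositeMetric()
acc = metrics.Accuracy()
comp.add_metric(acc)

# Sub-metrics are updated as usual; eval() returns one entry per added metric.
acc.update(value=0.75, weight=32)
accuracy_value, = comp.eval()
assert abs(accuracy_value - 0.75) < 1e-6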
class Accuracy(MetricBase):
"""
Accumulate the accuracy from minibatches and compute the average accuracy
for every pass.
Args:
name: the metrics name
Example:
minibatch_accuracy = fluid.layers.accuracy(pred, label)
accuracy_evaluator = fluid.metrics.Accuracy()
        for pass_iter in range(PASS_NUM):
            accuracy_evaluator.reset()
            for data in batches:
                loss, acc = exe.run(fetch_list=[cost, minibatch_accuracy])
                accuracy_evaluator.update(value=acc, weight=len(data))
accuracy = accuracy_evaluator.eval()
"""
def __init__(self, name=None):
super(Accuracy, self).__init__(name)
self.value = .0
self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value * weight
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy."
)
return self.value / self.weight
class ChunkEvaluator(MetricBase):
    """
    Accumulate counter numbers output by chunk_eval from mini-batches and
    compute the precision, recall and F1-score using the accumulated counter
    numbers.
    """
    def __init__(self, name=None):
        super(ChunkEvaluator, self).__init__(name)
self.num_infer_chunks = 0
self.num_label_chunks = 0
self.num_correct_chunks = 0
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
if not _is_number_or_matrix_(num_infer_chunks):
raise ValueError(
"The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_label_chunks):
raise ValueError(
"The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
)
if not _is_number_or_matrix_(num_correct_chunks):
raise ValueError(
"The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
)
self.num_infer_chunks += num_infer_chunks
self.num_label_chunks += num_label_chunks
self.num_correct_chunks += num_correct_chunks
def eval(self):
precision = float(
self.num_correct_chunks
) / self.num_infer_chunks if self.num_infer_chunks else 0
recall = float(self.num_correct_chunks
) / self.num_label_chunks if self.num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.num_correct_chunks else 0
return precision, recall, f1_score
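A quick numeric check of the formulas above, with made-up counts of 10 inferred, 8 labeled and 6 correct chunks (precision 0.6, recall 0.75, F1 = 2*0.6*0.75/(0.6+0.75) ≈ 0.667):

import paddle.fluid.metrics as metrics

chunk_metric = metrics.ChunkEvaluator()
chunk_metric.update(
    num_infer_chunks=10, num_label_chunks=8, num_correct_chunks=6)
precision, recall, f1 = chunk_metric.eval()
assert abs(precision - 0.6) < 1e-6
assert abs(recall - 0.75) < 1e-6
assert abs(f1 - 2 * 0.6 * 0.75 / (0.6 + 0.75)) < 1e-6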
class EditDistance(MetricBase):
"""
Accumulate edit distance sum and sequence number from mini-batches and
compute the average edit_distance and instance error of all batches.
Args:
name: the metrics name
Example:
edit_distance_metrics = fluid.layers.edit_distance(input, label)
distance_evaluator = fluid.metrics.EditDistance()
for epoch in PASS_NUM:
distance_evaluator.reset()
for data in batches:
loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
distance_evaluator.update(*edit_distance_metrics)
distance, instance_error = distance_evaluator.eval()
In the above example:
'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass.
"""
    def __init__(self, name=None):
super(EditDistance, self).__init__(name)
self.total_distance = .0
self.seq_num = 0
self.instance_error = 0
def update(self, distances, seq_num):
if not _is_numpy_(distances):
raise ValueError("The 'distances' must be a numpy ndarray.")
if not _is_number_(seq_num):
raise ValueError("The 'seq_num' must be a number(int, float).")
seq_right_count = np.sum(distances == 0)
total_distance = np.sum(distances)
self.seq_num += seq_num
self.instance_error += seq_num - seq_right_count
self.total_distance += total_distance
    def eval(self):
        if self.seq_num == 0:
            raise ValueError(
                "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
            )
        avg_distance = self.total_distance / self.seq_num
        avg_instance_error = self.instance_error / float(self.seq_num)
return avg_distance, avg_instance_error
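A tiny worked example of the two returned values, using made-up per-sequence distances [0, 2, 1, 0]: the average distance is 3/4 = 0.75, and 2 of the 4 sequences have a non-zero distance, so the instance error is 0.5:

import numpy as np
import paddle.fluid.metrics as metrics

dist_metric = metrics.EditDistance()
dist_metric.update(distances=np.array([0, 2, 1, 0]), seq_num=4)
avg_distance, avg_instance_error = dist_metric.eval()
assert abs(avg_distance - 0.75) < 1e-6
assert abs(avg_instance_error - 0.5) < 1e-6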
class DetectionMAP(MetricBase):
"""
Calculate the detection mean average precision (mAP).
TODO (Dang Qingqing): update the following doc.
The general steps are as follows:
1. calculate the true positive and false positive according to the input
of detection and labels.
2. calculate mAP value, support two versions: '11 point' and 'integral'.
Please get more information from the following articles:
https://sanchom.wordpress.com/tag/average-precision/
https://arxiv.org/abs/1512.02325
"""
def __init__(self, name=None):
super(DetectionMAP, self).__init__(name)
        # the current mAP value and the accumulated weight
        self.value = .0
        self.weight = .0
def update(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
self.value += value
self.weight += weight
def eval(self):
if self.weight == 0:
raise ValueError(
"There is no data in DetectionMAP Metrics. "
"Please check layers.detection_map output has added to DetectionMAP."
)
return self.value / self.weight
class Auc(MetricBase):
"""
    AUC metric for binary classification.
    Note that this metric computes the value natively in Python;
    if speed is a concern, please use fluid.layers.auc instead.
    The `auc` metric keeps four groups of counters, `true_positives`,
    `true_negatives`, `false_positives` and `false_negatives`, that are used to
    compute the AUC. To discretize the AUC curve, a linearly spaced set of
    thresholds is used to compute pairs of recall and precision values. The area
    under the ROC-curve is then computed using the height of the recall
    values by the false positive rate, while the area under the PR-curve is
    computed using the height of the precision values by the recall.
    Args:
        name: metric name
        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
        'PR' for the Precision-Recall-curve.
        num_thresholds: The number of thresholds to use when discretizing the roc
        curve.
    NOTE: only the ROC curve type is implemented in Python for now.
"""
    def __init__(self, name=None, curve='ROC', num_thresholds=200):
        super(Auc, self).__init__(name)
        self._curve = curve
        self._num_thresholds = num_thresholds
        self._epsilon = 1e-6
        # the counters are accumulated across minibatches, so start from zeros
        self.tp_list = np.zeros((num_thresholds, ))
        self.fn_list = np.zeros((num_thresholds, ))
        self.tn_list = np.zeros((num_thresholds, ))
        self.fp_list = np.zeros((num_thresholds, ))
def update(self, labels, predictions, axis=1):
if not _is_numpy_(labels):
raise ValueError("The 'labels' must be a numpy ndarray.")
if not _is_numpy_(predictions):
raise ValueError("The 'predictions' must be a numpy ndarray.")
kepsilon = 1e-7 # to account for floating point imprecisions
        num_thresholds = self._num_thresholds
        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                      for i in range(num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
        # calculate the TP, FN, TN, FP counts for every threshold
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels):
if lbl:
if predictions[i, 0] >= thresh:
tp += 1
else:
fn += 1
else:
if predictions[i, 0] >= thresh:
fp += 1
else:
tn += 1
            self.tp_list[idx_thresh] += tp
            self.fn_list[idx_thresh] += fn
            self.tn_list[idx_thresh] += tn
            self.fp_list[idx_thresh] += fp
def eval(self):
epsilon = self._epsilon
num_thresholds = self._num_thresholds
tpr = (tp_list.astype("float32") + epsilon) / (
tp_list + fn_list + epsilon)
fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
rec = (tp_list.astype("float32") + epsilon) / (
tp_list + fp_list + epsilon)
x = fpr[:num_thresholds - 1] - fpr[1:]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
auc_value = np.sum(x * y)
return auc_value
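A toy end-to-end check of the metric (scores and labels are made up; column 0 of `predictions` is assumed to hold the positive-class score, matching the `predictions[i, 0]` access above). For this ranking one positive is out-ranked by a negative, so the value comes out near 0.75:

import numpy as np
import paddle.fluid.metrics as metrics

auc_metric = metrics.Auc(name="auc", num_thresholds=200)
predictions = np.array([[0.1], [0.4], [0.35], [0.8]])
labels = np.array([0, 0, 1, 1])
auc_metric.update(labels=labels, predictions=predictions)
auc_value = auc_metric.eval()  # approximately 0.75 for this ranking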
...@@ -87,7 +87,8 @@ class ParallelExecutor(object): ...@@ -87,7 +87,8 @@ class ParallelExecutor(object):
# performance. Worth tunning for other models in the future. # performance. Worth tunning for other models in the future.
num_threads = len(self._places) num_threads = len(self._places)
else: else:
min(len(self._places) * 2, multiprocessing.cpu_count()) num_threads = min(
len(self._places) * 2, multiprocessing.cpu_count())
main = main_program main = main_program
main = main if main else framework.default_main_program() main = main if main else framework.default_main_program()
...@@ -99,9 +100,11 @@ class ParallelExecutor(object): ...@@ -99,9 +100,11 @@ class ParallelExecutor(object):
local_scopes = share_vars_from.executor.local_scopes( local_scopes = share_vars_from.executor.local_scopes(
) if share_vars_from else [] ) if share_vars_from else []
persistable_vars = [ self.persistable_vars = [
v.name v.name
for v in filter(lambda var: var.persistable, main.list_vars()) for v in filter(lambda var: \
var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
] ]
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
...@@ -112,7 +115,7 @@ class ParallelExecutor(object): ...@@ -112,7 +115,7 @@ class ParallelExecutor(object):
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
set(persistable_vars), set(self.persistable_vars),
main.desc, main.desc,
loss_name if loss_name else '', loss_name if loss_name else '',
scope, scope,
...@@ -142,3 +145,6 @@ class ParallelExecutor(object): ...@@ -142,3 +145,6 @@ class ParallelExecutor(object):
self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict) self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
def bcast_params(self):
self.executor.bcast_params(set(self.persistable_vars))
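The new bcast_params hook re-broadcasts the persistable parameters to every device, e.g. after they were changed outside the executor. A rough usage sketch; the constructor and run argument names are assumed and a CUDA build is required:

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())

pe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
pe.run(fetch_list=[loss.name],
       feed_dict={'x': np.random.rand(8, 4).astype('float32')})
pe.bcast_params()  # sync the persistable vars across all devices again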
@@ -22,221 +22,504 @@ from scipy.special import expit

class TestExp(OpTest):
    def setUp(self):
        self.op_type = "exp"
        self.dtype = np.float32
        self.init_dtype()

        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
        out = np.exp(x)

        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
        self.outputs = {'Out': out}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        if self.dtype == np.float16:
            return
        self.check_grad(['X'], 'Out', max_relative_error=0.007)

    def init_dtype(self):
        pass


class TestFP16Exp(TestExp):
    def init_dtype(self):
        self.dtype = np.float16

    def test_check_output(self):
        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
            if core.is_float16_supported(place):
                self.check_output_with_place(place, atol=1e-3)
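Every FP16 variant below follows the same pattern: override init_dtype, skip the gradient check, and compare the forward output on a CUDA place with atol=1e-3. The size of that tolerance can be sanity-checked with NumPy alone; this sketch is independent of Paddle:

import numpy as np

# Inputs match TestExp: uniform in [0.1, 1), stored as float16.
x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float16)
ref = np.exp(x.astype(np.float32))    # float32 reference output
f16 = np.exp(x).astype(np.float32)    # float16 output, widened for comparison
# exp() of these inputs lies in roughly [1.1, 2.7]; one float16 ulp there is
# about 1e-3, so the discrepancy stays below the test's atol of 1e-3.
print(np.max(np.abs(ref - f16)))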
class TestSigmoid(OpTest): class TestSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sigmoid" self.op_type = "sigmoid"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = 1 / (1 + np.exp(-x))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out', max_relative_error=0.008) if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.01)
def init_dtype(self):
pass
class TestFP16Sigmoid(TestSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestLogSigmoid(OpTest): class TestLogSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "logsigmoid" self.op_type = "logsigmoid"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = np.log(1 / (1 + np.exp(-x)))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16LogSigmoid(TestLogSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestTanh(OpTest): class TestTanh(OpTest):
def setUp(self): def setUp(self):
self.op_type = "tanh" self.op_type = "tanh"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.tanh(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.tanh(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Tanh(TestTanh):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestTanhShrink(OpTest): class TestTanhShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])} x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
out = x - np.tanh(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16TanhShrink(TestTanhShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestHardShrink(OpTest): class TestHardShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "hard_shrink" self.op_type = "hard_shrink"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
threshold = 0.5 threshold = 0.5
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.copy(x)
out[(out >= -threshold) & (out <= threshold)] = 0
self.inputs = {'X': x}
self.attrs = {'lambda': threshold} self.attrs = {'lambda': threshold}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
t = np.copy(x) self.outputs = {'Out': out}
t[(t >= -threshold) & (t <= threshold)] = 0
self.outputs = {'Out': t}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.005) self.check_grad(['X'], 'Out', max_relative_error=0.005)
def init_dtype(self):
pass
class TestFP16HardShrink(TestHardShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftShrink(OpTest): class TestSoftShrink(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softshrink" self.op_type = "softshrink"
self.dtype = np.float32
self.init_dtype()
lambda_val = 0.1 lambda_val = 0.1
x = np.random.uniform(0.25, 10, [4, 4]).astype(self.dtype)
out = np.copy(x)
out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
out - lambda_val)
self.attrs = {'lambda': lambda_val} self.attrs = {'lambda': lambda_val}
self.inputs = { self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32") self.outputs = {'Out': out}
}
y = np.copy(self.inputs['X'])
y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
y - lambda_val)
self.outputs = {'Out': y}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16SoftShrink(TestSoftShrink):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
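For reference, the two shrink ops differ only in what happens outside the [-threshold, threshold] dead zone: hard_shrink passes values through unchanged, while softshrink also subtracts the threshold. A small illustrative NumPy sketch (values chosen arbitrarily):

import numpy as np

def hard_shrink(x, threshold=0.5):
    out = np.copy(x)
    out[(out >= -threshold) & (out <= threshold)] = 0
    return out

def soft_shrink(x, lam=0.5):
    return (x < -lam) * (x + lam) + (x > lam) * (x - lam)

x = np.array([-1.0, -0.3, 0.2, 0.8])
print(hard_shrink(x))   # [-1.   0.   0.   0.8]
print(soft_shrink(x))   # [-0.5  0.   0.   0.3]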
class TestSqrt(OpTest): class TestSqrt(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sqrt" self.op_type = "sqrt"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.sqrt(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Sqrt(TestSqrt):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestAbs(OpTest): class TestAbs(OpTest):
def setUp(self): def setUp(self):
self.op_type = "abs" self.op_type = "abs"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
# Because we set delta = 0.005 when calculating the numeric gradient,
# if x is too small (e.g. 0.002), x_neg will be -0.003 and x_pos will
# be 0.007, so the numeric gradient is inaccurate. We avoid this by
# pushing such values away from zero.
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x} out = np.abs(x)
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Abs(TestAbs):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
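The TestAbs comment explains why inputs are nudged away from the kink at 0 before gradient checking. A quick NumPy illustration of how a central difference with delta = 0.005 goes wrong when the kink of abs() falls inside the stencil:

import numpy as np

def central_diff(f, x, delta=0.005):
    return (f(x + delta) - f(x - delta)) / (2 * delta)

# At x = 0.002 the stencil [-0.003, 0.007] straddles the kink of abs(),
# so the estimate is neither -1 nor +1.
print(central_diff(np.abs, 0.002))   # 0.4, far from the true gradient 1.0
# After nudging the point to 0.02 the stencil stays on one side of the kink.
print(central_diff(np.abs, 0.02))    # 1.0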
class TestCeil(OpTest): class TestCeil(OpTest):
def setUp(self): def setUp(self):
self.op_type = "ceil" self.op_type = "ceil"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.ceil(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.ceil(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Ceil(TestCeil):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestFloor(OpTest): class TestFloor(OpTest):
def setUp(self): def setUp(self):
self.op_type = "floor" self.op_type = "floor"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.floor(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.floor(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Floor(TestFloor):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCos(OpTest): class TestCos(OpTest):
def setUp(self): def setUp(self):
self.op_type = "cos" self.op_type = "cos"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.cos(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.cos(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Cos(TestCos):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSin(OpTest): class TestSin(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sin" self.op_type = "sin"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.sin(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.sin(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Sin(TestSin):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRound(OpTest): class TestRound(OpTest):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.inputs = {'X': x} self.init_dtype()
self.outputs = {'Out': np.round(self.inputs['X'])}
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.round(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Round(TestRound):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRelu(OpTest): class TestRelu(OpTest):
def setUp(self): def setUp(self):
...@@ -278,222 +561,463 @@ class TestFP16Relu(TestRelu): ...@@ -278,222 +561,463 @@ class TestFP16Relu(TestRelu):
class TestBRelu(OpTest): class TestBRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "brelu" self.op_type = "brelu"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
t_min = 1.0 t_min = 1.0
t_max = 4.0 t_max = 4.0
# The same with TestAbs # The same with TestAbs
x[np.abs(x - t_min) < 0.005] = t_min + 0.02 x[np.abs(x - t_min) < 0.005] = t_min + 0.02
x[np.abs(x - t_max) < 0.005] = t_max + 0.02 x[np.abs(x - t_max) < 0.005] = t_max + 0.02
self.inputs = {'X': x}
self.attrs = {'t_min': t_min, 't_max': t_max}
t = np.copy(x) t = np.copy(x)
t[t < t_min] = t_min t[t < t_min] = t_min
t[t > t_max] = t_max t[t > t_max] = t_max
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'t_min': t_min, 't_max': t_max}
self.outputs = {'Out': t} self.outputs = {'Out': t}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16BRelu(TestBRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestRelu6(OpTest): class TestRelu6(OpTest):
def setUp(self): def setUp(self):
self.op_type = "relu6" self.op_type = "relu6"
x = np.random.uniform(-1, 1, [4, 10]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 10]).astype(self.dtype)
threshold = 6.0 threshold = 6.0
# The same with TestAbs # The same with TestAbs
x[np.abs(x) < 0.005] = 0.02 x[np.abs(x) < 0.005] = 0.02
x[np.abs(x - threshold) < 0.005] = threshold + 0.02 x[np.abs(x - threshold) < 0.005] = threshold + 0.02
out = np.minimum(np.maximum(x, 0), threshold)
self.inputs = {'X': x} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'threshold': threshold} self.attrs = {'threshold': threshold}
self.outputs = { self.outputs = {'Out': out}
'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16Relu6(TestRelu6):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftRelu(OpTest): class TestSoftRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "soft_relu" self.op_type = "soft_relu"
x = np.random.uniform(-3, 3, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
threshold = 2.0 threshold = 2.0
# The same reason with TestAbs # The same reason with TestAbs
x[np.abs(x - threshold) < 0.005] = threshold + 0.02 x[np.abs(x - threshold) < 0.005] = threshold + 0.02
x[np.abs(x + threshold) < 0.005] = -threshold + 0.02 x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
self.inputs = {'X': x}
self.attrs = {'threshold': threshold}
t = np.copy(x) t = np.copy(x)
t[t < -threshold] = -threshold t[t < -threshold] = -threshold
t[t > threshold] = threshold t[t > threshold] = threshold
self.outputs = {'Out': np.log((np.exp(t) + 1))} out = np.log((np.exp(t) + 1))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'threshold': threshold}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16SoftRelu(TestSoftRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestELU(OpTest): class TestELU(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elu" self.op_type = "elu"
x = np.random.uniform(-3, 3, [4, 4]).astype("float32") self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
alpha = 1. alpha = 1.
out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
# Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1) # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
# is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
self.inputs = {'X': x} self.inputs = {'X': x}
self.attrs = {'alpha': alpha} self.attrs = {'alpha': alpha}
self.outputs = { self.outputs = {'Out': out}
'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16ELU(TestELU):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
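The ELU test notes that the standard ELU (alpha = 1) is differentiable at 0, so no such nudging is needed there. A short numeric check of the one-sided derivatives at 0 (illustrative only):

import numpy as np

def elu(x, alpha=1.0):
    return np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))

h = 1e-6
right = (elu(h) - elu(0.0)) / h    # forward difference from the right
left = (elu(0.0) - elu(-h)) / h    # forward difference from the left
print(right, left)                 # both approximately 1.0 for alpha = 1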
class TestReciprocal(OpTest): class TestReciprocal(OpTest):
def setUp(self): def setUp(self):
self.op_type = "reciprocal" self.op_type = "reciprocal"
self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} self.dtype = np.float32
self.outputs = {'Out': np.reciprocal(self.inputs['X'])} self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.reciprocal(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
def init_dtype(self):
pass
class TestFP16Reciprocal(TestReciprocal):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestLog(OpTest): class TestLog(OpTest):
def setUp(self): def setUp(self):
self.op_type = "log" self.op_type = "log"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.log(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.log(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Log(TestLog):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSquare(OpTest): class TestSquare(OpTest):
def setUp(self): def setUp(self):
self.op_type = "square" self.op_type = "square"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = {'Out': np.square(self.inputs['X'])} x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.square(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Square(TestSquare):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestPow(OpTest): class TestPow(OpTest):
def setUp(self): def setUp(self):
self.op_type = "pow" self.op_type = "pow"
self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")} self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
out = np.power(x, 3)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'factor': 3.0} self.attrs = {'factor': 3.0}
self.outputs = {'Out': np.power(self.inputs['X'], 3)} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.02) self.check_grad(['X'], 'Out', max_relative_error=0.02)
def init_dtype(self):
pass
class TestFP16Pow(TestPow):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=5e-2)
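TestFP16Pow is the only case above that loosens atol to 5e-2: cubing inputs from [1, 2) produces outputs up to 8, where a single float16 step is already around 4e-3, and the GPU kernel may lose a bit more. A rough NumPy illustration (assuming a float32 reference, as in the other tests):

import numpy as np

x16 = np.random.uniform(1, 2, [11, 17]).astype(np.float16)
ref = np.power(x16.astype(np.float32), 3)    # float32 reference
f16 = np.power(x16, 3).astype(np.float32)    # float16 computation, widened
# Typically a few 1e-3 of absolute error here; the fp16 GPU kernel can lose
# more precision still, hence the looser atol=5e-2.
print(np.max(np.abs(ref - f16)))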
class TestSTanh(OpTest): class TestSTanh(OpTest):
def setUp(self): def setUp(self):
self.op_type = "stanh" self.op_type = "stanh"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.init_dtype()
}
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
scale_a = 2.0 / 3.0 scale_a = 2.0 / 3.0
scale_b = 1.7159 scale_b = 1.7159
out = scale_b * np.tanh(x * scale_a)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16STanh(TestSTanh):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftplus(OpTest): class TestSoftplus(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softplus" self.op_type = "softplus"
self.inputs = { self.dtype = np.float64
'X': np.random.uniform(-1, 1, [11, 17]).astype("float64") self.init_dtype()
}
self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))} x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
out = np.log(1 + np.exp(x))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Softplus(TestSoftplus):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftsign(OpTest): class TestSoftsign(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softsign" self.op_type = "softsign"
self.inputs = { self.dtype = np.float32
'X': np.random.uniform(-1, 1, [11, 17]).astype("float32") self.init_dtype()
}
self.outputs = { x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X'])) out = np.divide(x, 1 + np.abs(x))
}
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
def init_dtype(self):
pass
class TestFP16Softsign(TestSoftsign):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestThresholdedRelu(OpTest): class TestThresholdedRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "thresholded_relu" self.op_type = "thresholded_relu"
self.dtype = np.float32
self.init_dtype()
threshold = 0.25 threshold = 0.25
self.relative_error = 0.005 self.relative_error = 0.005
X = np.random.uniform(-1, 1, [11, 17]).astype("float32") X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
# Same reason as TestAbs # Same reason as TestAbs
X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2 X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
out = (X > threshold) * X
self.inputs = {'X': X} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.attrs = {'threshold': threshold} self.attrs = {'threshold': threshold}
self.outputs = {'Out': (X > threshold) * X} self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=self.relative_error) self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
def init_dtype(self):
pass
class TestFP16ThresholdedRelu(TestThresholdedRelu):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestHardSigmoid(OpTest): class TestHardSigmoid(OpTest):
def setUp(self): def setUp(self):
self.op_type = "hard_sigmoid" self.op_type = "hard_sigmoid"
self.dtype = np.float32
self.init_dtype()
self.relative_error = 0.002 self.relative_error = 0.002
X = np.random.uniform(-5, 5, [2, 2]).astype("float32") X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
...@@ -502,7 +1026,6 @@ class TestHardSigmoid(OpTest): ...@@ -502,7 +1026,6 @@ class TestHardSigmoid(OpTest):
lower_threshold = -offset / slope lower_threshold = -offset / slope
upper_threshold = (1 - offset) / slope upper_threshold = (1 - offset) / slope
self.inputs = {'X': X}
# Same reason as TestAbs # Same reason as TestAbs
X[np.abs(X - lower_threshold) < self.relative_error] = \ X[np.abs(X - lower_threshold) < self.relative_error] = \
lower_threshold + 0.2 lower_threshold + 0.2
...@@ -510,29 +1033,70 @@ class TestHardSigmoid(OpTest): ...@@ -510,29 +1033,70 @@ class TestHardSigmoid(OpTest):
upper_threshold - 0.2 upper_threshold - 0.2
temp = X * slope + offset temp = X * slope + offset
self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))} out = np.maximum(0.0, np.minimum(1.0, temp))
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.002) self.check_grad(['X'], 'Out', max_relative_error=0.002)
def init_dtype(self):
pass
class TestFP16HardSigmoid(TestHardSigmoid):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSwish(OpTest): class TestSwish(OpTest):
def setUp(self): def setUp(self):
self.op_type = "swish" self.op_type = "swish"
X = np.random.uniform(0.1, 1, [11, 17]).astype("float32") self.dtype = np.float32
self.inputs = {'X': X} self.init_dtype()
self.attrs = {'beta': 2.3}
self.outputs = {'Out': X * expit(self.attrs['beta'] * X)} X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
beta = 2.3
out = X * expit(beta * X)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
self.attrs = {'beta': beta}
self.outputs = {'Out': out}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
def init_dtype(self):
pass
class TestFP16Swish(TestSwish):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
#--------------------test MKLDNN--------------------
class TestMKLDNNReluDim2(TestRelu):
......
@@ -61,8 +61,12 @@ class TestMultipleReader(unittest.TestCase):
        exe.run(fluid.default_startup_program())

        batch_count = 0
        while True:
            try:
                img_val, = exe.run(fetch_list=[img])
            except fluid.core.EnforceNotMet as ex:
                self.assertIn("There is no next data.", ex.message)
                break
            batch_count += 1
            self.assertLessEqual(img_val.shape[0], self.batch_size)
        data_files.reset()
......
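The eof() polling loop is replaced by draining the reader until EnforceNotMet is raised; the same pattern recurs in the tests below. A hedged sketch of the pattern as a reusable helper (the helper itself is illustrative, not part of the Fluid API):

import paddle.fluid as fluid

def drain_reader(exe, fetch_list):
    """Run the program repeatedly until the reader reports end of data."""
    results = []
    while True:
        try:
            results.append(exe.run(fetch_list=fetch_list))
        except fluid.core.EnforceNotMet as ex:
            # The reader signals exhaustion through this exception.
            assert "There is no next data." in ex.message
            break
    return results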
@@ -44,7 +44,7 @@ class TestMultipleReader(unittest.TestCase):
            shapes=[(-1, 784), (-1, 1)],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        data_file = fluid.layers.io.multi_pass(
            reader=data_file, pass_num=self.pass_num)
        img, label = fluid.layers.read_file(data_file)
@@ -57,8 +57,12 @@ class TestMultipleReader(unittest.TestCase):
        exe.run(fluid.default_startup_program())

        batch_count = 0
        while True:
            try:
                img_val, = exe.run(fetch_list=[img])
            except fluid.core.EnforceNotMet as ex:
                self.assertIn("There is no next data.", ex.message)
                break
            batch_count += 1
            self.assertLessEqual(img_val.shape[0], self.batch_size)
        data_file.reset()
......
@@ -26,11 +26,14 @@ def simple_fc_net(use_feed):
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
        reader = fluid.layers.open_files(
            filenames=['./mnist.recordio'],
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
        img, label = fluid.layers.read_file(reader)
    hidden = img
    for _ in xrange(4):
@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed):
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
        reader = fluid.layers.open_files(
            filenames=['mnist.recordio'],
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
        img, label = fluid.layers.read_file(reader)
    hidden = img
......
@@ -201,24 +201,6 @@ class TestBlockDesc(unittest.TestCase):
        op1.set_type("test")
        op2.set_type("test")

        var0 = block.var("var0")
        var1 = block.var("var1")
        var2 = block.var("var2")
        var3 = block.var("var3")
        var4 = block.var("var4")
        var5 = block.var("var5")

        op0.set_input("X", ["var0"])
        op0.set_output("Y", ["var0"])
        op1.set_input("X", ["var1", "var2"])
        op1.set_output("Y", ["var3", "var4"])
        op2.set_input("X", ["var1"])
        op2.set_output("Y", ["var4", "var5"])

        program.sync_with_cpp()

        # Removing op1 also removes its input var2 and output var3,
        # but its input var1 and output var4 survive because op2 still uses them.
        block.remove_op(1, 2)
        program.sync_with_cpp()
@@ -226,8 +208,6 @@ class TestBlockDesc(unittest.TestCase):
        for idx in xrange(0, block.op_size()):
            all_ops.append(block.op(idx))
        self.assertEqual(all_ops, [op0, op2])
        all_vars = block.all_vars()
        self.assertEqual(set(all_vars), {var0, var1, var4, var5})

if __name__ == '__main__':
......
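The deleted test documented a useful property of Block.remove_op: variables used only by the removed ops disappear with them, while variables still referenced by remaining ops survive. A toy pure-Python sketch of that bookkeeping (not the real BlockDesc API):

# Toy model: each op lists the variable names it reads or writes.
ops = {
    'op0': {'var0'},
    'op1': {'var1', 'var2', 'var3', 'var4'},
    'op2': {'var1', 'var4', 'var5'},
}

def remove_op(ops, name):
    removed_vars = ops.pop(name)
    still_used = set().union(*ops.values()) if ops else set()
    # Only variables no longer referenced by any remaining op are dropped.
    return removed_vars - still_used

print(remove_op(ops, 'op1'))   # {'var2', 'var3'}; var1 and var4 survive via op2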
@@ -65,8 +65,13 @@ class TestRecordIO(unittest.TestCase):
        # train a pass
        batch_id = 0
        while True:
            try:
                tmp, = exe.run(fetch_list=[avg_loss])
            except fluid.core.EnforceNotMet as ex:
                self.assertIn("There is no next data.", ex.message)
                break
            avg_loss_np.append(tmp)
            batch_id += 1
        data_file.reset()
@@ -74,8 +79,8 @@ class TestRecordIO(unittest.TestCase):
        self.assertLess(avg_loss_np[-1], avg_loss_np[0])

    def test_shuffle_reader(self):
        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))

    def test_double_buffer_reader(self):
        self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
            place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
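Both tests pass a decorator_callback that wraps the base RecordIO reader before reading; such decorators compose, so shuffling and double buffering can be stacked. A hedged sketch of a combined callback built from the two calls shown above:

import paddle.fluid as fluid

def decorate(reader):
    # Shuffle within a 200-sample buffer, then prefetch with double buffering.
    reader = fluid.layers.io.shuffle(reader, buffer_size=200)
    return fluid.layers.io.double_buffer(
        reader, place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu')

# e.g. self.test_main(decorator_callback=decorate)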
@@ -102,7 +102,7 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
    package_data['py_paddle']=['*.py','_swig_paddle.so']

package_dir={
    '': '${PADDLE_BINARY_DIR}/python',
    # The paddle.fluid.proto package will be generated while compiling,
    # so it points to another directory.
    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
......