Commit c87d11a7 authored by caoying03

Merge branch 'develop' into enhance_reshape

...@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) ...@@ -53,7 +53,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. # TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option.
option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" ON) option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON) option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
......
...@@ -53,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 ...@@ -53,10 +53,14 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
# version util jupyter fixes this issue. # version util jupyter fixes this issue.
# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
# version(1.7.1 for now), which causes building documentation failed.
RUN pip install --upgrade pip && \ RUN pip install --upgrade pip && \
pip install -U wheel && \ pip install -U wheel && \
pip install -U docopt PyYAML sphinx && \ pip install -U docopt PyYAML sphinx==1.5.6 && \
pip install -U sphinx-rtd-theme==0.1.9 recommonmark pip install sphinx-rtd-theme==0.1.9 recommonmark
RUN pip install pre-commit 'ipython==5.3.0' && \ RUN pip install pre-commit 'ipython==5.3.0' && \
pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
......
...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME) ...@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START") if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END) list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif() endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS} COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME) ...@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME})
endif() endif()
endfunction(nv_test) endfunction(nv_test)
......
# C++ Data Feeding
When training with the Paddle V2 API, data feeding depends entirely on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
In this document we present the fundamental design of the C++ data feeding process, which includes data reading, shuffling and batching.
## Reader
A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable`s and are used to read or process file data.
### `ReaderBase`
`ReaderBase` is the abstract base class of all readers. It defines the interface that all readers share.
```cpp
class ReaderBase {
public:
explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
PADDLE_ENFORCE(!shapes_.empty());
}
// Read the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, the '*out' will be an empty std::vector.
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Reinitialize the reader and read the file from the beginning.
virtual void ReInit() = 0;
// Get the shape of a certain piece of the read-in data.
DDim shape(size_t idx) const;
// Get the shapes of all the read-in data.
std::vector<DDim> shapes() const { return shapes_; }
// Set the shapes of the read-in data.
void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
virtual ~ReaderBase() {}
protected:
std::vector<DDim> shapes_;
};
```
### `FileReader` and `DecoratedReader`
These two classes are derived from `ReaderBase` and will be further derived by specific readers. That is to say, in our design there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format and yields only one instance of data at a time, e.g. a RecordIO reader or a jpg reader. A decorated reader takes another reader (either a file reader or a decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on the data (shuffling or batching), and then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
All the readers share exactly the same interface defined in `ReaderBase`, so they can be decorated more than once: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their exact types.
### `ReaderHolder`
Different readers belong to different class types. This leads to a problem: how can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it by the following code:
```cpp
var->Get<ReaderBase>("batch_reader");
```
we have to write:
```cpp
var->Get<BatchReader>("batch_reader");
```
This requires that, every time we get a reader from a variable, we know the reader's exact type, which is nearly impossible.
To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases the reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
## Related Operators
To create and invoke readers, some new ops are introduced:
### `CreateReaderOp`
Each reader has its creating op. File readers' creating ops have no input and yield the created file readers as their output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
### `ReadOp`
A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
# C++ Data Feeding
While using the Paddle V2 API for training, data feeding completely depends on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
## Overview
![](images/readers.png)
## Reader
In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable`s and are used to read or process file data.
### ReaderBase
`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
```cpp
class ReaderBase {
public:
// Reads the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, it throws an exception
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Checks whether the next instance exists.
virtual bool HasNext() = 0;
// Reinitializes the reader and reads the file from the beginning.
virtual void ReInit() = 0;
virtual ~ReaderBase();
};
```
### FileReader
`FileReader` is derived from `ReaderBase`. It is still an abstract class and will be further derived by readers for specific file formats.
```cpp
class FileReader : public ReaderBase {
public:
explicit FileReader(const std::vector<DDim>& dims);
void ReadNext(std::vector<LoDTensor>* out) override;
protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
private:
std::vector<DDim> dims_;
};
```
A file reader binds to a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
`ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that the shape of each `LoDTensor` in `*out` is consistent with the corresponding shape in `dims_`.
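The following is a minimal sketch of how `ReadNext()` could delegate to `ReadNextImpl()` and then verify the shapes; it is an illustration only, and the exact error handling is an assumption rather than the actual implementation.
```cpp
// Illustrative sketch (not the actual implementation): ReadNext() delegates
// the real reading to ReadNextImpl() and then validates the output shapes.
void FileReader::ReadNext(std::vector<LoDTensor>* out) {
  ReadNextImpl(out);
  if (out->empty()) {
    return;  // end of data, nothing to check
  }
  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
  for (size_t i = 0; i < out->size(); ++i) {
    // Assumed check: each read-in tensor must match the declared dims.
    PADDLE_ENFORCE_EQ((*out)[i].dims(), dims_[i]);
  }
}
```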
### DecoratedReader
A decorated reader takes another reader (either a file reader or another decorated reader) as its 'underlying reader'. It gets data from its underlying reader, does some processing on the data (shuffling, batching or something else), and then yields the processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
```cpp
class DecoratedReader : public ReaderBase {
public:
explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
void ReInit() override { reader_->ReInit(); }
bool HasNext() override { return reader_->HasNext(); }
protected:
ReaderBase* reader_;
};
```
Both `FileReader` and `DecoratedReader` share exactly the same interface defined in `ReaderBase`. So they can be decorated multiple times: we can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops to use readers without knowing their underlying types.
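As a hedged illustration of such chaining (the constructor signatures below are assumptions made for this sketch, not the actual APIs), a shuffled and batched pipeline could be composed like this:
```cpp
// Illustration only: compose a shuffled, batched pipeline on top of a file
// reader. The constructor arguments (file name, buffer size, batch size) are
// hypothetical.
std::unique_ptr<ReaderBase> file_reader(
    new RecordIOFileReader(/*filename=*/"data.recordio"));
std::unique_ptr<ReaderBase> shuffle_reader(
    new ShuffleReader(file_reader.get(), /*buffer_size=*/1024));
std::unique_ptr<ReaderBase> batch_reader(
    new BatchReader(shuffle_reader.get(), /*batch_size=*/32));

std::vector<LoDTensor> batch;
batch_reader->ReadNext(&batch);  // yields one shuffled batch of 32 instances
```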
### MultipleReader
Every `FileReader` binds to a single file and is single-threaded. However, sometimes we need to read data from more than one file. In this case, having only `FileReader` and `DecoratedReader` is not enough.
So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders`, and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects the data yielded by all the prefetching readers and allows subsequent ops or decorated readers to fetch data without worrying about how the multiple readers are scheduled.
![](images/multiple_reader.png)
This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Whenever a prefetching file reader is free (it has finished reading one file), it fetches a new file from the queue. Each prefetching file reader runs in a separate prefetch thread and dumps its outputs to the same channel.
To the two subsequent decorated readers, the `MultipleReader` is **a single reader**. They do not need to care about how the prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel.
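A highly simplified sketch of this structure is shown below; the member names and the use of a `Channel` as the buffer are assumptions made for illustration, not the actual implementation.
```cpp
// Illustrative sketch only: a MultipleReader-like class that owns several
// prefetching file readers and a buffer channel that consumers read from.
class MultipleReader : public ReaderBase {
 public:
  void ReadNext(std::vector<LoDTensor>* out) override {
    // The prefetch threads are the producers; here we only consume.
    if (!buffer_->Receive(out)) out->clear();  // channel closed and drained
  }
  bool HasNext() override { return buffer_->CanReceive(); }
  void ReInit() override { /* re-enqueue file names, restart prefetch threads */ }

 private:
  std::vector<std::string> file_names_;        // files waiting to be read
  std::vector<std::thread> prefetch_threads_;  // one thread per prefetching reader
  Channel<std::vector<LoDTensor>>* buffer_;    // collects all prefetchers' outputs
};
```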
### ReaderHolder
Different readers belong to different class types. This leads to a problem: how can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we cannot get it by the following code:
```cpp
var->Get<ReaderBase>("batch_reader");
```
We would have to write:
```cpp
var->Get<BatchReader>("batch_reader");
```
This requires that, every time we get a reader from a variable, we know the reader's exact type, which is nearly impossible.
To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides the reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
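A minimal sketch of such a wrapper is shown below; the exact member functions of the real `ReaderHolder` may differ, so treat this only as an illustration of the type-erasure idea.
```cpp
// Illustrative sketch: ReaderHolder forwards the ReaderBase interface to the
// reader it wraps, so a Variable only needs to know about ReaderHolder.
class ReaderHolder {
 public:
  void Reset(ReaderBase* reader) { reader_.reset(reader); }

  void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
  bool HasNext() { return reader_->HasNext(); }
  void ReInit() { reader_->ReInit(); }

 private:
  std::unique_ptr<ReaderBase> reader_;
};
```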
## Related Operators
To create and invoke readers, some new ops are introduced:
### CreateReaderOp
Each reader has its creation op. File readers' creation ops have no input and yield the created file readers as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
### OpenFilesOp
The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
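A rough sketch of such a registry is given below; the class name, the tag-extraction rule and the factory signature are assumptions made for illustration only.
```cpp
// Illustrative sketch of a file reader registry: map a file format tag
// (e.g. "recordio", taken from a file name such as "part-00000.recordio")
// to a factory that constructs the matching file reader.
class FileReaderRegistry {
 public:
  using Factory = std::function<ReaderBase*(const std::string& file_name)>;

  void Register(const std::string& format_tag, Factory factory) {
    factories_[format_tag] = std::move(factory);
  }

  ReaderBase* Create(const std::string& file_name) const {
    // Assumed naming rule: the format tag is the file name's extension.
    auto tag = file_name.substr(file_name.rfind('.') + 1);
    return factories_.at(tag)(file_name);
  }

 private:
  std::unordered_map<std::string, Factory> factories_;
};
```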
### HasNextOp
`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
### ResetOp
`ResetOp` is used to reset a reader via its `ReInit()` interface.
### ReadOp
A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader's `ReadNext()` function and gets a new batch of data (or only one instance of data, if we use a file reader directly). The output data of a reader are in the form of `std::vector<LoDTensor>`, so the `ReadOp` also needs to split the vector and move the LoDTensors to their respective output Variables.
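The core of such an op's run logic could look roughly like the sketch below; the helper name and the way the scope is queried are assumptions for illustration, not the actual operator code.
```cpp
// Illustrative sketch of the read op's core logic: pull one batch from the
// reader variable and hand each LoDTensor to its output variable.
void RunReadOp(Scope* scope, const std::string& reader_name,
               const std::vector<std::string>& out_names) {
  auto* reader = scope->FindVar(reader_name)->GetMutable<ReaderHolder>();
  std::vector<LoDTensor> ins;
  reader->ReadNext(&ins);
  PADDLE_ENFORCE_EQ(ins.size(), out_names.size());
  for (size_t i = 0; i < ins.size(); ++i) {
    auto* out = scope->FindVar(out_names[i])->GetMutable<LoDTensor>();
    out->ShareDataWith(ins[i]);  // move/share the i-th tensor into its output
    out->set_lod(ins[i].lod());
  }
}
```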
## Program with Readers
A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once, so they are placed in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by the training loop, so they belong in the `main_program`.
The ops of a `startup_program` with readers would be like this:
```
multiple_reader = open_files_op(...)
batch_reader = create_batch_reader_op(multiple_reader)
double_buffer_reader = create_double_buffer_op(batch_reader)
... (other initializers)
```
The forwarding ops of the corresponding `main_program` would be like this:
```
while_op {
has_next = has_next_op(double_buffer_reader)
if_else_op(has_next) {
batch_data = read_op(double_buffer_reader)
... (subsequent training ops)
} else {
reset_op(double_buffer_reader)
}
}
```
Two important considerations for these programs are as follows:
1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
2. All readers exist in both `startup_program` and `main_program`. And they are persistable.
# Design Doc: Distributed Training Architecture # Design Doc: Fluid Distributed Training Architecture
## Abstract ## Abstract
...@@ -155,7 +155,7 @@ Cluster environment. ...@@ -155,7 +155,7 @@ Cluster environment.
<img src="src/remote_executor.png" width="500" align="center" /> <img src="src/remote_executor.png" width="500" align="center" />
`RemoteExecutor.run` sends the `ProgramDesc` and `RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`. to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.
......
...@@ -59,6 +59,17 @@ After converting: ...@@ -59,6 +59,17 @@ After converting:
queue. It will block until the queue has the required number of queue. It will block until the queue has the required number of
tensors. tensors.
### Sparse Update
For embedding layers, the gradient may contain many rows of zeros during training.
If the gradient were stored in a dense tensor for parameter optimization,
it would waste memory, slow down the calculations and waste
bandwidth during distributed training.
In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
non-zero gradient data. So when we do parameter optimization both locally and remotely,
we only need to send those non-zero rows to the optimizer operators:
<img src="src/sparse_update.png" width="700" />
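As a rough illustration of the idea (see the linked design doc for the authoritative definition; the member names here are assumptions), a SelectedRows-style structure pairs the indices of the non-zero rows with a dense value tensor:
```cpp
// Rough illustration of the SelectedRows idea: only the listed rows of the
// full [height_ x width] gradient are materialized in value_.
class SelectedRows {
 public:
  const std::vector<int64_t>& rows() const { return rows_; }
  const Tensor& value() const { return value_; }
  int64_t height() const { return height_; }

 private:
  std::vector<int64_t> rows_;  // indices of rows that carry non-zero gradient
  Tensor value_;               // shape: [rows_.size(), width]
  int64_t height_;             // number of rows of the full dense parameter
};
```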
### Benefits ### Benefits
...@@ -91,6 +102,6 @@ After converting: ...@@ -91,6 +102,6 @@ After converting:
`min_count` attribute), does our current design support it? (similar `min_count` attribute), does our current design support it? (similar
question for the *Add* OP) question for the *Add* OP)
### References
### References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf) [1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
...@@ -94,7 +94,7 @@ The classical DS2 network contains 15 layers (from bottom to top): ...@@ -94,7 +94,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
- **One** CTC-loss layer - **One** CTC-loss layer
<div align="center"> <div align="center">
<img src="image/ds2_network.png" width=350><br/> <img src="images/ds2_network.png" width=350><br/>
Figure 1. Architecture of Deep Speech 2 Network. Figure 1. Architecture of Deep Speech 2 Network.
</div> </div>
...@@ -141,7 +141,7 @@ TODO by Assignees ...@@ -141,7 +141,7 @@ TODO by Assignees
### Beam Search with CTC and LM ### Beam Search with CTC and LM
<div align="center"> <div align="center">
<img src="image/beam_search.png" width=600><br/> <img src="images/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder. Figure 2. Algorithm for CTC Beam Search Decoder.
</div> </div>
......
# API Docstring Writing Standard
- [API Docstring Modules](#api-docstring-modules)
- [Format and Examples](#format-and-examples)
- [Complete Example](#complete-example)
## API Docstring Modules
An API docstring must contain the following modules (listed in the order in which they should be written):
- Python API Definition
  The code definition of the API.
- Function Description
  A description of what the API does: its meaning, effect, or the operation performed on the input, together with references and links if there are any. Give formulas when necessary and explain the meaning of the key variables in them.
- Args Description
  Descriptions of the API arguments. Describe the arguments one by one in the order of the code definition, including data type, default value (if any) and meaning.
- Returns
  Description of the return value: explain its meaning and, when necessary, its shape. If the return value is a tuple containing multiple items, describe each item in order.
- Raises (if any)
  The exceptions or errors that may be raised and their possible causes. When several kinds may be raised, list them separately.
- Note (if any)
  Things to pay attention to. When there is more than one note, list them separately.
- Examples
  Usage examples of the API.
## Format and Examples
API docstrings must be written in reStructuredText; see this [link](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html) for details of the format. The content format and examples of each module are given below (fc is used as the running example):
- Python API Definition
- Format:
[Python API Definition]
- Example
```
fc(input,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
name=None,
main_program=None,
startup_program=None)
```
- Function Description
- Format
This module should contain the following parts (listed in writing order):
[Function Description]
[Formula]
[Symbols' Descriptions if necessary]
[References if necessary]
- Example
[Function Description]
```
**Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It
creates a variable called weights for each input tensor, which represents
a fully connected weight matrix from each input unit to each output unit.
The fully connected layer multiplies each input tensor with its corresponding
weight to produce an output Tensor. If multiple input tensors are given,
the results of multiple multiplications will be summed up. If bias_attr is
not None, a bias variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well.
```
[Formula]
```
This process can be formulated as follows:
.. math::
Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
```
[Symbols' Descriptions if necessary]
```
In the above equation:
* :math:`N`: Number of the input.
* :math:`X_i`: The input tensor.
* :math:`W`: The weights created by this layer.
* :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Act`: The activation function.
* :math:`Out`: The output tensor.
```
[References if necessary]
Since fc has no references that need to be listed, this part is omitted. In other cases, the references and their links must be given explicitly; take layer_norm as an example:
```
Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
```
- Args Description
- Format
\[Arg's Name\][(Data Type, Default Value)][Description]
- Example
Part of the argument descriptions of fc are as follows:
```
Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
name (str, default None): The name of this layer.
```
- Returns
- Format
[Name][Shape]
- Example
```
Returns:
A tensor variable storing the transformation result.
```
When the return value is a tuple containing multiple items, describe each item in order; take dynamic_lstm as an example:
```
Returns:
A tuple containing:
The hidden state of LSTM whose shape is (T X D).
The cell state of LSTM whose shape is (T X D).
```
- Raises
- Format
[Exception Type][Condition]
- Example
```
Raises:
ValueError: If the rank of the input is less than 2.
```
- Note
- Format
[Note]
- Example
fc has no notes, so this module is omitted. If there are notes, they must be given explicitly; when there is more than one, list them separately. Take scaled\_dot\_product\_attention as an example:
```
Note:
1. When num_heads > 1, three linear projections are learned respectively
to map input queries, keys and values into queries', keys' and values'.
queries', keys' and values' have the same shapes with queries, keys
and values.
2. When num_heads == 1, scaled_dot_product_attention has no learnable
parameters.
```
- Examples
- Format
\[Python Code Snippet]
- Example
```
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh")
```
## Complete Example
The complete docstring of fc can be found in this [example](src/fc.py).
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def fc(input,
size,
num_flatten_dims=1,
param_attr=None,
bias_attr=None,
act=None,
name=None):
"""
**Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It
creates a variable called weights for each input tensor, which represents
a fully connected weight matrix from each input unit to each output unit.
The fully connected layer multiplies each input tensor with its corresponding
weight to produce an output Tensor. If multiple input tensors are given,
the results of multiple multiplications will be summed up. If bias_attr is
not None, a bias variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well.
This process can be formulated as follows:
.. math::
Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
In the above equation:
* :math:`N`: Number of the input.
* :math:`X_i`: The input tensor.
* :math:`W`: The weights created by this layer.
* :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Act`: The activation function.
* :math:`Out`: The output tensor.
Args:
input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
the input tensor(s) is at least 2.
size(int): The number of output units in this layer.
num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
two dimensions. If this happens, the multidimensional tensor will first be flattened
into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
dimensions will be flatten to form the first dimension of the final matrix (height of
the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
form the second dimension of the final matrix (width of the matrix). For example, suppose
`X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
act (str, default None): Activation to be applied to the output of this layer.
name (str, default None): The name of this layer.
Returns:
A tensor variable storing the transformation result.
Raises:
ValueError: If rank of the input tensor is less than 2.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh")
"""
Development Standards
=====================
PaddlePaddle follows the code and documentation standards described in the following three parts.

PaddlePaddle uses git for version control and Docker as the build and test environment. The codebase contains several programming languages such as CUDA, C++, Python and Shell. The language standards follow the Google C++ Style Guide and PEP-8, and the repository ships automated style-checking tools. Code comments must follow the Doxygen convention; code that does not meet the style requirements will fail to build. For how to use git, how to build and test, and how to develop code, we provide the following guide.

.. toctree::
  :maxdepth: 1

  contribute_to_paddle_cn.md

PaddlePaddle serves users both in China and abroad and provides documentation in Chinese and English. Design documents and issue descriptions are recommended to be written in English. A design document should focus on the problem statement and the background first, and only then on the solution. The documentation is generated by Sphinx, so code comments also need to conform to the Sphinx documentation standard. We recommend building and previewing the documentation locally with the paddlepaddle.org tool; please refer to the following guide.

.. toctree::
  :maxdepth: 1

  write_docs_cn.rst

PaddlePaddle V2 defines new operations by adding new Layers. Complex layers can be implemented by combining the basic APIs, which satisfies most applications. If you need a custom Layer, please refer to the following guide; patches are welcome.

.. toctree::
  :maxdepth: 1

  new_layer_cn.rst
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
The figure below illustrates a fully connected layer. In a fully connected layer, every output node is connected to all of the input nodes.
.. image:: FullyConnected.jpg .. image:: src/FullyConnected.jpg
:align: center :align: center
:scale: 60 % :scale: 60 %
......
...@@ -16,7 +16,7 @@ First we need to derive equations of the *forward* and *backward* part of the la ...@@ -16,7 +16,7 @@ First we need to derive equations of the *forward* and *backward* part of the la
The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes. The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes.
.. image:: FullyConnected.jpg .. image:: src/FullyConnected.jpg
:align: center :align: center
:scale: 60 % :scale: 60 %
......
...@@ -2,20 +2,19 @@
How to Contribute Documentation
###############################

PaddlePaddle's documentation has a Chinese part and an English part. Both are generated by ``sphinx`` driven by ``cmake``; the paddlepaddle.org tool can also be used to build and preview the documentation.

How to Build the Documentation
==============================

There are two ways to build the PaddlePaddle documentation: with the paddlepaddle.org tool or without it. Each has its own advantages: the former is convenient for previewing, the latter is convenient for developers to debug. Each of the two ways can in turn be used with or without Docker.

Using the PaddlePaddle.org Tool
-------------------------------

This is the currently recommended method. Besides building the documentation automatically, it also lets you preview the documentation directly in a web page. Note that although the other methods described below can also preview the documentation, their page style is not consistent with the official website; only building with the PaddlePaddle.org tool produces a preview whose style matches the official documentation.

The PaddlePaddle.org tool can be used together with Docker, in which case the Docker toolkit needs to be installed first. For Docker installation please refer to `Docker's official website <https://docs.docker.com/>`_ . After Docker is installed, the tool can be started with the following commands:

.. code-block:: bash

...@@ -35,7 +34,7 @@

Then open http://localhost:8000 in a browser to generate the documentation you need.
The built files will be stored in the working directory <paddlepaddle working directory>/.ppo_workspace/content.

If you do not want to use Docker, you can also start the tool's server directly by running the Django framework. Use the following command to run it:

.. code-block:: bash

...@@ -62,37 +61,46 @@

For more details about the PaddlePaddle.org tool, you can `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ .

Without the PaddlePaddle.org Tool
---------------------------------

To build the PaddlePaddle documentation with Docker, the Docker toolkit needs to be installed first. For Docker installation please refer to `Docker's official website <https://docs.docker.com/>`_ . After Docker is installed, the documentation can be built with the scripts in the source directory:

[TBD]

If you do not want to use Docker, you can also build the PaddlePaddle documentation directly with the following commands:

.. code-block:: bash

    mkdir paddle
    cd paddle
    git clone https://github.com/PaddlePaddle/Paddle.git
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON

    # To build only the user documentation, run:
    make -j $processors gen_proto_py
    make -j $processors paddle_docs paddle_docs_cn

    # To build only the API documentation, run:
    make -j $processors gen_proto_py framework_py_proto
    make -j $processors copy_paddle_pybind
    make -j $processors paddle_api_docs

Here $processors starts as many parallel build processes as there are CPU cores; set it according to the number of CPU cores on your machine.

After the build finishes, enter the ``doc/v2`` directory. If the documentation was built, the ``cn/html/`` and ``en/html`` subdirectories are generated there; if the API was built, the ``api/en/html`` directory is generated. Enter the corresponding directory and run:

.. code-block:: bash

    python -m SimpleHTTPServer 8088

Then open http://localhost:8088 in a browser to see the generated Chinese/English documentation pages and the English API pages. The figure below shows an example home page of the generated English documentation. Note that because the example uses Sphinx's default theme, the page style is not consistent with the official website, but this does not affect debugging by developers.

.. image:: src/doc_en.png
  :align: center
  :scale: 60 %

How to Write Documentation
==========================

...@@ -102,7 +110,7 @@

How to Update www.paddlepaddle.org
==================================

Documentation updates are submitted to GitHub as pull requests; for how to submit them, see `How to contribute documentation <http://www.paddlepaddle.org/docs/develop/documentation/zh/dev/write_docs_cn.html>`_ .
Currently the documentation of PaddlePaddle's develop branch is updated automatically on each trigger; you can view the latest `Chinese documentation <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
`English documentation <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ respectively.
......
...@@ -71,6 +71,13 @@ paddle.init( ...@@ -71,6 +71,13 @@ paddle.init(
- trainer_id: **required, default 0**, the unique ID of each trainer, an integer starting from 0
- pservers: **required, default 127.0.0.1**, the list of IPs of the pservers started for this training job, separated by ","

```python
trainer = paddle.trainer.SGD(..., is_local=False)
```

Parameter description

- is_local: **required, default True**, whether to update parameters by the PServer
## Prepare the Training Dataset
......
...@@ -73,6 +73,14 @@ Parameter Description ...@@ -73,6 +73,14 @@ Parameter Description
- trainer_id: **required, default 0**, ID for every trainer, start from 0. - trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",". - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
```python
trainer = paddle.trainer.SGD(..., is_local=False)
```
Parameter Description
- is_local: **required, default True**, whether to update parameters by the PServer.
## Prepare Training Dataset ## Prepare Training Dataset
Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files. Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.
......
Distributed Training Distributed Training
==================== ====================
The effectiveness of a deep learning model is often directly related to the scale of the data: the same model can generally achieve better results as the size of the dataset increases. However, once the amount of data grows beyond a certain point, it can no longer fit on a single computer. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple shards, and the machines participating in the training each read their own data for training and collaboratively update the parameters of the overall model.
.. image:: src/ps_en.png .. image:: src/ps_en.png
:width: 500 :width: 500
...@@ -10,13 +9,27 @@ In this section, we'll explain how to run distributed training jobs with PaddleP ...@@ -10,13 +9,27 @@ In this section, we'll explain how to run distributed training jobs with PaddleP
- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training. - Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers. - Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD. Synchronous SGD training of a neural network is achieved through the cooperation of trainers and parameter servers.

Before starting cluster training, you need to prepare the cluster configuration, install PaddlePaddle, and complete other preparations. To understand how to configure the basic environment for distributed training, check the link below:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
preparations_en.md preparations_en.md
Cluster training has a large number of configurable parameters, such as the number of machines used, communication ports, etc. To learn how to configure the distributed training process by setting these startup parameters, check the link below:
.. toctree::
:maxdepth: 1
cmd_argument_en.md cmd_argument_en.md
PaddlePaddle is compatible with a variety of different clusters. Each cluster has its own advantages. To learn how to run PaddlePaddle on these different types of clusters, check the link below:
.. toctree::
:maxdepth: 1
multi_cluster/index_en.rst multi_cluster/index_en.rst
add_subdirectory(cuda) if(NOT WITH_FLUID)
add_subdirectory(function) add_subdirectory(cuda)
add_subdirectory(utils) add_subdirectory(function)
add_subdirectory(math) add_subdirectory(utils)
add_subdirectory(gserver) add_subdirectory(math)
add_subdirectory(parameter) add_subdirectory(gserver)
add_subdirectory(testing) add_subdirectory(parameter)
if(MOBILE_INFERENCE)
add_subdirectory(capi)
else()
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
if(WITH_C_API) if(MOBILE_INFERENCE)
add_subdirectory(capi) add_subdirectory(capi)
endif() else()
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
if(NOT ANDROID AND NOT IOS) if(WITH_C_API)
add_subdirectory(fluid) add_subdirectory(capi)
endif() endif()
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
add_subdirectory(api) add_subdirectory(api)
endif()
endif() endif()
endif() endif()
add_subdirectory(testing)
if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
add_subdirectory(fluid)
endif()
...@@ -36,7 +36,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) ...@@ -36,7 +36,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
${CAPI_PRIVATE_HEADER}) ${CAPI_PRIVATE_HEADER})
add_dependencies(paddle_capi paddle_proto) add_dependencies(paddle_capi paddle_proto paddle_gserver)
# TODO: paddle_capi_whole will be removed. # TODO: paddle_capi_whole will be removed.
set(PADDLE_CAPI_LAYERS_LIBS set(PADDLE_CAPI_LAYERS_LIBS
......
...@@ -21,7 +21,7 @@ endif() ...@@ -21,7 +21,7 @@ endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init) nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
...@@ -103,4 +103,5 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) ...@@ -103,4 +103,5 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(channel_test SRCS channel_test.cc) cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc ) cc_test(tuple_test SRCS tuple_test.cc )
cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc) channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
conditional_block_op while_op assign_op print_op executor proto_desc)
...@@ -135,6 +135,14 @@ OpDesc *BlockDesc::PrependOp() { ...@@ -135,6 +135,14 @@ OpDesc *BlockDesc::PrependOp() {
return ops_.front().get(); return ops_.front().get();
} }
OpDesc *BlockDesc::InsertOp(size_t index) {
need_update_ = true;
auto it = ops_.begin() + index;
std::unique_ptr<OpDesc> new_op(new OpDesc(this));
it = ops_.insert(it, std::move(new_op));
return (*it).get();
}
void BlockDesc::RemoveOp(size_t s, size_t e) { void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return; return;
......
...@@ -87,6 +87,8 @@ class BlockDesc { ...@@ -87,6 +87,8 @@ class BlockDesc {
OpDesc *PrependOp(); OpDesc *PrependOp();
OpDesc *InsertOp(size_t index);
void RemoveOp(size_t s, size_t e); void RemoveOp(size_t s, size_t e);
std::vector<OpDesc *> AllOps() const; std::vector<OpDesc *> AllOps() const;
......
...@@ -15,23 +15,43 @@ limitations under the License. */ ...@@ -15,23 +15,43 @@ limitations under the License. */
#pragma once #pragma once
#include <stddef.h> // for size_t #include <stddef.h> // for size_t
#include <condition_variable>
#include <typeindex> #include <typeindex>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
enum class ChannelAction {
SEND = 0,
RECEIVE = 1,
CLOSE = 2,
};
// Channel is the abstract class of buffered and un-buffered channels. // Channel is the abstract class of buffered and un-buffered channels.
template <typename T> template <typename T>
class Channel { class Channel {
public: public:
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual bool Send(T*) = 0; virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0; virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0; virtual size_t Cap() = 0;
virtual void Lock() = 0; virtual void Lock() = 0;
virtual void Unlock() = 0; virtual void Unlock() = 0;
virtual bool IsClosed() = 0;
virtual void Close() = 0; virtual void Close() = 0;
virtual ~Channel() {} virtual ~Channel() {}
virtual void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) = 0;
virtual void RemoveFromSendQ(const void* referrer) = 0;
virtual void RemoveFromReceiveQ(const void* referrer) = 0;
}; };
// Forward declaration of channel implementations. // Forward declaration of channel implementations.
...@@ -80,6 +100,27 @@ class ChannelHolder { ...@@ -80,6 +100,27 @@ class ChannelHolder {
return channel != nullptr ? channel->Receive(data) : false; return channel != nullptr ? channel->Receive(data) : false;
} }
bool IsClosed() {
if (IsInitialized()) {
return holder_->IsClosed();
}
return false;
}
bool CanSend() {
if (IsInitialized()) {
return holder_->CanSend();
}
return false;
}
bool CanReceive() {
if (IsInitialized()) {
return holder_->CanReceive();
}
return false;
}
void close() { void close() {
if (IsInitialized()) holder_->Close(); if (IsInitialized()) holder_->Close();
} }
...@@ -97,6 +138,38 @@ class ChannelHolder { ...@@ -97,6 +138,38 @@ class ChannelHolder {
if (IsInitialized()) holder_->Unlock(); if (IsInitialized()) holder_->Unlock();
} }
template <typename T>
void AddToSendQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToSendQ(referrer, data, cond, cb);
}
}
}
template <typename T>
void AddToReceiveQ(const void* referrer, T* data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
if (IsInitialized()) {
Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
if (channel != nullptr) {
channel->AddToReceiveQ(referrer, data, cond, cb);
}
}
}
void RemoveFromSendQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromSendQ(referrer);
}
void RemoveFromReceiveQ(const void* referrer) {
if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer);
}
inline bool IsInitialized() const { return holder_ != nullptr; } inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() { inline const std::type_index Type() {
...@@ -113,6 +186,11 @@ class ChannelHolder { ...@@ -113,6 +186,11 @@ class ChannelHolder {
virtual ~Placeholder() {} virtual ~Placeholder() {}
virtual const std::type_index Type() const = 0; virtual const std::type_index Type() const = 0;
virtual void* Ptr() const = 0; virtual void* Ptr() const = 0;
virtual bool IsClosed() = 0;
virtual bool CanSend() = 0;
virtual bool CanReceive() = 0;
virtual void RemoveFromSendQ(const void* referrer) = 0;
virtual void RemoveFromReceiveQ(const void* referrer) = 0;
virtual void Close() = 0; virtual void Close() = 0;
virtual void Lock() = 0; virtual void Lock() = 0;
virtual void Unlock() = 0; virtual void Unlock() = 0;
...@@ -129,6 +207,39 @@ class ChannelHolder { ...@@ -129,6 +207,39 @@ class ChannelHolder {
virtual void* Ptr() const { return static_cast<void*>(channel_.get()); } virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
virtual bool IsClosed() {
if (channel_) {
return channel_->IsClosed();
}
return false;
}
virtual bool CanSend() {
if (channel_) {
return channel_->CanSend();
}
return false;
}
virtual bool CanReceive() {
if (channel_) {
return channel_->CanReceive();
}
return false;
}
virtual void RemoveFromSendQ(const void* referrer) {
if (channel_) {
channel_->RemoveFromSendQ(referrer);
}
}
virtual void RemoveFromReceiveQ(const void* referrer) {
if (channel_) {
channel_->RemoveFromReceiveQ(referrer);
}
}
virtual void Close() { virtual void Close() {
if (channel_) channel_->Close(); if (channel_) channel_->Close();
} }
......
...@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -29,32 +29,50 @@ class ChannelImpl : public paddle::framework::Channel<T> {
friend void paddle::framework::CloseChannel<T>(Channel<T> *); friend void paddle::framework::CloseChannel<T>(Channel<T> *);
public: public:
virtual bool CanSend();
virtual bool CanReceive();
virtual bool Send(T *); virtual bool Send(T *);
virtual bool Receive(T *); virtual bool Receive(T *);
virtual size_t Cap() { return cap_; } virtual size_t Cap() { return cap_; }
virtual void Lock(); virtual void Lock();
virtual void Unlock(); virtual void Unlock();
virtual bool IsClosed();
virtual void Close(); virtual void Close();
ChannelImpl(size_t); ChannelImpl(size_t);
virtual ~ChannelImpl(); virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void AddToReceiveQ(const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb);
virtual void RemoveFromSendQ(const void *referrer);
virtual void RemoveFromReceiveQ(const void *referrer);
private: private:
struct QueueMessage { struct QueueMessage {
T *data; T *data;
std::condition_variable_any cond; std::shared_ptr<std::condition_variable_any> cond;
bool chan_closed = false; bool chan_closed = false;
bool completed = false; bool completed = false;
const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback;
QueueMessage(T *item) : data(item) {} QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
: data(item), cond(cond) {}
void Wait(std::unique_lock<std::recursive_mutex> &lock) { void Wait(std::unique_lock<std::recursive_mutex> &lock) {
cond.wait(lock, [this]() { return completed; }); cond->wait(lock, [this]() { return completed; });
} }
void Notify() { void Notify() {
completed = true; completed = true;
cond.notify_all(); cond->notify_all();
} }
}; };
...@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity) ...@@ -87,6 +105,18 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
PADDLE_ENFORCE_GE(capacity, 0); PADDLE_ENFORCE_GE(capacity, 0);
} }
template <typename T>
bool ChannelImpl<T>::CanSend() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !closed_ && (!recvq.empty() || buf_.size() < cap_);
}
template <typename T>
bool ChannelImpl<T>::CanReceive() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
}
template <typename T> template <typename T>
bool ChannelImpl<T>::Send(T *item) { bool ChannelImpl<T>::Send(T *item) {
send_ctr++; send_ctr++;
...@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) { ...@@ -105,7 +135,24 @@ bool ChannelImpl<T>::Send(T *item) {
std::shared_ptr<QueueMessage> m = recvq.front(); std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front(); recvq.pop_front();
// Do the data transfer // Do the data transfer
*(m->data) = std::move(*item); // We will do this data transfer if either of the following
// cases are true
// 1. callback == nullptr // This means it was a regular channel send
// 2. callback returns true
bool do_send = true;
if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
if (do_send)
*(m->data) = std::move(*item);
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Send function again.
// We do not care about notifying other
// because they would have been notified
// by the executed select case.
return send_return(Send(item));
// Wake up the blocked process and unlock // Wake up the blocked process and unlock
m->Notify(); m->Notify();
lock.unlock(); lock.unlock();
...@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) { ...@@ -150,7 +197,25 @@ bool ChannelImpl<T>::Receive(T *item) {
std::shared_ptr<QueueMessage> m = sendq.front(); std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front(); sendq.pop_front();
// Do the data transfer // Do the data transfer
*item = std::move(*(m->data)); // We will do this data transfer if either of the following
// cases are true
// 1. callback == nullptr // This means it was a regular channel send
// 2. callback returns true
bool do_receive = true;
if (m->callback != nullptr)
do_receive = m->callback(ChannelAction::RECEIVE);
if (do_receive)
*item = std::move(*(m->data));
else
// We cannot do the data transfer because
// this QueueMessage was added by Select
// and some other case was executed.
// So call the Receive function again.
// We do not care about notifying other
// because they would have been notified
// by the executed select case.
return recv_return(Receive(item));
// Wake up the blocked process and unlock // Wake up the blocked process and unlock
m->Notify(); m->Notify();
lock.unlock(); lock.unlock();
...@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() { ...@@ -186,6 +251,12 @@ void ChannelImpl<T>::Unlock() {
mu_.unlock(); mu_.unlock();
} }
template <typename T>
bool ChannelImpl<T>::IsClosed() {
std::lock_guard<std::recursive_mutex> lock{mu_};
return closed_;
}
template <typename T> template <typename T>
void ChannelImpl<T>::Close() { void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_}; std::unique_lock<std::recursive_mutex> lock{mu_};
...@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() { ...@@ -203,6 +274,12 @@ void ChannelImpl<T>::Close() {
std::shared_ptr<QueueMessage> m = recvq.front(); std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front(); recvq.pop_front();
m->chan_closed = true; m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify(); m->Notify();
} }
...@@ -211,10 +288,70 @@ void ChannelImpl<T>::Close() { ...@@ -211,10 +288,70 @@ void ChannelImpl<T>::Close() {
std::shared_ptr<QueueMessage> m = sendq.front(); std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front(); sendq.pop_front();
m->chan_closed = true; m->chan_closed = true;
// Execute callback function (if any)
if (m->callback != nullptr) {
m->callback(ChannelAction::CLOSE);
}
m->Notify(); m->Notify();
} }
} }
template <typename T>
void ChannelImpl<T>::AddToSendQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
sendq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::AddToReceiveQ(
const void *referrer, T *data,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(ChannelAction)> cb) {
std::lock_guard<std::recursive_mutex> lock{mu_};
auto m = std::make_shared<QueueMessage>(data, cond);
m->referrer = referrer;
m->callback = cb;
recvq.push_back(m);
}
template <typename T>
void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = sendq.begin(); it != sendq.end();) {
std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
if (sendMsg->referrer == referrer) {
it = sendq.erase(it);
} else {
++it;
}
}
}
template <typename T>
void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
std::lock_guard<std::recursive_mutex> lock{mu_};
for (auto it = recvq.begin(); it != recvq.end();) {
std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
if (recvMsg->referrer == referrer) {
it = recvq.erase(it);
} else {
++it;
}
}
}
template <typename T> template <typename T>
ChannelImpl<T>::~ChannelImpl() { ChannelImpl<T>::~ChannelImpl() {
Close(); Close();
......
...@@ -19,7 +19,6 @@ limitations under the License. */ ...@@ -19,7 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
USE_NO_KERNEL_OP(go); USE_NO_KERNEL_OP(go);
USE_NO_KERNEL_OP(channel_close); USE_NO_KERNEL_OP(channel_close);
...@@ -27,6 +26,12 @@ USE_NO_KERNEL_OP(channel_create); ...@@ -27,6 +26,12 @@ USE_NO_KERNEL_OP(channel_create);
USE_NO_KERNEL_OP(channel_recv); USE_NO_KERNEL_OP(channel_recv);
USE_NO_KERNEL_OP(channel_send); USE_NO_KERNEL_OP(channel_send);
USE_NO_KERNEL_OP(elementwise_add); USE_NO_KERNEL_OP(elementwise_add);
USE_NO_KERNEL_OP(select);
USE_NO_KERNEL_OP(conditional_block);
USE_NO_KERNEL_OP(equal);
USE_NO_KERNEL_OP(assign);
USE_NO_KERNEL_OP(while);
USE_NO_KERNEL_OP(print);
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
...@@ -35,27 +40,15 @@ namespace paddle { ...@@ -35,27 +40,15 @@ namespace paddle {
namespace framework { namespace framework {
template <typename T> template <typename T>
void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name, LoDTensor *CreateVariable(Scope &scope, p::CPUPlace &place, std::string name,
T value) { T value) {
// Create LoDTensor<int> of dim [1,1] // Create LoDTensor<int> of dim [1]
auto var = scope.Var(name); auto var = scope.Var(name);
auto tensor = var->GetMutable<LoDTensor>(); auto tensor = var->GetMutable<LoDTensor>();
tensor->Resize({1, 1}); tensor->Resize({1});
T *expect = tensor->mutable_data<T>(place); T *expect = tensor->mutable_data<T>(place);
expect[0] = value; expect[0] = value;
} return tensor;
void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
// Create channel variable
scope.Var("Channel");
// Create Variables, x0 will be put into channel,
// result will be pulled from channel
CreateIntVariable(scope, place, "Status", false);
CreateIntVariable(scope, place, "x0", 99);
CreateIntVariable(scope, place, "result", 0);
} }
void AddOp(const std::string &type, const VariableNameMap &inputs, void AddOp(const std::string &type, const VariableNameMap &inputs,
...@@ -73,12 +66,116 @@ void AddOp(const std::string &type, const VariableNameMap &inputs, ...@@ -73,12 +66,116 @@ void AddOp(const std::string &type, const VariableNameMap &inputs,
op->SetAttrMap(attrs); op->SetAttrMap(attrs);
} }
void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
BlockDesc *casesBlock, int caseId, int caseType,
std::string caseChannel, std::string caseVarName,
std::function<void(BlockDesc *, Scope *)> func) {
std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
std::string caseCondXVarName =
std::string("caseCondX") + std::to_string(caseId);
BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
func(caseBlock, scope);
CreateVariable(*scope, *place, caseCondName, false);
CreateVariable(*scope, *place, caseCondXVarName, caseId);
CreateVariable(*scope, *place, caseVarName, caseId);
scope->Var("step_scope");
AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
{{"Out", {caseCondName}}}, {}, casesBlock);
AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
{{"Out", {}}, {"Scope", {"step_scope"}}},
{{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
}
void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
BlockDesc *parentBlock, std::string dataChanName,
std::string quitChanName) {
BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
CreateVariable(*scope, *place, "whileExitCond", true);
CreateVariable(*scope, *place, "caseToExecute", -1);
CreateVariable(*scope, *place, "case1var", 0);
CreateVariable(*scope, *place, "xtemp", 0);
// TODO(thuan): Need to create fibXToSend, since channel send moves the actual
// data, which makes it inaccessible for the Fibonacci calculation afterwards.
// TODO(abhinav): Change channel send to do a copy instead of a move!
CreateVariable(*scope, *place, "fibXToSend", 0);
CreateVariable(*scope, *place, "fibX", 0);
CreateVariable(*scope, *place, "fibY", 1);
CreateVariable(*scope, *place, "quitVar", 0);
BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
// TODO(thuan): Remove this once we change channel send to do a copy instead
// of move
AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
// Case 0: Send to dataChanName
std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
BlockDesc *caseBlock, Scope *scope) {
AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
{{"Out", {"fibY"}}}, {}, caseBlock);
};
AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
case0Func);
std::string case0Config =
std::string("0,1,") + dataChanName + std::string(",fibXToSend");
// Case 1: Receive from quitChanName
std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
BlockDesc *caseBlock, Scope *scope) {
// Exit the while loop after we receive from quit channel.
// We assign false to the "whileExitCond" variable, which
// breaks out of the while_op loop.
CreateVariable(*scope, *place, "whileFalse", false);
AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
caseBlock);
};
AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
case2Func);
std::string case1Config =
std::string("1,2,") + quitChanName + std::string(",quitVar");
// Select block
AddOp("select", {{"X", {dataChanName, quitChanName}},
{"case_to_execute", {"caseToExecute"}}},
{}, {{"sub_block", casesBlock},
{"cases", std::vector<std::string>{case0Config, case1Config}}},
whileBlock);
scope->Var("stepScopes");
AddOp("while",
{{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
{{"Out", {}}, {"StepScopes", {"stepScopes"}}},
{{"sub_block", whileBlock}}, parentBlock);
}
TEST(Concurrency, Go_Op) { TEST(Concurrency, Go_Op) {
Scope scope; Scope scope;
p::CPUPlace place; p::CPUPlace place;
// Initialize scope variables // Initialize scope variables
InitTensorsInScope(scope, place); p::CPUDeviceContext ctx(place);
// Create channel variable
scope.Var("Channel");
// Create Variables, x0 will be put into channel,
// result will be pulled from channel
CreateVariable(scope, place, "Status", false);
CreateVariable(scope, place, "x0", 99);
CreateVariable(scope, place, "result", 0);
framework::Executor executor(place); framework::Executor executor(place);
ProgramDesc program; ProgramDesc program;
...@@ -118,5 +215,78 @@ TEST(Concurrency, Go_Op) { ...@@ -118,5 +215,78 @@ TEST(Concurrency, Go_Op) {
auto *finalData = tensor.data<int>(); auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 99); EXPECT_EQ(finalData[0], 99);
} }
/**
* This test implements a Fibonacci generator using go_op and select_op.
*/
TEST(Concurrency, Select) {
Scope scope;
p::CPUPlace place;
// Initialize scope variables
p::CPUDeviceContext ctx(place);
CreateVariable(scope, place, "Status", false);
CreateVariable(scope, place, "result", 0);
CreateVariable(scope, place, "currentXFib", 0);
framework::Executor executor(place);
ProgramDesc program;
BlockDesc *block = program.MutableBlock(0);
// Create channel OP
std::string dataChanName = "Channel";
scope.Var(dataChanName);
AddOp("channel_create", {}, {{"Out", {dataChanName}}},
{{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
std::string quitChanName = "Quit";
scope.Var(quitChanName);
AddOp("channel_create", {}, {{"Out", {quitChanName}}},
{{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
// Create Go Op routine, which loops 10 times over fibonacci sequence
CreateVariable(scope, place, "xReceiveVar", 0);
BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
for (int i = 0; i < 10; ++i) {
AddOp("channel_recv", {{"Channel", {dataChanName}}},
{{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
{{"first_n", 100},
{"summarize", -1},
{"print_tensor_name", false},
{"print_tensor_type", true},
{"print_tensor_shape", false},
{"print_tensor_lod", false},
{"print_phase", std::string("FORWARD")},
{"message", std::string("X: ")}},
goOpBlock);
}
CreateVariable(scope, place, "quitSignal", 0);
AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
{{"Status", {"Status"}}}, {}, goOpBlock);
// Create Go Op
AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
{{"sub_block", goOpBlock}}, block);
AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
quitChanName);
// Create Channel Close Op
AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
executor.Run(program, &scope, 0, true, true);
// After executor.Run, the "currentXFib" variable should equal 34
// (the value reached after 10 iterations of the Fibonacci loop).
const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
auto *finalData = tensor.data<int>();
EXPECT_EQ(finalData[0], 34);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
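The Select test wires up, with go/select/while/conditional_block ops, the familiar pattern of a Fibonacci producer that keeps sending until the consumer signals quit. For orientation, the same control flow in plain C++ threads looks roughly like the sketch below (illustrative only; BoundedQueue and every name in it are made up here, not Paddle types):

// Illustrative only: the producer/consumer control flow of the Select test,
// with a capacity-1 queue standing in for the unbuffered channels.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class BoundedQueue {
 public:
  // Blocks until the slot is free; returns false once the queue is closed.
  bool Push(int v) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return !has_value_ || closed_; });
    if (closed_) return false;
    value_ = v;
    has_value_ = true;
    not_empty_.notify_one();
    return true;
  }
  int Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [this] { return has_value_; });
    has_value_ = false;
    not_full_.notify_one();
    return value_;
  }
  void Close() {
    std::lock_guard<std::mutex> guard(mu_);
    closed_ = true;
    not_full_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
  int value_ = 0;
  bool has_value_ = false;
  bool closed_ = false;
};

int main() {
  BoundedQueue data;

  // Producer: the role of the select/while block -- emit Fibonacci numbers
  // until the consumer closes the channel (the "quit" signal).
  std::thread fib([&data] {
    int x = 0, y = 1;
    while (data.Push(x)) {
      int next = x + y;
      x = y;
      y = next;
    }
  });

  // Consumer: the role of the go-op block -- read ten values, then quit.
  int last = 0;
  for (int i = 0; i < 10; ++i) {
    last = data.Pop();
    std::cout << "X: " << last << "\n";
  }
  data.Close();
  fib.join();
  std::cout << "last = " << last << "\n";  // 34, matching the EXPECT_EQ above
  return 0;
}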
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
...@@ -33,6 +34,20 @@ DEFINE_bool(check_nan_inf, false, ...@@ -33,6 +34,20 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace {
// Block ids start from 0, so this pseudo id (-1) represents the whole
// program, i.e. the code block that wraps block 0.
int kProgramId = -1;
} // namespace
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {}
const framework::ProgramDesc& prog_;
size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
};
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
...@@ -85,73 +100,10 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -85,73 +100,10 @@ static void CheckTensorNANOrInf(const std::string& name,
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail): platform::RecordBlock b(block_id);
// - only runs on the first device (i.e. no interdevice communication) auto* ctx = Prepare(pdesc, block_id);
// - will change to use multiple blocks for RNN op and Cond Op RunPreparedContext(ctx, scope, create_local_scope, create_vars);
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size()); delete ctx;
auto& block = pdesc.Block(block_id);
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
}
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(2) << "-------------------------------------------------------";
}
} }
// Check whether the block already has feed operators and feed_holder. // Check whether the block already has feed operators and feed_holder.
...@@ -239,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -239,6 +191,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, LoDTensor*>& fetch_targets, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name, const std::string& feed_holder_name,
const std::string& fetch_holder_name) { const std::string& fetch_holder_name) {
platform::RecordBlock b(kProgramId);
auto* copy_program = new ProgramDesc(program); auto* copy_program = new ProgramDesc(program);
auto* global_block = copy_program->MutableBlock(0); auto* global_block = copy_program->MutableBlock(0);
...@@ -313,5 +266,81 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, ...@@ -313,5 +266,81 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
delete copy_program; delete copy_program;
} }
ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
int block_id) {
auto* ctx = new ExecutorPrepareContext(program, block_id);
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
return ctx;
}
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op : ctx->ops_) {
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
}
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(2) << "-------------------------------------------------------";
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct ExecutorPrepareContext;
class Executor { class Executor {
public: public:
// TODO(dzhwinter) : Do not rely on this function, it will be removed // TODO(dzhwinter) : Do not rely on this function, it will be removed
...@@ -38,8 +38,8 @@ class Executor { ...@@ -38,8 +38,8 @@ class Executor {
* ProgramDesc * ProgramDesc
* Scope * Scope
*/ */
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, void Run(const ProgramDesc& prog, Scope* scope, int block_id,
bool create_vars = true); bool create_local_scope = true, bool create_vars = true);
void Run(const ProgramDesc& program, Scope* scope, void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets, std::map<std::string, const LoDTensor*>& feed_targets,
...@@ -47,6 +47,13 @@ class Executor { ...@@ -47,6 +47,13 @@ class Executor {
const std::string& feed_holder_name = "feed", const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch"); const std::string& fetch_holder_name = "fetch");
static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
private: private:
const platform::Place place_; const platform::Place place_;
}; };
......
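Prepare factors operator construction out of Run, so a block can be turned into OperatorBase instances once and then executed repeatedly via RunPreparedContext. A hedged usage sketch based only on the signatures declared above (it assumes the Paddle fluid headers plus an already-built ProgramDesc, Scope and place; it is not a standalone program):

// Sketch: reuse a prepared context across repeated executions.
paddle::framework::Executor executor(place);
auto* ctx = paddle::framework::Executor::Prepare(program, /*block_id=*/0);
for (int step = 0; step < num_steps; ++step) {
  // Operator creation is skipped on every iteration; only variable
  // creation (if requested) and op->Run happen here.
  executor.RunPreparedContext(ctx, &scope, /*create_local_scope=*/true,
                              /*create_vars=*/true);
}
delete ctx;  // Prepare returns a raw pointer owned by the caller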
...@@ -19,6 +19,9 @@ limitations under the License. */ ...@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#include <stdint.h> #include <stdint.h>
#include <string.h> #include <string.h>
#include <algorithm> #include <algorithm>
...@@ -291,6 +294,31 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, ...@@ -291,6 +294,31 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx); TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
} }
void WriteToRecordIO(recordio::Writer &writer,
const std::vector<LoDTensor> &tensor,
const platform::DeviceContext &dev_ctx) {
std::stringstream buffer;
uint32_t sz = static_cast<uint32_t>(tensor.size());
buffer.write(reinterpret_cast<const char *>(&sz), sizeof(uint32_t));
for (auto &each : tensor) {
SerializeToStream(buffer, each, dev_ctx);
}
writer.Write(buffer.str());
}
std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) {
std::istringstream sin(scanner.Next());
uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
std::vector<LoDTensor> result;
result.resize(sz);
for (uint32_t i = 0; i < sz; ++i) {
DeserializeFromStream(sin, &result[i], dev_ctx);
}
return result;
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor( std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
const std::vector<platform::Place> places) const { const std::vector<platform::Place> places) const {
check_memory_size(); check_memory_size();
......
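WriteToRecordIO frames one record as a 32-bit tensor count followed by the serialized tensors, and ReadFromRecordIO reverses that framing. The framing idea in isolation, with plain std streams and strings instead of LoDTensor (WriteBatch/ReadBatch are hypothetical helper names):

// Illustrative only: length-prefixed framing of a batch inside one record.
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Write a batch as: uint32 count, then (uint32 size, bytes) per item.
std::string WriteBatch(const std::vector<std::string>& items) {
  std::ostringstream out;
  uint32_t n = static_cast<uint32_t>(items.size());
  out.write(reinterpret_cast<const char*>(&n), sizeof(n));
  for (const auto& item : items) {
    uint32_t sz = static_cast<uint32_t>(item.size());
    out.write(reinterpret_cast<const char*>(&sz), sizeof(sz));
    out.write(item.data(), sz);
  }
  return out.str();
}

std::vector<std::string> ReadBatch(const std::string& record) {
  std::istringstream in(record);
  uint32_t n = 0;
  in.read(reinterpret_cast<char*>(&n), sizeof(n));
  std::vector<std::string> items(n);
  for (uint32_t i = 0; i < n; ++i) {
    uint32_t sz = 0;
    in.read(reinterpret_cast<char*>(&sz), sizeof(sz));
    items[i].resize(sz);
    in.read(&items[i][0], sz);
  }
  return items;
}

int main() {
  auto record = WriteBatch({"tensor-0", "tensor-1"});
  for (const auto& s : ReadBatch(record)) std::cout << s << "\n";
  return 0;
}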
...@@ -29,6 +29,12 @@ limitations under the License. */ ...@@ -29,6 +29,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace recordio {
class Writer;
class Scanner;
}
namespace framework { namespace framework {
/* /*
...@@ -209,5 +215,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, ...@@ -209,5 +215,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
void DeserializeFromStream(std::istream& is, LoDTensor* tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
extern void WriteToRecordIO(recordio::Writer& writer,
const std::vector<LoDTensor>& tensor,
const platform::DeviceContext& dev_ctx);
extern std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
...@@ -224,5 +227,43 @@ TEST(LoD, CheckAbsLoD) { ...@@ -224,5 +227,43 @@ TEST(LoD, CheckAbsLoD) {
abs_lod0.push_back(std::vector<size_t>({0})); abs_lod0.push_back(std::vector<size_t>({0}));
ASSERT_FALSE(CheckAbsLoD(abs_lod0)); ASSERT_FALSE(CheckAbsLoD(abs_lod0));
} }
TEST(LoDTensor, RecordIO) {
LoDTensor tensor;
int* tmp = tensor.mutable_data<int>(make_ddim({4, 5}), platform::CPUPlace());
for (int i = 0; i < 20; ++i) {
tmp[i] = i;
}
std::stringstream* stream = new std::stringstream();
auto& ctx =
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
{
recordio::Writer writer(stream, recordio::Compressor::kSnappy);
WriteToRecordIO(writer, {tensor, tensor}, ctx);
WriteToRecordIO(writer, {tensor, tensor}, ctx);
writer.Flush();
}
auto assert_tensor_ok = [](const LoDTensor& tensor) {
for (int i = 0; i < 20; ++i) {
ASSERT_EQ(tensor.data<int>()[i], i);
}
};
{
std::unique_ptr<std::istream> stream_ptr(stream);
recordio::Scanner scanner(std::move(stream_ptr));
auto tensors = ReadFromRecordIO(scanner, ctx);
ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]);
tensors = ReadFromRecordIO(scanner, ctx);
ASSERT_EQ(tensors.size(), 2);
assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]);
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -442,15 +442,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -442,15 +442,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
} }
std::vector<DDim> GetRepeatedDims(const std::string& name) const override { std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
Variable* var = scope_.FindVar(name); PADDLE_THROW("Only compile time support this method");
if (var->IsType<ReaderHolder>()) {
return var->Get<ReaderHolder>().shapes();
} else {
PADDLE_THROW(
"Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
"type_id is %s.",
name, var->Type().name());
}
} }
void SetDim(const std::string& name, const DDim& dim) override { void SetDim(const std::string& name, const DDim& dim) override {
...@@ -467,15 +459,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -467,15 +459,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
void SetRepeatedDims(const std::string& name, void SetRepeatedDims(const std::string& name,
const std::vector<DDim>& dims) override { const std::vector<DDim>& dims) override {
Variable* var = scope_.FindVar(name); PADDLE_THROW("Only compile time support this method");
if (var->IsType<ReaderHolder>()) {
var->GetMutable<ReaderHolder>()->set_shapes(dims);
} else {
PADDLE_THROW(
"Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
"type_id is %s.",
name, var->Type().name());
}
} }
proto::VarType::Type GetVarType(const std::string& name) const override { proto::VarType::Type GetVarType(const std::string& name) const override {
...@@ -497,8 +481,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -497,8 +481,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(place); auto* dev_ctx = pool.Get(place);
// profile
// For profiling, don't move out of this function because that will result
// in the failure of multi-GPU profiling.
platform::RecordEvent record_event(Type(), dev_ctx); platform::RecordEvent record_event(Type(), dev_ctx);
// check if op[type] has kernel registered. // check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels(); auto& all_op_kernels = AllOpKernels();
......
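The comment above notes that RecordEvent must stay inside RunImpl so multi-GPU profiling attributes the time to the right place; RecordEvent itself is an RAII probe that marks the begin and end of the enclosing scope. A generic, self-contained sketch of that RAII pattern (ScopedEvent is a made-up name, not the Paddle profiler API):

// Illustrative only: an RAII scoped probe in the spirit of RecordEvent.
#include <chrono>
#include <iostream>
#include <string>

class ScopedEvent {
 public:
  explicit ScopedEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedEvent probe("op_run");  // destroyed at scope exit, like RecordEvent
  // ... run the operator ...
  return 0;
}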
...@@ -16,14 +16,22 @@ ...@@ -16,14 +16,22 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
ReaderBase::~ReaderBase() {}
DDim ReaderBase::shape(size_t idx) const { FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
PADDLE_ENFORCE_LT(
idx, shapes_.size(), void FileReader::ReadNext(std::vector<LoDTensor> *out) {
"Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx, ReadNextImpl(out);
shapes_.size()); PADDLE_ENFORCE_EQ(out->size(), dims_.size());
return shapes_[idx]; for (size_t i = 0; i < dims_.size(); ++i) {
} auto &actual = out->at(i).dims();
auto &expect = dims_[i];
PADDLE_ENFORCE_EQ(actual.size(), expect.size());
for (int j = 0; j < actual.size(); ++j) {
PADDLE_ENFORCE(actual[j] == expect[j] || expect[j] == -1);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -16,47 +16,53 @@ ...@@ -16,47 +16,53 @@
#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/platform/place.h"
#include <memory>
#include <thread>
#include <vector>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class ReaderBase { class ReaderBase {
public: public:
explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
PADDLE_ENFORCE(!shapes_.empty());
}
virtual void ReadNext(std::vector<LoDTensor>* out) = 0; virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual void ReInit() = 0; virtual void ReInit() = 0;
DDim shape(size_t idx) const; virtual bool HasNext() const = 0;
std::vector<DDim> shapes() const { return shapes_; }
void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
virtual ~ReaderBase() {}
protected:
std::vector<DDim> shapes_;
};
class FileReader : public ReaderBase { virtual ~ReaderBase();
public:
explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
}; };
class DecoratedReader : public ReaderBase { class DecoratedReader : public ReaderBase {
public: public:
explicit DecoratedReader(ReaderBase* reader) explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
: ReaderBase(reader->shapes()), reader_(reader) {
PADDLE_ENFORCE_NOT_NULL(reader_); PADDLE_ENFORCE_NOT_NULL(reader_);
} }
void ReInit() override { reader_->ReInit(); } void ReInit() override { reader_->ReInit(); }
bool HasNext() const override { return reader_->HasNext(); }
protected: protected:
ReaderBase* reader_; ReaderBase* reader_;
}; };
class FileReader : public ReaderBase {
public:
explicit FileReader(const std::vector<DDim>& dims);
void ReadNext(std::vector<LoDTensor>* out) override;
protected:
virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
private:
std::vector<DDim> dims_;
};
// The ReaderHolder is used as the readers' unified wrapper, // The ReaderHolder is used as the readers' unified wrapper,
// making it easier to access different types of readers in Variables. // making it easier to access different types of readers in Variables.
class ReaderHolder { class ReaderHolder {
...@@ -65,14 +71,16 @@ class ReaderHolder { ...@@ -65,14 +71,16 @@ class ReaderHolder {
ReaderBase* Get() const { return reader_.get(); } ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); } void ReadNext(std::vector<LoDTensor>* out) {
void ReInit() { reader_->ReInit(); } PADDLE_ENFORCE_NOT_NULL(reader_);
reader_->ReadNext(out);
DDim shape(size_t idx) const { return reader_->shape(idx); }
std::vector<DDim> shapes() const { return reader_->shapes(); }
void set_shapes(const std::vector<DDim>& shapes) {
reader_->set_shapes(shapes);
} }
void ReInit() {
PADDLE_ENFORCE_NOT_NULL(reader_);
reader_->ReInit();
}
bool HasNext() const { return reader_->HasNext(); }
private: private:
std::unique_ptr<ReaderBase> reader_; std::unique_ptr<ReaderBase> reader_;
......
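The refactored reader hierarchy moves shape bookkeeping out of ReaderBase: FileReader now exposes a non-virtual ReadNext that validates whatever the protected ReadNextImpl produced against the expected dims, with -1 acting as a wildcard. A compact standalone illustration of that non-virtual-interface shape (all names hypothetical):

// Illustrative only: the non-virtual-interface shape of FileReader.
#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

class ShapedReader {
 public:
  explicit ShapedReader(std::vector<int> dims) : dims_(std::move(dims)) {}
  virtual ~ShapedReader() {}

  std::vector<int> ReadNext() {
    std::vector<int> out = ReadNextImpl();
    assert(out.size() == dims_.size());
    for (size_t i = 0; i < out.size(); ++i) {
      assert(dims_[i] == -1 || out[i] == dims_[i]);  // -1 is a wildcard dim
    }
    return out;
  }

 protected:
  virtual std::vector<int> ReadNextImpl() = 0;

 private:
  std::vector<int> dims_;
};

class FixedReader : public ShapedReader {
 public:
  FixedReader() : ShapedReader({-1, 5}) {}

 protected:
  std::vector<int> ReadNextImpl() override { return {32, 5}; }
};

int main() {
  FixedReader reader;
  std::vector<int> shape = reader.ReadNext();
  std::printf("batch: %d x %d\n", shape[0], shape[1]);
  return 0;
}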
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <mutex> // for call_once #include <mutex> // for call_once
#include <set>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -102,6 +103,18 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -102,6 +103,18 @@ void Scope::DeleteScope(Scope* scope) {
} }
} }
void Scope::EraseVars(std::vector<std::string>& var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
delete it->second;
it = vars_.erase(it);
} else {
++it;
}
}
}
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const { const std::string& new_name) const {
auto origin_it = vars_.find(origin_name); auto origin_it = vars_.find(origin_name);
......
...@@ -51,6 +51,8 @@ class Scope { ...@@ -51,6 +51,8 @@ class Scope {
/// Create a variable with a scope-unique name. /// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr); Variable* Var(std::string* name = nullptr);
void EraseVars(std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns /// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find. /// nullptr if cannot find.
Variable* FindVar(const std::string& name) const; Variable* FindVar(const std::string& name) const;
......
...@@ -115,11 +115,11 @@ void TestInference(const std::string& dirname, ...@@ -115,11 +115,11 @@ void TestInference(const std::string& dirname,
#endif #endif
} }
// Enable the profiler
paddle::platform::EnableProfiler(state);
// 2. Initialize the inference_program and load parameters // 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program; std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
// Enable the profiler
paddle::platform::EnableProfiler(state);
{ {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event(
"init_program", "init_program",
...@@ -143,6 +143,10 @@ void TestInference(const std::string& dirname, ...@@ -143,6 +143,10 @@ void TestInference(const std::string& dirname,
inference_program = paddle::inference::Load(executor, *scope, dirname); inference_program = paddle::inference::Load(executor, *scope, dirname);
} }
} }
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"load_program_profiler.txt");
paddle::platform::ResetProfiler();
// 3. Get the feed_target_names and fetch_target_names // 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
...@@ -165,6 +169,12 @@ void TestInference(const std::string& dirname, ...@@ -165,6 +169,12 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program // 6. Run the inference program
{ {
// Ignore the profiling results of the first run
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
// Enable the profiler
paddle::platform::EnableProfiler(state);
// Run repeat times to profile the performance // Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event(
...@@ -173,12 +183,13 @@ void TestInference(const std::string& dirname, ...@@ -173,12 +183,13 @@ void TestInference(const std::string& dirname,
executor.Run(*inference_program, scope, feed_targets, fetch_targets); executor.Run(*inference_program, scope, feed_targets, fetch_targets);
} }
}
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, paddle::platform::DisableProfiler(
"profiler.txt"); paddle::platform::EventSortingKey::kDefault,
paddle::platform::ResetProfiler(); "run_inference_profiler.txt");
paddle::platform::ResetProfiler();
}
delete scope; delete scope;
} }
...@@ -165,7 +165,6 @@ op_library(cond_op DEPS framework_proto tensor net_op) ...@@ -165,7 +165,6 @@ op_library(cond_op DEPS framework_proto tensor net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax) op_library(softmax_op DEPS softmax)
op_library(detection_output_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax)
op_library(sum_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor)
...@@ -203,6 +202,11 @@ op_library(save_combine_op DEPS lod_tensor) ...@@ -203,6 +202,11 @@ op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat) op_library(concat_op DEPS concat)
# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
add_subdirectory(concurrency)
op_library(channel_send_op DEPS concurrency)
op_library(channel_recv_op DEPS concurrency)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
op_library(${src}) op_library(${src})
...@@ -222,8 +226,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) ...@@ -222,8 +226,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op) cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
...@@ -56,6 +56,7 @@ class AssignFunctor { ...@@ -56,6 +56,7 @@ class AssignFunctor {
private: private:
void copy_tensor(const framework::LoDTensor &lod_tensor, void copy_tensor(const framework::LoDTensor &lod_tensor,
framework::LoDTensor *out) const { framework::LoDTensor *out) const {
if (lod_tensor.numel() == 0) return;
auto &out_tensor = *out; auto &out_tensor = *out;
TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
out_tensor.set_lod(lod_tensor.lod()); out_tensor.set_lod(lod_tensor.lod());
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -88,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>, ...@@ -88,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
ops::CastOpKernel<CPU, double>, ops::CastOpKernel<CPU, double>,
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int>,
ops::CastOpKernel<CPU, int64_t>, ops::CastOpKernel<CPU, int64_t>,
ops::CastOpKernel<CPU, bool>); ops::CastOpKernel<CPU, bool>,
ops::CastOpKernel<CPU, paddle::platform::float16>);
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/platform/float16.h"
template <typename T> template <typename T>
using CastOpKernel = using CastOpKernel =
...@@ -20,4 +21,5 @@ using CastOpKernel = ...@@ -20,4 +21,5 @@ using CastOpKernel =
REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>, REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
CastOpKernel<int>, CastOpKernel<int64_t>, CastOpKernel<int>, CastOpKernel<int64_t>,
CastOpKernel<bool>); CastOpKernel<bool>,
CastOpKernel<paddle::platform::float16>);
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <paddle/fluid/framework/reader.h> #include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
...@@ -36,25 +37,6 @@ void SetReceiveStatus(const platform::Place &dev_place, ...@@ -36,25 +37,6 @@ void SetReceiveStatus(const platform::Place &dev_place,
status_tensor[0] = status; status_tensor[0] = status;
} }
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
// Get type of channel and use that to call mutable data for Variable
auto type = framework::ToVarType(ch->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Receive(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Receive(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Receive(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Receive(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Receive(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelReceive:Unsupported type");
}
class ChannelRecvOp : public framework::OperatorBase { class ChannelRecvOp : public framework::OperatorBase {
public: public:
ChannelRecvOp(const std::string &type, ChannelRecvOp(const std::string &type,
...@@ -81,7 +63,7 @@ class ChannelRecvOp : public framework::OperatorBase { ...@@ -81,7 +63,7 @@ class ChannelRecvOp : public framework::OperatorBase {
scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>(); scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
auto output_var = scope.FindVar(Output(Out)); auto output_var = scope.FindVar(Output(Out));
// Receive the data from the channel. // Receive the data from the channel.
bool ok = ChannelReceive(ch, output_var); bool ok = concurrency::ChannelReceive(ch, output_var);
// Set the status output of the `ChannelReceive` call. // Set the status output of the `ChannelReceive` call.
SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok); SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <paddle/fluid/framework/reader.h> #include <paddle/fluid/framework/reader.h>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
...@@ -37,24 +38,6 @@ void SetSendStatus(const platform::Place &dev_place, ...@@ -37,24 +38,6 @@ void SetSendStatus(const platform::Place &dev_place,
status_tensor[0] = status; status_tensor[0] = status;
} }
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelSend:Unsupported type");
}
class ChannelSendOp : public framework::OperatorBase { class ChannelSendOp : public framework::OperatorBase {
public: public:
ChannelSendOp(const std::string &type, ChannelSendOp(const std::string &type,
...@@ -82,7 +65,7 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -82,7 +65,7 @@ class ChannelSendOp : public framework::OperatorBase {
auto input_var = scope.FindVar(Input(X)); auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel. // Send the input data through the channel.
bool ok = ChannelSend(ch, input_var); bool ok = concurrency::ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call. // Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok); SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
......
cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "channel_util.h"
#include "paddle/fluid/framework/var_type.h"
namespace poc = paddle::operators::concurrency;
bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelSend:Unsupported type");
}
bool poc::ChannelReceive(framework::ChannelHolder *ch,
framework::Variable *var) {
// Get type of channel and use that to call mutable data for Variable
auto type = framework::ToVarType(ch->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Receive(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Receive(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Receive(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER)
return ch->Receive(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Receive(var->GetMutable<framework::ChannelHolder>());
else
PADDLE_THROW("ChannelReceive:Unsupported type");
}
void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
} else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_READER) {
ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_CHANNEL) {
ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
cb);
} else {
PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
}
}
void poc::ChannelAddToReceiveQ(
framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb) {
auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
cb);
} else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_READER) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
cond, cb);
} else if (type == framework::proto::VarType_Type_CHANNEL) {
ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
cond, cb);
} else {
PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
}
}
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
Indicesou may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
...@@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detection_output_op.h" #pragma once
namespace ops = paddle::operators; #include "paddle/fluid/framework/channel.h"
REGISTER_OP_CUDA_KERNEL( #include "paddle/fluid/framework/variable.h"
detection_output,
ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>, namespace paddle {
ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>); namespace operators {
namespace concurrency {
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb);
void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
framework::Variable *var,
std::shared_ptr<std::condition_variable_any> cond,
std::function<bool(framework::ChannelAction)> cb);
} // namespace concurrency
} // namespace operators
} // namespace paddle
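ChannelSend/ChannelReceive and the AddTo*Q helpers all dispatch on the variable's runtime type through an if/else chain over proto::VarType values. The same shape of type-directed dispatch, written standalone against std::variant instead of framework::Variable (illustrative only; requires C++17):

// Illustrative only: type-directed dispatch similar to ChannelSend's chain.
#include <iostream>
#include <string>
#include <type_traits>
#include <variant>
#include <vector>

using Value = std::variant<int, std::string, std::vector<float>>;

void SendToChannel(const Value& v) {
  std::visit(
      [](const auto& payload) {
        using T = std::decay_t<decltype(payload)>;
        if constexpr (std::is_same_v<T, int>) {
          std::cout << "send int\n";
        } else if constexpr (std::is_same_v<T, std::string>) {
          std::cout << "send string\n";
        } else {
          std::cout << "send float vector\n";
        }
      },
      v);
}

int main() {
  SendToChannel(Value{42});
  SendToChannel(Value{std::string("hello")});
  SendToChannel(Value{std::vector<float>{1.0f, 2.0f}});
  return 0;
}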
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,6 +28,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; ...@@ -27,6 +28,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout; using DataLayout = platform::DataLayout;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024; static_cast<size_t>(1024) * 1024 * 1024;
...@@ -133,7 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -133,7 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv forward --------------------- // ------------------- cudnn conv forward ---------------------
T alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
for (int i = 0; i < groups; i++) { for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
...@@ -280,7 +283,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -280,7 +283,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
// ------------------- cudnn conv backward data --------------------- // ------------------- cudnn conv backward data ---------------------
T alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
...@@ -315,16 +318,18 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> { ...@@ -315,16 +318,18 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, namespace plat = paddle::platform;
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>); paddle::operators::CUDNNConvOpKernel<double>,
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace, paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>, paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>); paddle::operators::CUDNNConvGradOpKernel<double>);
REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace, REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>, paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<double>); paddle::operators::CUDNNConvOpKernel<double>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace, REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>, paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>); paddle::operators::CUDNNConvGradOpKernel<double>);
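The switch from T to ScalingParamType<T> for alpha/beta reflects that cuDNN expects float scaling factors even when the tensor data is float16. A rough standalone illustration of selecting a scaling type from the data type (the float16 struct below is only a stand-in for platform::float16):

// Illustrative only: choosing a scaling-factor type per data type, in the
// spirit of platform::CudnnDataType<T>::ScalingParamType.
#include <iostream>
#include <type_traits>

struct float16 { unsigned short bits; };  // stand-in for platform::float16

template <typename T>
struct ScalingParam {
  // Use float scaling factors for half-precision data, the data type itself
  // otherwise.
  using type = typename std::conditional<std::is_same<T, float16>::value,
                                         float, T>::type;
};

template <typename T>
void RunKernel() {
  typename ScalingParam<T>::type alpha = 1.0f, beta = 0.0f;
  std::cout << "alpha is " << sizeof(alpha) << " bytes\n";
  (void)beta;
}

int main() {
  RunKernel<float>();    // alpha: float
  RunKernel<double>();   // alpha: double
  RunKernel<float16>();  // alpha: float, not float16
  return 0;
}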
...@@ -83,12 +83,23 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -83,12 +83,23 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
#endif #endif
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("Input")->type());
auto filter_data_type =
framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent");
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
}
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), library_);
layout_, library_);
} }
Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
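The checks added above enforce two things: Input and Filter must share one data type, and float16 is only allowed on the cuDNN path. A minimal standalone sketch of that decision logic (the enum names and the PickKernelType helper are illustrative stand-ins, not Paddle APIs):
#include <stdexcept>

enum class DataType { FP16, FP32, FP64 };
enum class Library { kPlain, kCUDNN };

// Mirrors the checks added above: input and filter must share a data type,
// and float16 is only valid when the cuDNN library path was selected.
DataType PickKernelType(DataType input, DataType filter, Library lib) {
  if (input != filter)
    throw std::runtime_error("input and filter data type should be consistent");
  if (input == DataType::FP16 && lib != Library::kCUDNN)
    throw std::runtime_error("float16 can only be used when CUDNN is used");
  return input;
}

int main() {
  PickKernelType(DataType::FP16, DataType::FP16, Library::kCUDNN);  // ok
  return 0;
}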
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
class DeleteVarOp : public framework::OperatorBase {
public:
DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
auto delete_var_names = Inputs("X");
const_cast<framework::Scope &>(scope).EraseVars(delete_var_names);
}
};
class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public:
DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of delete op").AsDuplicable();
AddComment(R"DOC(
Delete Operator.
It should not be configured by users directly.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::DeleteVarOpInfoMaker);
...@@ -85,4 +85,4 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { ...@@ -85,4 +85,4 @@ google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -97,7 +97,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -97,7 +97,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
return true; return true;
} }
bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep); const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
...@@ -108,8 +108,18 @@ bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { ...@@ -108,8 +108,18 @@ bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s); rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++; req_count_++;
}
return true; void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
const auto ch = GetChannel(ep);
FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++;
} }
bool RPCClient::Wait() { bool RPCClient::Wait() {
...@@ -154,7 +164,7 @@ bool RPCClient::Proceed() { ...@@ -154,7 +164,7 @@ bool RPCClient::Proceed() {
PADDLE_ENFORCE(tag); PADDLE_ENFORCE(tag);
// TODO(gongwb): add more retries. // TODO(gongwb): add more retries.
ClientBase* c = static_cast<ClientBase*>(tag); BaseProcessor* c = static_cast<BaseProcessor*>(tag);
if (!c->status_.ok()) { if (!c->status_.ok()) {
LOG(ERROR) << "proc param error:" << c->var_h_.String() LOG(ERROR) << "proc param error:" << c->var_h_.String()
<< " grpc error:" << c->status_.error_message(); << " grpc error:" << c->status_.error_message();
...@@ -174,6 +184,8 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) { ...@@ -174,6 +184,8 @@ std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
} }
grpc::ChannelArguments args; grpc::ChannelArguments args;
args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
args.SetMaxSendMessageSize(std::numeric_limits<int>::max()); args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
......
...@@ -52,14 +52,14 @@ struct VarHandle { ...@@ -52,14 +52,14 @@ struct VarHandle {
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& msg); const sendrecv::VariableMessage& msg);
class ClientBase { class BaseProcessor {
public: public:
explicit ClientBase(std::shared_ptr<grpc::Channel> ch) { explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
stub_ = sendrecv::SendRecvService::NewStub(ch); stub_ = sendrecv::SendRecvService::NewStub(ch);
context_ = NULL; context_ = NULL;
} }
virtual ~ClientBase() {} virtual ~BaseProcessor() {}
virtual void Prepare(const VarHandle& var_info, int64_t time_out) { virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
context_.reset(new grpc::ClientContext()); context_.reset(new grpc::ClientContext());
...@@ -91,9 +91,10 @@ class ClientBase { ...@@ -91,9 +91,10 @@ class ClientBase {
typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)> typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
RequestSendCallBack; RequestSendCallBack;
class SendProcessor : public ClientBase { class SendProcessor : public BaseProcessor {
public: public:
explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {} explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {}
virtual ~SendProcessor() {} virtual ~SendProcessor() {}
...@@ -110,9 +111,10 @@ class SendProcessor : public ClientBase { ...@@ -110,9 +111,10 @@ class SendProcessor : public ClientBase {
typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)> typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
RequestGetCallBack; RequestGetCallBack;
class GetProcessor : public ClientBase { class GetProcessor : public BaseProcessor {
public: public:
explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {} explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {}
virtual ~GetProcessor() {} virtual ~GetProcessor() {}
...@@ -126,10 +128,10 @@ class GetProcessor : public ClientBase { ...@@ -126,10 +128,10 @@ class GetProcessor : public ClientBase {
RequestGetCallBack response_call_back_ = ProcGetResponse; RequestGetCallBack response_call_back_ = ProcGetResponse;
}; };
class BatchBarrierProcessor : public ClientBase { class BatchBarrierProcessor : public BaseProcessor {
public: public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch) explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: ClientBase(ch) {} : BaseProcessor(ch) {}
virtual ~BatchBarrierProcessor() {} virtual ~BatchBarrierProcessor() {}
...@@ -137,6 +139,17 @@ class BatchBarrierProcessor : public ClientBase { ...@@ -137,6 +139,17 @@ class BatchBarrierProcessor : public ClientBase {
sendrecv::VoidMessage reply_; sendrecv::VoidMessage reply_;
}; };
class FetchBarrierProcessor : public BaseProcessor {
public:
explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: BaseProcessor(ch) {}
virtual ~FetchBarrierProcessor() {}
virtual void Process() {}
sendrecv::VariableMessage reply_;
};
class RPCClient { class RPCClient {
public: public:
bool AsyncSendVariable(const std::string& ep, bool AsyncSendVariable(const std::string& ep,
...@@ -151,7 +164,10 @@ class RPCClient { ...@@ -151,7 +164,10 @@ class RPCClient {
const std::string& var_name, const std::string& var_name,
int64_t time_out = 600 * 1000); int64_t time_out = 600 * 1000);
bool AsyncSendBatchBarrier(const std::string& ep, void AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = 600 * 1000);
void AsyncSendFetchBarrier(const std::string& ep,
int64_t time_out = 600 * 1000); int64_t time_out = 600 * 1000);
bool Wait(); bool Wait();
......
...@@ -84,7 +84,7 @@ class RequestGet final : public RequestBase { ...@@ -84,7 +84,7 @@ class RequestGet final : public RequestBase {
explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
grpc::ServerCompletionQueue* cq, framework::Scope* scope, grpc::ServerCompletionQueue* cq, framework::Scope* scope,
const platform::DeviceContext* dev_ctx, const platform::DeviceContext* dev_ctx,
SimpleBlockQueue<char>* queue) SimpleBlockQueue<MessageWithName>* queue)
: RequestBase(service, cq), : RequestBase(service, cq),
responder_(&ctx_), responder_(&ctx_),
scope_(scope), scope_(scope),
...@@ -101,11 +101,16 @@ class RequestGet final : public RequestBase { ...@@ -101,11 +101,16 @@ class RequestGet final : public RequestBase {
// proc request. // proc request.
std::string var_name = request_.varname(); std::string var_name = request_.varname();
auto* var = scope_->FindVar(var_name); auto* var = scope_->FindVar(var_name);
SerializeToMessage(var_name, var, *dev_ctx_, &reply_); if (var_name != FETCH_BARRIER_MESSAGE) {
SerializeToMessage(var_name, var, *dev_ctx_, &reply_);
}
// TODO(gongwb): check var's info. // TODO(gongwb): check var's info.
responder_.Finish(reply_, grpc::Status::OK, this); responder_.Finish(reply_, grpc::Status::OK, this);
status_ = FINISH; status_ = FINISH;
queue_->Push('c'); MessageWithName msg_with_name =
// pair of (request variable name, reply message)
std::make_pair(var_name, std::move(reply_));
queue_->Push(msg_with_name);
} }
protected: protected:
...@@ -114,12 +119,16 @@ class RequestGet final : public RequestBase { ...@@ -114,12 +119,16 @@ class RequestGet final : public RequestBase {
ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_; ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
framework::Scope* scope_; framework::Scope* scope_;
const platform::DeviceContext* dev_ctx_; const platform::DeviceContext* dev_ctx_;
SimpleBlockQueue<char>* queue_; SimpleBlockQueue<MessageWithName>* queue_;
}; };
void AsyncGRPCServer::WaitClientGet(int count) { void AsyncGRPCServer::WaitClientGet(int count) {
for (int i = 0; i < count; ++i) { int fetch_barriers = 0;
var_get_queue_.Pop(); while (fetch_barriers < count) {
auto msg = var_get_queue_.Pop();
if (msg.first == FETCH_BARRIER_MESSAGE) {
fetch_barriers++;
}
} }
} }
......
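To make the reworked WaitClientGet concrete: instead of popping a fixed number of sentinel characters, the server now pops (name, message) pairs and counts only those whose name is the fetch-barrier marker. A minimal, self-contained sketch of that counting loop, with a plain std::queue standing in for SimpleBlockQueue:
#include <cassert>
#include <queue>
#include <string>
#include <utility>

// Simplified stand-ins for Paddle's MessageWithName and SimpleBlockQueue.
using MessageWithName = std::pair<std::string, std::string>;
static const char kFetchBarrier[] = "FETCH_BARRIER@RECV";

// Block until `count` fetch-barrier markers have been seen; ordinary
// variable messages popped in between are simply discarded.
void WaitClientGet(std::queue<MessageWithName>& q, int count) {
  int fetch_barriers = 0;
  while (fetch_barriers < count) {
    assert(!q.empty());  // a real blocking queue would wait here instead
    MessageWithName msg = q.front();
    q.pop();
    if (msg.first == kFetchBarrier) ++fetch_barriers;
  }
}

int main() {
  std::queue<MessageWithName> q;
  q.push({"w1@pserver", "grad bytes"});
  q.push({kFetchBarrier, ""});
  q.push({"w2@pserver", "grad bytes"});
  q.push({kFetchBarrier, ""});
  WaitClientGet(q, 2);  // returns only after both barrier markers are popped
  return 0;
}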
...@@ -77,7 +77,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { ...@@ -77,7 +77,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
const platform::DeviceContext *dev_ctx_; const platform::DeviceContext *dev_ctx_;
// received variable from RPC, operators fetch variable from this queue. // received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<MessageWithName> var_recv_queue_; SimpleBlockQueue<MessageWithName> var_recv_queue_;
SimpleBlockQueue<char> var_get_queue_; SimpleBlockQueue<MessageWithName> var_get_queue_;
// condition of the sub program // condition of the sub program
std::mutex barrier_mutex_; std::mutex barrier_mutex_;
......
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
......
...@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
DestroyCallback destroy_callback = [](void* backing) {}; DestroyCallback destroy_callback = [](void* backing) {};
void* buf = malloc(1024); void* buf = malloc(1024);
void* payload; void* payload = nullptr;
size_t payload_size; size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024); ProtoEncodeHelper e((char*)buf, 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name); e.WriteString(VarMsg::kVarnameFieldNumber, name);
...@@ -297,4 +297,4 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -297,4 +297,4 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -32,6 +32,7 @@ namespace detail { ...@@ -32,6 +32,7 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
typedef void (*DestroyCallback)(void*); typedef void (*DestroyCallback)(void*);
......
...@@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>& true_pos, std::map<int, std::vector<std::pair<T, int>>>& true_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos, std::map<int, std::vector<std::pair<T, int>>>& false_pos,
const int class_num) const { const int class_num) const {
constexpr T kEPS = static_cast<T>(1e-6);
const int* pos_count_data = input_pos_count.data<int>(); const int* pos_count_data = input_pos_count.data<int>();
for (int i = 0; i < class_num; ++i) { for (int i = 0; i < class_num; ++i) {
label_pos_count[i] = pos_count_data[i]; label_pos_count[i] = pos_count_data[i];
...@@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod(); auto pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size(); ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
int flag = 1; int flag = pos_data[j * 2 + 1];
if (pos_data[j * 2 + 1] < kEPS) flag = 0;
pos[i].push_back(std::make_pair(score, flag)); pos[i].push_back(std::make_pair(score, flag));
} }
} }
......
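The fix above reads the level-0 LoD offsets once and iterates the half-open ranges [lod[i], lod[i+1]). A standalone illustration of that traversal, with made-up offsets and (score, flag) rows:
#include <iostream>
#include <vector>

// Level-0 LoD offsets {0, 2, 5} describe two sequences covering rows
// [0, 2) and [2, 5); each row stores (score, flag) as two consecutive values.
int main() {
  std::vector<size_t> lod0 = {0, 2, 5};
  std::vector<float> pos_data = {0.9f, 1, 0.4f, 0, 0.8f, 1, 0.7f, 1, 0.1f, 0};

  for (size_t i = 0; i < lod0.size() - 1; ++i) {      // one entry per sequence
    for (size_t j = lod0[i]; j < lod0[i + 1]; ++j) {  // rows of that sequence
      float score = pos_data[j * 2];
      int flag = static_cast<int>(pos_data[j * 2 + 1]);
      std::cout << "seq " << i << ": score=" << score << " flag=" << flag << "\n";
    }
  }
  return 0;
}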
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection_output_op.h"
namespace paddle {
namespace operators {
class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
public:
DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Loc",
"(Tensor) The input tensor of detection_output operator."
"The input predict locations"
"The format of input tensor is kNCHW. Where K is priorbox point "
"numbers,"
"N is How many boxes are there on each point, "
"C is 4, H and W both are 1.");
AddInput("Conf",
"(Tensor) The input tensor of detection_output operator."
"The input priorbox confidence."
"The format of input tensor is kNCHW. Where K is priorbox point "
"numbers,"
"N is How many boxes are there on each point, "
"C is the number of classes, H and W both are 1.");
AddInput("PriorBox",
"(Tensor) The input tensor of detection_output operator."
"The format of input tensor is the position and variance "
"of the boxes");
AddOutput("Out",
"(Tensor) The output tensor of detection_output operator.");
AddAttr<int>("background_label_id", "(int), The background class index.");
AddAttr<int>("num_classes", "(int), The number of the classification.");
AddAttr<float>("nms_threshold",
"(float), The Non-maximum suppression threshold.");
AddAttr<float>("confidence_threshold",
"(float), The classification confidence threshold.");
AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
AddAttr<int>("nms_top_k",
"(int), The bbox number kept of the NMS’s output.");
AddComment(R"DOC(
detection output for SSD(single shot multibox detector)
Apply the NMS to the output of network and compute the predict
bounding box location. The output’s shape of this layer could
be zero if there is no valid bounding box.
)DOC");
}
};
class DetectionOutputOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Loc"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Conf"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of DetectionOutputOp should not be null.");
std::vector<int64_t> output_shape({1, 7});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
ops::DetectionOutputOpMaker);
REGISTER_OP_CPU_KERNEL(
detection_output,
ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/detection_util.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
inline void transpose_fun(const framework::ExecutionContext& context,
const framework::Tensor& src,
framework::Tensor* dst) {
int input_nums = src.dims()[0];
int offset = 0;
for (int j = 0; j < input_nums; ++j) {
framework::Tensor in_p_tensor = src.Slice(j, j + 1);
std::vector<int64_t> shape_vec(
{in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3],
in_p_tensor.dims()[4], in_p_tensor.dims()[2]});
framework::DDim shape(framework::make_ddim(shape_vec));
framework::Tensor in_p_tensor_transpose;
in_p_tensor_transpose.mutable_data<T>(shape, context.GetPlace());
std::vector<int> shape_axis({0, 1, 3, 4, 2});
math::Transpose<DeviceContext, T, 5> trans5;
trans5(context.template device_context<DeviceContext>(), in_p_tensor,
&in_p_tensor_transpose, shape_axis);
auto dst_stride = framework::stride(dst->dims());
auto src_stride = framework::stride(in_p_tensor_transpose.dims());
StridedMemcpy<T>(context.device_context(), in_p_tensor_transpose.data<T>(),
src_stride, in_p_tensor_transpose.dims(), dst_stride,
dst->data<T>() + offset);
offset += in_p_tensor_transpose.dims()[4] * src_stride[4];
}
}
template <typename DeviceContext, typename T>
class DetectionOutputKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
const framework::Tensor* in_priorbox =
context.Input<framework::Tensor>("PriorBox");
auto* out = context.Output<framework::Tensor>("Out");
int num_classes = context.template Attr<int>("num_classes");
int top_k = context.template Attr<int>("top_k");
int nms_top_k = context.template Attr<int>("nms_top_k");
int background_label_id = context.template Attr<int>("background_label_id");
float nms_threshold = context.template Attr<float>("nms_threshold");
float confidence_threshold =
context.template Attr<float>("confidence_threshold");
size_t batch_size = in_conf->dims()[1];
int conf_sum_size = in_conf->numel();
// for softmax
std::vector<int64_t> conf_shape_softmax_vec(
{conf_sum_size / num_classes, num_classes});
framework::DDim conf_shape_softmax(
framework::make_ddim(conf_shape_softmax_vec));
// for knchw => nhwc
std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
in_loc->dims()[4],
in_loc->dims()[2] * in_loc->dims()[0]});
std::vector<int64_t> conf_shape_vec(
{1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
in_conf->dims()[2] * in_conf->dims()[0]});
framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
framework::Tensor loc_tensor;
framework::Tensor conf_tensor;
loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
// for cpu
framework::Tensor loc_cpu;
framework::Tensor conf_cpu;
framework::Tensor priorbox_cpu;
const T* priorbox_data = in_priorbox->data<T>();
transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
conf_tensor.Resize(conf_shape_softmax);
math::SoftmaxFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &conf_tensor,
&conf_tensor);
T* loc_data = loc_tensor.data<T>();
T* conf_data = conf_tensor.data<T>();
if (platform::is_gpu_place(context.GetPlace())) {
loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
framework::TensorCopy(loc_tensor, platform::CPUPlace(),
context.device_context(), &loc_cpu);
loc_data = loc_cpu.data<T>();
conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
framework::TensorCopy(conf_tensor, platform::CPUPlace(),
context.device_context(), &conf_cpu);
conf_data = conf_cpu.data<T>();
priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
framework::TensorCopy(*in_priorbox, platform::CPUPlace(),
context.device_context(), &priorbox_cpu);
priorbox_data = priorbox_cpu.data<T>();
}
// get decode bboxes
size_t num_priors = in_priorbox->numel() / 8;
std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
for (size_t n = 0; n < batch_size; ++n) {
std::vector<operators::math::BBox<T>> decoded_bboxes;
for (size_t i = 0; i < num_priors; ++i) {
size_t prior_offset = i * 8;
size_t loc_pred_offset = n * num_priors * 4 + i * 4;
std::vector<math::BBox<T>> prior_bbox_vec;
math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
prior_bbox_vec);
std::vector<std::vector<T>> prior_bbox_var;
math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
prior_bbox_var);
std::vector<T> loc_pred_data;
for (size_t j = 0; j < 4; ++j)
loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
decoded_bboxes.push_back(bbox);
}
all_decoded_bboxes.push_back(decoded_bboxes);
}
std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
int num_kept = math::GetDetectionIndices<T>(
conf_data, num_priors, num_classes, background_label_id, batch_size,
confidence_threshold, nms_top_k, nms_threshold, top_k,
all_decoded_bboxes, &all_indices);
if (num_kept <= 0) {
std::vector<int64_t> out_shape_vec({0, 0});
framework::DDim out_shape(framework::make_ddim(out_shape_vec));
out->Resize(out_shape);
return;
}
std::vector<int64_t> out_shape_vec({num_kept, 7});
framework::DDim out_shape(framework::make_ddim(out_shape_vec));
out->mutable_data<T>(out_shape, context.GetPlace());
framework::Tensor out_cpu;
T* out_data = out->data<T>();
if (platform::is_gpu_place(context.GetPlace())) {
out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
out_data = out_cpu.data<T>();
}
math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
batch_size, all_indices, all_decoded_bboxes,
out_data);
if (platform::is_gpu_place(context.GetPlace())) {
framework::TensorCopy(out_cpu, platform::CUDAPlace(),
context.device_context(), out);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -29,8 +29,11 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker { ...@@ -29,8 +29,11 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp,
elementwise_add_grad, ops::ElementwiseOpGrad); ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_add, elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>, ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -41,6 +41,16 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -41,6 +41,16 @@ class ElementwiseOp : public framework::OperatorWithKernel {
} }
}; };
class ElementwiseOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto x_var = op_desc.Input("X")[0];
auto out_var = op_desc.Output("Out")[0];
block->Var(out_var)->SetType(block->Var(x_var)->GetType());
}
};
class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
...@@ -128,8 +128,8 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -128,8 +128,8 @@ class ListenAndServOp : public framework::OperatorBase {
} }
} }
if (exit_flag) { if (exit_flag) {
rpc_service_->ShutDown();
rpc_service_->SetCond(1); rpc_service_->SetCond(1);
rpc_service_->ShutDown();
break; break;
} }
try { try {
...@@ -148,7 +148,7 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -148,7 +148,7 @@ class ListenAndServOp : public framework::OperatorBase {
} }
rpc_service_->SetCond(1); rpc_service_->SetCond(1);
// FIXME(typhoonzero): use another condition to sync wait clients get. // FIXME(typhoonzero): use another condition to sync wait clients get.
rpc_service_->WaitClientGet(ins.size()); rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear(); sparse_vars.clear();
} // while(true) } // while(true)
} }
......
...@@ -33,8 +33,16 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -33,8 +33,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W"); auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids"); auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2); auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_dims[1], 1); // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W
// and it must be a column vector with rank = 2 while the 2nd dimension
// size must be 1; when Ids's type is SelectedRows, the rows of Ids
// contain the ids to be looked up in W.
if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
ctx->ShareLoD("Ids", /*->*/ "Out"); ctx->ShareLoD("Ids", /*->*/ "Out");
...@@ -54,17 +62,22 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -54,17 +62,22 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("W", AddInput("W",
"An input represents embedding tensors, " "(Tensor) The input represents embedding tensors, "
"which is a learnable parameter."); "which is a learnable parameter.");
AddInput("Ids", AddInput(
"An input with type int32 or int64 " "Ids",
"contains the ids to be looked up in W. " "(Tensor or SelectedRows) Ids's type can be Tensor or "
"Ids must be a column vector with rank = 2. " "SelectedRows, when Ids's type is Tensor, this tensor contains "
"The 2nd dimension size must be 1."); "the ids to be looked up in W and it must be a column vector with "
AddOutput("Out", "The lookup results, which have the same type as W."); "rank = 2 while the 2nd dimension size must be 1; when Ids's type is "
"SelectedRows, the rows of Ids contains the ids to be looked up "
"in W.");
AddOutput("Out",
"(Tensor or SelectedRows) The lookup results, which have the "
"same type as W.");
AddAttr<bool>("is_sparse", AddAttr<bool>("is_sparse",
"(boolean, default false) " "(boolean, default false) "
"Sparse update") "Sparse update.")
.SetDefault(false); .SetDefault(false);
AddAttr<int64_t>("padding_idx", AddAttr<int64_t>("padding_idx",
"(int64, default -1) " "(int64, default -1) "
...@@ -76,10 +89,15 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -76,10 +89,15 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
Lookup Table Operator. Lookup Table Operator.
This operator is used to perform lookups on the parameter W, This operator is used to perform lookups on the parameter W,
then concatenated into a dense tensor. then concatenated into a dense or sparse tensor.
The type of Ids(Input) is SelectedRows, Tensor or LoDTensor. When Ids's
type is SelectedRows, the rows of Ids contain the ids to be looked up in W;
when Ids's type is Tensor, this tensor contains the ids to be looked up in W
and it must be a column vector with rank = 2 whose 2nd dimension size is 1.
In that case Ids can carry the LoD (Level of Details) information, or not, and
the output only shares the LoD information with the input Ids.
The input Ids can carry the LoD (Level of Details) information,
or not. And the output only shares the LoD information with input Ids.
)DOC"); )DOC");
} }
......
...@@ -74,14 +74,32 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> { ...@@ -74,14 +74,32 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* table_t = context.Input<LoDTensor>("W"); auto* table_t = context.Input<LoDTensor>("W");
auto* ids_t = context.Input<LoDTensor>("Ids");
auto* output_t = context.Output<LoDTensor>("Out");
int64_t padding_idx = context.Attr<int64_t>("padding_idx"); int64_t padding_idx = context.Attr<int64_t>("padding_idx");
auto* ids_var = context.InputVar("Ids");
Tensor* output_t = context.Output<Tensor>("Out");
int64_t* ids;
int64_t K;
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W;
// when Ids's type is SelectedRows, the rows of Ids contain the
// ids to be looked up in W.
if (ids_var->IsType<framework::LoDTensor>()) {
auto* ids_t = context.Input<LoDTensor>("Ids");
ids = const_cast<int64_t*>(ids_t->data<int64_t>());
K = ids_t->numel();
} else if (ids_var->IsType<framework::SelectedRows>()) {
auto* ids_t = context.Input<framework::SelectedRows>("Ids");
ids = const_cast<int64_t*>(ids_t->rows().CUDAData(context.GetPlace()));
K = ids_t->rows().size();
output_t->Resize({K, table_t->dims()[1]});
} else {
PADDLE_THROW("Unsupported Variable Type of Ids");
}
size_t N = table_t->dims()[0]; size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1]; size_t D = table_t->dims()[1];
size_t K = ids_t->numel();
auto* ids = ids_t->data<int64_t>();
auto* table = table_t->data<T>(); auto* table = table_t->data<T>();
auto* output = output_t->mutable_data<T>(context.GetPlace()); auto* output = output_t->mutable_data<T>(context.GetPlace());
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows; using SelectedRows = framework::SelectedRows;
...@@ -29,25 +30,45 @@ template <typename T> ...@@ -29,25 +30,45 @@ template <typename T>
class LookupTableKernel : public framework::OpKernel<T> { class LookupTableKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* table_t = context.Input<LoDTensor>("W"); // float tensor auto* table_t = context.Input<LoDTensor>("W");
auto* ids_t = context.Input<LoDTensor>("Ids"); // int tensor auto* ids_var = context.InputVar("Ids");
auto* output_t = context.Output<LoDTensor>("Out"); // float tensor Tensor* output_t = context.Output<Tensor>("Out");
int64_t* ids;
int64_t ids_numel;
// The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
// is LoDTensor, this tensor contains the ids to be looked up in W;
// when Ids's type is SelectedRows, the rows of Ids contain the
// ids to be looked up in W.
if (ids_var->IsType<LoDTensor>()) {
auto* ids_t = context.Input<LoDTensor>("Ids");
ids = const_cast<int64_t*>(ids_t->data<int64_t>());
ids_numel = ids_t->numel();
} else if (ids_var->IsType<SelectedRows>()) {
auto* ids_t = context.Input<SelectedRows>("Ids");
ids = const_cast<int64_t*>(ids_t->rows().data());
ids_numel = ids_t->rows().size();
output_t->Resize({ids_numel, table_t->dims()[1]});
} else {
PADDLE_THROW("Unsupported Variable Type of Ids");
}
int64_t padding_idx = context.Attr<int64_t>("padding_idx"); int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int N = table_t->dims()[0]; int N = table_t->dims()[0];
int D = table_t->dims()[1]; int D = table_t->dims()[1];
auto* ids = ids_t->data<int64_t>();
auto* table = table_t->data<T>(); auto* table = table_t->data<T>();
auto* output = output_t->mutable_data<T>(context.GetPlace()); auto* output = output_t->mutable_data<T>(context.GetPlace());
if (padding_idx == -1) { if (padding_idx == -1) {
for (int64_t i = 0; i < ids_t->numel(); ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids[i], N); PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0);
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
} }
} else { } else {
for (int64_t i = 0; i < ids_t->numel(); ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
if (ids[i] == padding_idx) { if (ids[i] == padding_idx) {
memset(output + i * D, 0, D * sizeof(T)); memset(output + i * D, 0, D * sizeof(T));
} else { } else {
......
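The CPU kernel above copies one row of W per id and zero-fills rows whose id equals padding_idx. A self-contained sketch of that lookup with concrete numbers (plain vectors instead of LoDTensor/SelectedRows):
#include <cstring>
#include <iostream>
#include <vector>

// W is an N x D embedding table; ids selects rows; rows whose id equals
// padding_idx are zero-filled. Names mirror the kernel but this is a sketch.
int main() {
  const int D = 3;
  std::vector<float> W = {0, 1, 2, 10, 11, 12, 20, 21, 22, 30, 31, 32};  // N = 4
  std::vector<long long> ids = {2, 0, 3};
  const long long padding_idx = 0;  // pretend id 0 is the padding token

  std::vector<float> out(ids.size() * D);
  for (size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == padding_idx) {
      std::memset(&out[i * D], 0, D * sizeof(float));
    } else {
      std::memcpy(&out[i * D], &W[ids[i] * D], D * sizeof(float));
    }
  }
  // Rows: {20, 21, 22}, {0, 0, 0}, {30, 31, 32}
  for (float v : out) std::cout << v << " ";
  std::cout << "\n";
  return 0;
}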
...@@ -36,7 +36,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -36,7 +36,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
auto e_x = framework::EigenTensor<T, 4>::From(input); auto e_x = framework::EigenTensor<T, 4>::From(input);
for (int m = 0; m < N; m++) { for (int m = 0; m < N; m++) {
for (int i = 0; i < C; i++) { for (int i = 0; i < C; i++) {
for (int c = start; c <= end; c++) { for (int c = start; c < end; c++) {
int ch = i + c; int ch = i + c;
if (ch >= 0 && ch < C) { if (ch >= 0 && ch < C) {
auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}), auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
...@@ -92,7 +92,7 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> { ...@@ -92,7 +92,7 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
Eigen::array<int, 4>({{1, 1, H, W}})); Eigen::array<int, 4>({{1, 1, H, W}}));
i_x_g = i_mid.pow(-beta) * i_out_g; i_x_g = i_mid.pow(-beta) * i_out_g;
for (int c = start; c <= end; c++) { for (int c = start; c < end; c++) {
int ch = i + c; int ch = i + c;
if (ch < 0 || ch >= C) { if (ch < 0 || ch >= C) {
continue; continue;
......
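The loop-bound change (c < end instead of c <= end) matters because the LRN window is half-open. Assuming the kernel computes start = -(n - 1) / 2 and end = start + n as elsewhere in lrn_op, the channels summed for channel i are [i + start, i + end), i.e. at most n channels; the sketch below prints the window for each channel:
#include <iostream>

// For window size n, start = -(n - 1) / 2 and end = start + n, so the
// channels summed for channel i are the half-open range [i + start, i + end).
int main() {
  const int C = 6, n = 5;
  const int start = -(n - 1) / 2;  // -2
  const int end = start + n;       // 3 (exclusive)
  for (int i = 0; i < C; ++i) {
    std::cout << "channel " << i << " sums channels:";
    for (int c = start; c < end; ++c) {  // `< end`, not `<= end`
      int ch = i + c;
      if (ch >= 0 && ch < C) std::cout << " " << ch;
    }
    std::cout << "\n";
  }
  return 0;
}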
...@@ -38,7 +38,7 @@ math_library(lstm_compute DEPS activation_functions) ...@@ -38,7 +38,7 @@ math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS cblas) math_library(math_function DEPS cblas)
math_library(maxouting) math_library(maxouting)
math_library(pooling) math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows) math_library(selected_rows_functor DEPS selected_rows math_function)
math_library(sequence2batch) math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function) math_library(sequence_pooling DEPS math_function)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct BBox {
BBox(T x_min, T y_min, T x_max, T y_max)
: x_min(x_min),
y_min(y_min),
x_max(x_max),
y_max(y_max),
is_difficult(false) {}
BBox() {}
T get_width() const { return x_max - x_min; }
T get_height() const { return y_max - y_min; }
T get_center_x() const { return (x_min + x_max) / 2; }
T get_center_y() const { return (y_min + y_max) / 2; }
T get_area() const { return get_width() * get_height(); }
// coordinate of bounding box
T x_min;
T y_min;
T x_max;
T y_max;
// whether difficult object (e.g. object with heavy occlusion is difficult)
bool is_difficult;
};
// KNCHW ==> NHWC
// template <typename T>
template <typename T>
void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
std::vector<BBox<T>>& bbox_vec);
template <typename T>
void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
std::vector<std::vector<T>>& var_vec);
template <typename T>
BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
const std::vector<T>& prior_bbox_var,
const std::vector<T>& loc_pred_data);
template <typename T1, typename T2>
bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
const std::pair<T1, T2>& pair2);
template <typename T>
bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
const std::pair<T, BBox<T>>& pair2);
template <typename T>
T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
template <typename T>
void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
size_t class_idx, size_t top_k, T conf_threshold,
T nms_threshold, size_t num_priors, size_t num_classes,
std::vector<size_t>* indices);
template <typename T>
int GetDetectionIndices(
const T* conf_data, const size_t num_priors, const size_t num_classes,
const size_t background_label_id, const size_t batch_size,
const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
const size_t top_k,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
template <typename T>
BBox<T> ClipBBox(const BBox<T>& bbox);
template <typename T>
void GetDetectionOutput(
const T* conf_data, const size_t num_kept, const size_t num_priors,
const size_t num_classes, const size_t batch_size,
const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
template <typename T>
void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
std::vector<BBox<T>>& bbox_vec) {
size_t out_offset = bbox_vec.size();
bbox_vec.resize(bbox_vec.size() + num_bboxes);
for (size_t i = 0; i < num_bboxes; ++i) {
BBox<T> bbox;
bbox.x_min = *(prior_data + i * 8);
bbox.y_min = *(prior_data + i * 8 + 1);
bbox.x_max = *(prior_data + i * 8 + 2);
bbox.y_max = *(prior_data + i * 8 + 3);
bbox_vec[out_offset + i] = bbox;
}
}
template <typename T>
void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
std::vector<std::vector<T>>& var_vec) {
size_t out_offset = var_vec.size();
var_vec.resize(var_vec.size() + num);
for (size_t i = 0; i < num; ++i) {
std::vector<T> var;
var.push_back(*(prior_data + i * 8 + 4));
var.push_back(*(prior_data + i * 8 + 5));
var.push_back(*(prior_data + i * 8 + 6));
var.push_back(*(prior_data + i * 8 + 7));
var_vec[out_offset + i] = var;
}
}
template <typename T>
BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
const std::vector<T>& prior_bbox_var,
const std::vector<T>& loc_pred_data) {
T prior_bbox_width = prior_bbox.get_width();
T prior_bbox_height = prior_bbox.get_height();
T prior_bbox_center_x = prior_bbox.get_center_x();
T prior_bbox_center_y = prior_bbox.get_center_y();
T decoded_bbox_center_x =
prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width +
prior_bbox_center_x;
T decoded_bbox_center_y =
prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height +
prior_bbox_center_y;
T decoded_bbox_width =
std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width;
T decoded_bbox_height =
std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height;
BBox<T> decoded_bbox;
decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2;
decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2;
decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2;
decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2;
return decoded_bbox;
}
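As a quick sanity check of the decoding formulas in DecodeBBoxWithVar, here is a standalone computation with made-up numbers (plain doubles, no Paddle types):
#include <cmath>
#include <iostream>

// Prior box (0.2, 0.2, 0.6, 0.6), variances (0.1, 0.1, 0.2, 0.2),
// predicted offsets (1.0, 0.5, 0.0, 0.0).
int main() {
  double px_min = 0.2, py_min = 0.2, px_max = 0.6, py_max = 0.6;
  double var[4] = {0.1, 0.1, 0.2, 0.2};
  double loc[4] = {1.0, 0.5, 0.0, 0.0};

  double pw = px_max - px_min, ph = py_max - py_min;                // 0.4, 0.4
  double pcx = (px_min + px_max) / 2, pcy = (py_min + py_max) / 2;  // 0.4, 0.4

  double cx = var[0] * loc[0] * pw + pcx;     // 0.44
  double cy = var[1] * loc[1] * ph + pcy;     // 0.42
  double w = std::exp(var[2] * loc[2]) * pw;  // 0.4
  double h = std::exp(var[3] * loc[3]) * ph;  // 0.4

  std::cout << cx - w / 2 << " " << cy - h / 2 << " "
            << cx + w / 2 << " " << cy + h / 2 << "\n";  // 0.24 0.22 0.64 0.62
  return 0;
}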
template <typename T1, typename T2>
bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
const std::pair<T1, T2>& pair2) {
return pair1.first > pair2.first;
}
template <typename T>
T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
return 0.0;
} else {
T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
T inter_x_max = std::min(bbox1.x_max, bbox2.x_max);
T inter_y_max = std::min(bbox1.y_max, bbox2.y_max);
T inter_width = inter_x_max - inter_x_min;
T inter_height = inter_y_max - inter_y_min;
T inter_area = inter_width * inter_height;
T bbox_area1 = bbox1.get_area();
T bbox_area2 = bbox2.get_area();
return inter_area / (bbox_area1 + bbox_area2 - inter_area);
}
}
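And a standalone IoU computation matching jaccard_overlap above (the std::max(0.0, ...) clamp plays the role of the early non-overlap return):
#include <algorithm>
#include <iostream>

// Box A = (0, 0, 2, 2), Box B = (1, 1, 3, 3): intersection 1x1 = 1,
// union 4 + 4 - 1 = 7, so IoU = 1/7.
int main() {
  double ax0 = 0, ay0 = 0, ax1 = 2, ay1 = 2;
  double bx0 = 1, by0 = 1, bx1 = 3, by1 = 3;
  double ix0 = std::max(ax0, bx0), iy0 = std::max(ay0, by0);
  double ix1 = std::min(ax1, bx1), iy1 = std::min(ay1, by1);
  double inter = std::max(0.0, ix1 - ix0) * std::max(0.0, iy1 - iy0);
  double a_area = (ax1 - ax0) * (ay1 - ay0);
  double b_area = (bx1 - bx0) * (by1 - by0);
  std::cout << inter / (a_area + b_area - inter) << "\n";  // ~0.142857
  return 0;
}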
template <typename T>
void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
size_t class_idx, size_t top_k, T conf_threshold,
T nms_threshold, size_t num_priors, size_t num_classes,
std::vector<size_t>* indices) {
std::vector<std::pair<T, size_t>> scores;
for (size_t i = 0; i < num_priors; ++i) {
size_t conf_offset = i * num_classes + class_idx;
if (conf_score_data[conf_offset] > conf_threshold)
scores.push_back(std::make_pair(conf_score_data[conf_offset], i));
}
std::stable_sort(scores.begin(), scores.end(),
SortScorePairDescend<T, size_t>);
if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
while (scores.size() > 0) {
const size_t idx = scores.front().second;
bool keep = true;
for (size_t i = 0; i < indices->size(); ++i) {
if (keep) {
const size_t saved_idx = (*indices)[i];
T overlap = jaccard_overlap<T>(bboxes[idx], bboxes[saved_idx]);
keep = overlap <= nms_threshold;
} else {
break;
}
}
if (keep) indices->push_back(idx);
scores.erase(scores.begin());
}
}
template <typename T>
int GetDetectionIndices(
const T* conf_data, const size_t num_priors, const size_t num_classes,
const size_t background_label_id, const size_t batch_size,
const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
const size_t top_k,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
int total_keep_num = 0;
for (size_t n = 0; n < batch_size; ++n) {
const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
size_t num_detected = 0;
std::map<size_t, std::vector<size_t>> indices;
size_t conf_offset = n * num_priors * num_classes;
for (size_t c = 0; c < num_classes; ++c) {
if (c == background_label_id) continue;
ApplyNmsFast<T>(decoded_bboxes, conf_data + conf_offset, c, nms_top_k,
conf_threshold, nms_threshold, num_priors, num_classes,
&(indices[c]));
num_detected += indices[c].size();
}
if (top_k > 0 && num_detected > top_k) {
// std::vector<pair<T,T>> score_index_pairs;
std::vector<std::pair<T, std::pair<size_t, size_t>>> score_index_pairs;
for (size_t c = 0; c < num_classes; ++c) {
const std::vector<size_t>& label_indices = indices[c];
for (size_t i = 0; i < label_indices.size(); ++i) {
size_t idx = label_indices[i];
score_index_pairs.push_back(
std::make_pair((conf_data + conf_offset)[idx * num_classes + c],
std::make_pair(c, idx)));
}
}
std::sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<T, std::pair<size_t, size_t>>);
score_index_pairs.resize(top_k);
std::map<size_t, std::vector<size_t>> new_indices;
for (size_t i = 0; i < score_index_pairs.size(); ++i) {
size_t label = score_index_pairs[i].second.first;
size_t idx = score_index_pairs[i].second.second;
new_indices[label].push_back(idx);
}
all_detection_indices->push_back(new_indices);
total_keep_num += top_k;
} else {
all_detection_indices->push_back(indices);
total_keep_num += num_detected;
}
}
return total_keep_num;
}
template <typename T>
BBox<T> ClipBBox(const BBox<T>& bbox) {
T one = static_cast<T>(1.0);
T zero = static_cast<T>(0.0);
BBox<T> clipped_bbox;
clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero);
clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero);
clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero);
clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero);
return clipped_bbox;
}
template <typename T>
void GetDetectionOutput(
const T* conf_data, const size_t num_kept, const size_t num_priors,
const size_t num_classes, const size_t batch_size,
const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
size_t count = 0;
for (size_t n = 0; n < batch_size; ++n) {
for (std::map<size_t, std::vector<size_t>>::const_iterator it =
all_indices[n].begin();
it != all_indices[n].end(); ++it) {
size_t label = it->first;
const std::vector<size_t>& indices = it->second;
const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
for (size_t i = 0; i < indices.size(); ++i) {
size_t idx = indices[i];
size_t conf_offset = n * num_priors * num_classes + idx * num_classes;
out_data[count * 7] = n;
out_data[count * 7 + 1] = label;
out_data[count * 7 + 2] = (conf_data + conf_offset)[label];
BBox<T> clipped_bbox = ClipBBox<T>(decoded_bboxes[idx]);
out_data[count * 7 + 3] = clipped_bbox.x_min;
out_data[count * 7 + 4] = clipped_bbox.y_min;
out_data[count * 7 + 5] = clipped_bbox.x_max;
out_data[count * 7 + 6] = clipped_bbox.y_max;
++count;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -45,6 +45,9 @@ void gemm<platform::CUDADeviceContext, float16>(
const half* h_B = reinterpret_cast<const half*>(B); const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C); half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm( PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N)); h_A, lda, &h_beta, h_C, N));
...@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -106,6 +109,9 @@ void gemm<platform::CUDADeviceContext, float16>(
const half* h_B = reinterpret_cast<const half*>(B); const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C); half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm( PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, ldc)); h_A, lda, &h_beta, h_C, ldc));
...@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>( ...@@ -251,6 +257,9 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
const half* h_B = reinterpret_cast<const half*>(B); const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C); half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
......
...@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) { ...@@ -72,6 +72,11 @@ TEST(math_function, notrans_mul_trans_fp16) {
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
...@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) { ...@@ -149,6 +154,11 @@ TEST(math_function, trans_mul_notrans_fp16) {
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place); float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
...@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) { ...@@ -248,6 +258,11 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
...@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) { ...@@ -355,6 +370,11 @@ TEST(math_function, gemm_trans_cublas_fp16) {
CUDAPlace gpu_place(0); CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place); CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
......
...@@ -14,13 +14,86 @@ limitations under the License. */ ...@@ -14,13 +14,86 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
void SoftmaxCUDNNFunctor<T>::operator()(
const platform::CUDADeviceContext& context, const framework::Tensor* X,
framework::Tensor* Y) {
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor xDesc;
ScopedTensorDescriptor yDesc;
std::vector<int> cudnn_tensor_dims = framework::vectorize2int(X->dims());
DataLayout layout = DataLayout::kNCHW;
if (cudnn_tensor_dims.size() == 5) {
layout = DataLayout::kNCDHW;
}
// NOTE(*): cudnn softmax only supports tensors with rank >= 4,
// so fill 1 at the unused dims
if (cudnn_tensor_dims.size() <= 2) {
cudnn_tensor_dims.resize(4, 1);
}
cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace())));
}
template <typename T>
void SoftmaxGradCUDNNFunctor<T>::operator()(
const platform::CUDADeviceContext& context, const framework::Tensor* Y,
const framework::Tensor* YGrad, framework::Tensor* XGrad) {
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor yDesc;
ScopedTensorDescriptor dyDesc;
ScopedTensorDescriptor dxDesc;
std::vector<int> cudnn_tensor_dims = framework::vectorize2int(Y->dims());
DataLayout layout = DataLayout::kNCHW;
if (cudnn_tensor_dims.size() == 5) {
layout = DataLayout::kNCDHW;
}
// NOTE(*): cuDNN softmax only supports tensors of rank >= 4,
// so fill 1 at the unused dims
if (cudnn_tensor_dims.size() <= 2) {
cudnn_tensor_dims.resize(4, 1);
}
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_xgrad_desc =
dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace())));
}
template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>; template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>; template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
......
...@@ -33,6 +33,23 @@ class SoftmaxGradFunctor { ...@@ -33,6 +33,23 @@ class SoftmaxGradFunctor {
const framework::Tensor* y_grad, framework::Tensor* x_grad); const framework::Tensor* y_grad, framework::Tensor* x_grad);
}; };
#ifdef PADDLE_WITH_CUDA
template <typename T>
class SoftmaxCUDNNFunctor {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor* X, framework::Tensor* Y);
};
template <typename T>
class SoftmaxGradCUDNNFunctor {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor* Y, const framework::Tensor* y_grad,
framework::Tensor* x_grad);
};
#endif
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -17,11 +17,14 @@ limitations under the License. */ ...@@ -17,11 +17,14 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::OpKernelType;
using framework::Tensor; using framework::Tensor;
class MulOpShapeInference : public framework::InferShapeBase { class MulOp : public framework::OperatorWithKernel {
public: public:
void operator()(framework::InferShapeContext* ctx) const override { using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$. ...@@ -122,7 +125,7 @@ or not. But the output only shares the LoD information with input $X$.
} }
}; };
class MulOpGrad : public framework::OperatorWithKernel { class MulGradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel { ...@@ -156,10 +159,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulGradOp);
ops::MulOpShapeInference,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>); mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
......
...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/mul_op.h" #include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( namespace plat = paddle::platform;
mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>); REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
REGISTER_OP_CUDA_KERNEL( ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>); REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>);
...@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> { ...@@ -48,7 +48,7 @@ class MulKernel : public framework::OpKernel<T> {
} }
math::matmul<DeviceContext, T>( math::matmul<DeviceContext, T>(
context.template device_context<DeviceContext>(), x_matrix, false, context.template device_context<DeviceContext>(), x_matrix, false,
y_matrix, false, 1, z, 0); y_matrix, false, static_cast<T>(1), z, static_cast<T>(0));
if (z_dim.size() != 2) { if (z_dim.size() != 2) {
z->Resize(z_dim); z->Resize(z_dim);
} }
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -105,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { ...@@ -105,19 +104,38 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
" Input(Communicator) of AllReduce op input should not be NULL"); " Input(Communicator) of AllReduce op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of AllReduce op output should not be NULL"); " Output(Out) of AllReduce op output should not be NULL");
auto x_dims = ctx->GetInputsDim("X");
std::string reduction = ctx->Attrs().Get<std::string>("reduction"); std::string reduction = ctx->Attrs().Get<std::string>("reduction");
PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" || PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
reduction == "ncclMin" || reduction == "ncclMax"), reduction == "ncclMin" || reduction == "ncclMax"),
"invalid reduction."); "invalid reduction.");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims); ctx->SetOutputsDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
// AllReduceOp
class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of AllReduce op");
AddInput("Communicator", "Communicator for communicating between gpus");
AddOutput("Out", "The output of AllReduce op");
AddAttr<std::string>("reduction",
"(string, default 'ncclSum') "
"{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
.SetDefault("ncclSum");
AddComment(R"DOC(
NCCLAllReduce Operator.
AllReduce the input tensors.
)DOC");
}
};
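The reduction attribute above selects the element-wise operation applied across GPUs; with the default ncclSum every participant ends up holding the element-wise sum of all inputs. A plain-C++ illustration of that semantics (no NCCL involved), using the per-GPU values GetGPUData(0) = 42 and GetGPUData(1) = 43 from the test further below:

#include <cassert>
#include <vector>

std::vector<float> AllReduceSum(const std::vector<std::vector<float>>& per_gpu) {
  std::vector<float> result(per_gpu.front().size(), 0.f);
  for (const auto& buf : per_gpu) {
    for (size_t i = 0; i < buf.size(); ++i) result[i] += buf[i];
  }
  return result;  // every participant would receive this same buffer
}

int main() {
  auto out = AllReduceSum({{42.f, 42.f}, {43.f, 43.f}});
  assert(out[0] == 85.f && out[1] == 85.f);
  return 0;
}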
// ReduceOp // ReduceOp
class NCCLReduceOp : public framework::OperatorWithKernel { class NCCLReduceOp : public framework::OperatorWithKernel {
public: public:
...@@ -144,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel { ...@@ -144,50 +162,6 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
} }
}; };
// BcastOp
class NCCLBcastOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
" Input(X) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasInput("Communicator"),
" Input(Communicator) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of Bcast op output should not be NULL");
int root = ctx->Attrs().Get<int>("root");
PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
// AllreduceOp
class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of AllReduce op");
AddInput("Communicator", "Communicator for communicating between gpus");
AddOutput("Out", "The output of AllReduce op");
AddAttr<std::string>("reduction",
"(string, default 'ncclSum') "
"{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
.SetDefault("ncclSum");
AddComment(R"DOC(
NCCLAllReduce Operator.
AllReduce the input tensors.
)DOC");
}
};
// ReduceOp // ReduceOp
class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
...@@ -214,6 +188,29 @@ Reduce the tensors. ...@@ -214,6 +188,29 @@ Reduce the tensors.
} }
}; };
// BcastOp
class NCCLBcastOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
" Input(X) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasInput("Communicator"),
" Input(Communicator) of Bcast op input should not be NULL");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
" Output(Out) of Bcast op output should not be NULL");
int root = ctx->Attrs().Get<int>("root");
PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->ShareLoD("X", /*->*/ "Out");
}
};
// BcastOp // BcastOp
class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
......
...@@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> { ...@@ -43,13 +43,12 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device."); "This kernel only runs on GPU device.");
auto* x = ctx.Input<LoDTensor>("X");
auto ins = ctx.MultiInput<LoDTensor>("X"); auto* out = ctx.Output<LoDTensor>("Out");
auto outs = ctx.MultiOutput<LoDTensor>("Out"); auto* comm = ctx.Input<Communicator>("Communicator");
std::string reduction = ctx.Attr<std::string>("reduction"); std::string reduction = ctx.Attr<std::string>("reduction");
ncclRedOp_t reduction_op_ = ncclSum;
ncclRedOp_t reduction_op_ = ncclSum;
if (reduction == "ncclMin") { if (reduction == "ncclMin") {
reduction_op_ = ncclMin; reduction_op_ = ncclMin;
} else if (reduction == "ncclMax") { } else if (reduction == "ncclMax") {
...@@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> { ...@@ -61,30 +60,19 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
} else { } else {
PADDLE_THROW("Invalid reduction. default ncclSum."); PADDLE_THROW("Invalid reduction. default ncclSum.");
} }
auto* comm = ctx.Input<Communicator>("Communicator");
auto stream = ctx.cuda_device_context().stream();
// device id // device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
VLOG(3) << "gpu : "
for (size_t i = 0; i < ins.size(); ++i) { << " invoke allreduce. send " << x->numel() << " recv "
VLOG(1) << "gpu : " << out->numel();
<< " invoke allreduce. send " << ins[i]->numel() << " recv " PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
<< outs[i]->numel(); x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( ctx.cuda_device_context().stream()));
ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()), VLOG(3) << "gpu : "
outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_, << " finished allreduce. send " << x->numel() << " recv "
comm->comms().at(idx), stream)); << out->numel();
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
VLOG(1) << "gpu : "
<< " finished allreduce. send " << ins[i]->numel() << " recv "
<< outs[i]->numel();
}
} }
}; };
...@@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel<T> { ...@@ -94,13 +82,13 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device."); "This kernel only runs on GPU device.");
auto x = ctx.Input<LoDTensor>("X"); // x0, x1, x2
auto ins = ctx.MultiInput<LoDTensor>("X"); // x0, x1, x2 auto out = ctx.Output<LoDTensor>("Out");
auto outs = ctx.MultiOutput<LoDTensor>("Out"); auto* comm = ctx.Input<Communicator>("Communicator");
int root = ctx.Attr<int>("root");
std::string reduction = ctx.Attr<std::string>("reduction"); std::string reduction = ctx.Attr<std::string>("reduction");
ncclRedOp_t reduction_op_ = ncclSum;
ncclRedOp_t reduction_op_ = ncclSum;
if (reduction == "ncclMin") { if (reduction == "ncclMin") {
reduction_op_ = ncclMin; reduction_op_ = ncclMin;
} else if (reduction == "ncclMax") { } else if (reduction == "ncclMax") {
...@@ -112,40 +100,23 @@ class NCCLReduceKernel : public framework::OpKernel<T> { ...@@ -112,40 +100,23 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
} else { } else {
PADDLE_THROW("Invalid reduction. default ncclSum."); PADDLE_THROW("Invalid reduction. default ncclSum.");
} }
int root = ctx.Attr<int>("root");
auto* comm = ctx.Input<Communicator>("Communicator");
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream();
// device id // device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
T* recvbuffer = nullptr;
auto ins_names = ctx.Inputs("X"); if (root == gpu_id) {
std::hash<std::string> hasher; recvbuffer = out->mutable_data<T>(ctx.GetPlace());
for (size_t i = 0; i < ins.size(); ++i) { } else {
if (root == platform::kInvalidGPUId) { out->Resize(framework::make_ddim({0}));
root = hasher(ins_names[i]) % comm->comms().size();
}
T* recvbuffer = nullptr;
if (root == gpu_id) {
recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
}
VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
<< ins[i]->numel() << " recv " << outs[i]->numel();
PADDLE_ENFORCE(platform::dynload::ncclReduce(
ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms().at(idx),
stream));
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
<< ins[i]->numel() << " recv " << outs[i]->numel();
} }
VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
<< " recv " << out->numel();
PADDLE_ENFORCE(platform::dynload::ncclReduce(
x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
reduction_op_, root, comm->comms().at(idx),
ctx.cuda_device_context().stream()));
VLOG(3) << "gpu : " << gpu_id << " finished reduce. send " << x->numel()
<< " recv " << out->numel();
} }
}; };
...@@ -155,47 +126,27 @@ class NCCLBcastKernel : public framework::OpKernel<T> { ...@@ -155,47 +126,27 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device."); "This kernel only runs on GPU device.");
int root = ctx.Attr<int>("root"); int root = ctx.Attr<int>("root");
auto* comm = ctx.Input<Communicator>("Communicator"); auto* comm = ctx.Input<Communicator>("Communicator");
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream();
// device id // device id
int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId(); int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
int idx = comm->GetCommId(gpu_id); int idx = comm->GetCommId(gpu_id);
if (idx == root) { if (idx == root) {
auto ins = ctx.MultiInput<LoDTensor>("X"); auto* x = ctx.Input<LoDTensor>("X");
for (size_t i = 0; i < ins.size(); ++i) { VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send " PADDLE_ENFORCE(platform::dynload::ncclBcast(
<< ins[i]->numel(); (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
comm->comms().at(idx), ctx.cuda_device_context().stream()));
VLOG(1) << " before ncclBcast"; VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
PADDLE_ENFORCE(platform::dynload::ncclBcast(
(void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
root, comm->comms().at(idx), stream));
VLOG(1) << " after ncclBcast";
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
}
} else { } else {
auto outs = ctx.MultiOutput<LoDTensor>("Out"); auto* out = ctx.Output<LoDTensor>("Out");
for (size_t i = 0; i < outs.size(); ++i) { VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer " << framework::product(out->dims());
<< framework::product(outs[i]->dims()); PADDLE_ENFORCE(platform::dynload::ncclBcast(
out->mutable_data<T>(ctx.GetPlace()), out->numel(),
PADDLE_ENFORCE(platform::dynload::ncclBcast( NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(), ctx.cuda_device_context().stream()));
NCCLTypeWrapper<T>::type, root, comm->comms().at(idx), stream)); VLOG(3) << "gpu : " << gpu_id << " finished Bcast. recv " << out->numel();
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
<< outs[i]->numel();
}
} }
} }
}; };
......
...@@ -14,19 +14,15 @@ limitations under the License. */ ...@@ -14,19 +14,15 @@ limitations under the License. */
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm>
#include <memory> #include <memory>
#include <mutex> #include <mutex>
#include <thread> #include <thread>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h" #include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast); ...@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework; namespace f = paddle::framework;
namespace p = paddle::platform; namespace p = paddle::platform;
static std::vector<int> gpu_list;
// test data amount // test data amount
const f::DDim kDims = {100, 100}; const f::DDim kDims = {20, 20};
// nccl op common tester, init communicator. // nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test { class NCCLTester : public ::testing::Test {
public: public:
virtual void SetUp() override { virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(i);
}
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i); p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
} }
NCCLInitOp(); NCCLInitOp();
} }
virtual void TearDown() override { virtual void TearDown() override {
for (auto &device_context : dev_ctxs) { for (auto &device_context : dev_ctxs_) {
delete device_context; delete device_context;
} }
} }
...@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test { ...@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc); std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
op1->SetType("ncclInit"); op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"}); op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});
auto *var = g_scope.Var("comm"); auto *var = g_scope_.Var("comm");
var->GetMutable<p::Communicator>(); var->GetMutable<p::Communicator>();
auto *scope_var = g_scope_.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list_.size());
auto op = f::OpRegistry::CreateOp(*op1); auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp."; VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place); op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished."; VLOG(1) << "NCCLInitOp finished.";
} }
int GetGPUData(int gpu_id) { return gpu_id + 42; }
template <class T> template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu); std::unique_lock<std::mutex> lk(mu_);
const f::OpDesc *op1 = &op_desc; const f::OpDesc *op1 = &op_desc;
p::CUDAPlace place(gpu_id); p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id); auto &ctx = dev_ctxs_.at(gpu_id);
auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>(); auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>(); auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
if (!send_tensor->numel()) { if (!send_tensor->numel()) {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place); send_tensor->mutable_data<T>(kDims, place);
std::vector<T> send_vector(f::product(kDims), gpu_id); std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor); paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
} }
...@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test { ...@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
} }
public: public:
std::vector<p::DeviceContext *> dev_ctxs; std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope; f::Scope g_scope_;
std::mutex mu; std::mutex mu_;
std::vector<int> gpu_list_;
}; };
// ncclInitOp with desc // ncclInitOp with desc
TEST(NCCL, ncclInitOp) { TEST_F(NCCLTester, ncclInitOp) {}
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});
f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;
auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();
auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
// ncclAllReduceOp with desc // ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) { TEST_F(NCCLTester, ncclAllReduceOp) {
...@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
std::vector<std::thread> ths; std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope()); dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i], std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]); *op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th)); ths.emplace_back(std::move(th));
} }
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join(); ths[i].join();
} }
// check results float expected_result = 0.0;
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}
for (size_t i = 0; i < dev_scopes.size(); ++i) { for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[i]); p::CUDAPlace gpu_place(gpu_list_[i]);
auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) { for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5); ASSERT_NEAR(ct[j], expected_result, 1e-5);
} }
} }
} }
...@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {
std::vector<std::thread> ths; std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope()); dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i], std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]); *op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th)); ths.emplace_back(std::move(th));
} }
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join(); ths[i].join();
} }
// check results on float expected_result = 0.0;
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]); p::CUDAPlace gpu_place(gpu_list_[kRoot]);
auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) { for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5); ASSERT_NEAR(ct[j], expected_result, 1e-5);
} }
} }
...@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {
std::vector<std::thread> ths; std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope()); dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i], std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]); *op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th)); ths.emplace_back(std::move(th));
} }
for (size_t i = 0; i < gpu_list.size(); ++i) { for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join(); ths[i].join();
} }
const int idx = 1; const int idx = 1;
// check results on float result = GetGPUData(kRoot);
float result = kRoot;
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]); p::CUDAPlace gpu_place(gpu_list_[idx]);
auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>(); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>(); auto *rt = recv_tensor.data<float>();
...@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) { ...@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place); auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy( paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
recv_tensor.numel() * sizeof(float), recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream()); static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) { for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5); ASSERT_NEAR(ct[j], result, 1e-5);
} }
} }
int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -158,11 +159,14 @@ class ParallelDoOp : public framework::OperatorBase {
auto &place = places[place_idx]; auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx]; auto *cur_scope = sub_scopes[place_idx];
workers.emplace_back(framework::Async([program, cur_scope, place, block] { workers.emplace_back(
framework::Executor executor(place); framework::Async([program, cur_scope, place, block, place_idx] {
executor.Run(*program, cur_scope, block->ID(), // Give the thread an id to distinguish parallel block with same id.
false /*create_local_scope*/); platform::RecordThread rt(static_cast<int>(place_idx) + 1);
})); framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
} }
for (auto &worker : workers) { for (auto &worker : workers) {
worker.wait(); worker.wait();
...@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -234,11 +238,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
auto *cur_scope = sub_scopes[i]; auto *cur_scope = sub_scopes[i];
// execute // execute
workers.emplace_back(framework::Async([program, cur_scope, place, block] { workers.emplace_back(
framework::Executor executor(place); framework::Async([program, cur_scope, place, block, i] {
executor.Run(*program, cur_scope, block->ID(), // Give the thread an id to distinguish parallel block with same id.
false /*create_local_scope*/); platform::RecordThread rt(static_cast<int>(i) + 1);
})); framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
} }
for (auto &worker : workers) { for (auto &worker : workers) {
worker.wait(); worker.wait();
......
...@@ -24,6 +24,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; ...@@ -24,6 +24,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
using DataLayout = platform::DataLayout; using DataLayout = platform::DataLayout;
using PoolingMode = platform::PoolingMode; using PoolingMode = platform::PoolingMode;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
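The ScalingParamType alias above exists because cuDNN expects the alpha/beta scaling factors in float (not half) when the tensor data type is half precision, so passing T directly would break the float16 kernels registered below. A minimal sketch of that trait idea (placeholder float16 type, not the real platform::float16):

#include <cstdint>

struct float16 { uint16_t x; };  // placeholder for platform::float16

template <typename T> struct ScalingParam { using type = T; };
template <> struct ScalingParam<float16> { using type = float; };

int main() {
  // alpha/beta handed to cudnnPoolingForward stay plain float for fp16 data
  ScalingParam<float16>::type alpha = 1.0f, beta = 0.0f;
  static_assert(sizeof(ScalingParam<float16>::type) == sizeof(float),
                "scaling params for half tensors are plain float");
  (void)alpha; (void)beta;
  return 0;
}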
template <typename T> template <typename T>
class PoolCUDNNOpKernel : public framework::OpKernel<T> { class PoolCUDNNOpKernel : public framework::OpKernel<T> {
...@@ -78,8 +80,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> { ...@@ -78,8 +80,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn pool algorithm --------------------- // ------------------- cudnn pool algorithm ---------------------
auto handle = ctx.cuda_device_context().cudnn_handle(); auto handle = ctx.cuda_device_context().cudnn_handle();
T alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward( PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
cudnn_output_desc, output_data)); cudnn_output_desc, output_data));
...@@ -144,8 +145,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> { ...@@ -144,8 +145,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn pool algorithm --------------------- // ------------------- cudnn pool algorithm ---------------------
auto handle = ctx.cuda_device_context().cudnn_handle(); auto handle = ctx.cuda_device_context().cudnn_handle();
T alpha = 1.0f, beta = 0.0f; ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
// Because beta is zero, it is unnecessary to reset input_grad. // Because beta is zero, it is unnecessary to reset input_grad.
...@@ -162,17 +162,19 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> { ...@@ -162,17 +162,19 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace, REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>, ops::PoolCUDNNOpKernel<float>,
ops::PoolCUDNNOpKernel<double>); ops::PoolCUDNNOpKernel<double>,
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace, ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>, ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>); ops::PoolCUDNNGradOpKernel<double>);
REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace, REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>, ops::PoolCUDNNOpKernel<float>,
ops::PoolCUDNNOpKernel<double>); ops::PoolCUDNNOpKernel<double>);
REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace, REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>, ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>); ops::PoolCUDNNGradOpKernel<double>);
...@@ -124,11 +124,15 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( ...@@ -124,11 +124,15 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
} }
#endif #endif
auto input_data_type = framework::ToDataType(ctx.Input<Tensor>("X")->type());
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
}
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(), library_);
layout_, library_);
} }
Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
......
...@@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddAttr<std::vector<float>>( AddAttr<std::vector<float>>(
"max_sizes", "max_sizes",
"(vector<float>) List of max sizes of generated prior boxes."); "(vector<float>) List of max sizes of generated prior boxes.")
.SetDefault(std::vector<float>{});
AddAttr<std::vector<float>>( AddAttr<std::vector<float>>(
"aspect_ratios", "aspect_ratios",
"(vector<float>) List of aspect ratios of generated prior boxes."); "(vector<float>) List of aspect ratios of generated prior boxes.");
......
...@@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace()); boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace()); vars->mutable_data<T>(ctx.GetPlace());
T inv_img_width = 1.0 / img_width;
T inv_img_height = 1.0 / img_height;
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes); auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
for (int h = 0; h < feature_height; ++h) { for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) { for (int w = 0; w < feature_width; ++w) {
...@@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
for (size_t s = 0; s < min_sizes.size(); ++s) { for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s]; auto min_size = min_sizes[s];
// first prior: aspect_ratio = 1, size = min_size // first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size; box_width = box_height = min_size / 2.;
// xmin // xmin
e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width; e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width; e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
if (max_sizes.size() > 0) { if (max_sizes.size() > 0) {
auto max_size = max_sizes[s]; auto max_size = max_sizes[s];
// second prior: aspect_ratio = 1, // second prior: aspect_ratio = 1,
// size = sqrt(min_size * max_size) // size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size * max_size); box_width = box_height = sqrt(min_size * max_size) / 2.;
// xmin // xmin
e_boxes(h, w, idx, 0) = e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
(center_x - box_width * 0.5) * inv_img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
(center_x + box_width * 0.5) * inv_img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
} }
...@@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
if (fabs(ar - 1.) < 1e-6) { if (fabs(ar - 1.) < 1e-6) {
continue; continue;
} }
box_width = min_size * sqrt(ar); box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar); box_height = min_size / sqrt(ar) / 2.;
// xmin // xmin
e_boxes(h, w, idx, 0) = e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
(center_x - box_width * 0.5) * inv_img_width;
// ymin // ymin
e_boxes(h, w, idx, 1) = e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
(center_y - box_height * 0.5) * inv_img_height;
// xmax // xmax
e_boxes(h, w, idx, 2) = e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
(center_x + box_width * 0.5) * inv_img_width;
// ymax // ymax
e_boxes(h, w, idx, 3) = e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
(center_y + box_height * 0.5) * inv_img_height;
idx++; idx++;
} }
} }
......
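A quick numeric check of the normalized corner formula used above, with hypothetical values (300x300 image, cell center at (4, 4), min_size = 30): box_width/box_height now hold the half extent, so xmin = (cx - w)/W and xmax = (cx + w)/W.

#include <cstdio>

int main() {
  const float img_width = 300.f, img_height = 300.f;
  const float center_x = 4.f, center_y = 4.f;  // feature-map cell center, in pixels
  const float min_size = 30.f;
  const float half = min_size / 2.f;           // box_width == box_height above
  std::printf("xmin=%.4f ymin=%.4f xmax=%.4f ymax=%.4f\n",
              (center_x - half) / img_width, (center_y - half) / img_height,
              (center_x + half) / img_width, (center_y + half) / img_height);
  // -> xmin=-0.0367 ymin=-0.0367 xmax=0.0633 ymax=0.0633
  return 0;
}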
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry) set(LOCAL_READER_LIBS)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry) function(reader_library TARGET_NAME)
op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry) set(oneValueArgs "")
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE) set(multiValueArgs SRCS DEPS)
set(options "")
set(common_deps reader_op_registry)
cmake_parse_arguments(reader_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
op_library(${TARGET_NAME} SRCS ${reader_library_SRCS} DEPS ${common_deps} ${reader_library_DEPS})
set(LOCAL_READER_LIBS
${TARGET_NAME}
${LOCAL_READER_LIBS}
PARENT_SCOPE)
endfunction()
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
# Export local libraries to parent
set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
...@@ -24,11 +24,31 @@ static constexpr size_t kDoubleBufferSize = 2; ...@@ -24,11 +24,31 @@ static constexpr size_t kDoubleBufferSize = 2;
class DoubleBufferReader : public framework::DecoratedReader { class DoubleBufferReader : public framework::DecoratedReader {
public: public:
explicit DoubleBufferReader(ReaderBase* reader) struct Item {
: DecoratedReader(reader), Item() : ctx_(nullptr) {}
buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize)) { std::vector<framework::LoDTensor> payloads_;
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this); platform::DeviceContext* ctx_;
};
explicit DoubleBufferReader(
ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
: DecoratedReader(reader), place_(target_place) {
for (size_t i = 0; i < kDoubleBufferSize; ++i) {
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
ctxs_.emplace_back(new platform::CUDADeviceContext(
boost::get<platform::CUDAPlace>(place_)));
#endif
}
}
start_thread();
}
void start_thread() {
buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
std::thread prefetch([this] { PrefetchThreadFunc(); });
prefetch.detach(); prefetch.detach();
} }
...@@ -37,10 +57,15 @@ class DoubleBufferReader : public framework::DecoratedReader { ...@@ -37,10 +57,15 @@ class DoubleBufferReader : public framework::DecoratedReader {
~DoubleBufferReader() { buffer_->Close(); } ~DoubleBufferReader() { buffer_->Close(); }
bool HasNext() const override;
private: private:
void PrefetchThreadFunc(); void PrefetchThreadFunc();
framework::Channel<std::vector<framework::LoDTensor>>* buffer_; framework::Channel<Item>* buffer_;
platform::Place place_;
std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
mutable Item local_buffer_;
}; };
class CreateDoubleBufferReaderOp : public framework::OperatorBase { class CreateDoubleBufferReaderOp : public framework::OperatorBase {
...@@ -54,7 +79,20 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -54,7 +79,20 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
out->Reset(new DoubleBufferReader(underlying_reader.Get()));
auto place_str = Attr<std::string>("place");
platform::Place place;
if (place_str == "CPU") {
place = platform::CPUPlace();
} else {
std::istringstream sin(place_str);
sin.seekg(std::string("CUDA:").size(), std::ios::beg);
size_t num;
sin >> num;
place = platform::CUDAPlace(static_cast<int>(num));
}
out->Reset(new DoubleBufferReader(underlying_reader.Get(), place));
} }
}; };
...@@ -69,41 +107,72 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -69,41 +107,72 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
It launches another thread to execute the 'underlying reader' asynchronously, It launches another thread to execute the 'underlying reader' asynchronously,
which prevents the reading process from blocking subsequent training. which prevents the reading process from blocking subsequent training.
)DOC"); )DOC");
std::unordered_set<std::string> enum_range;
constexpr size_t kMaxCUDADevs = 128;
for (size_t i = 0; i < kMaxCUDADevs; ++i) {
enum_range.insert(string::Sprintf("CUDA:%d", i));
}
enum_range.insert("CPU");
AddAttr<std::string>("place", "The double buffer place, default is CPU")
.SetDefault("CPU")
.InEnum({enum_range});
} }
}; };
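The DOC string above describes classic double buffering: a detached prefetch thread keeps a small bounded buffer filled so ReadNext rarely blocks. A self-contained sketch of that producer/consumer pattern (hypothetical names, synthetic batches instead of a real underlying reader):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class DoubleBufferSketch {
 public:
  explicit DoubleBufferSketch(size_t capacity) : capacity_(capacity) {
    std::thread([this] { Prefetch(); }).detach();  // mirrors start_thread()
  }

  std::vector<float> ReadNext() {
    std::unique_lock<std::mutex> lk(mu_);
    not_empty_.wait(lk, [this] { return !queue_.empty(); });
    std::vector<float> batch = std::move(queue_.front());
    queue_.pop();
    not_full_.notify_one();
    return batch;
  }

 private:
  void Prefetch() {
    for (int i = 0;; ++i) {  // stands in for reader_->ReadNext()
      std::vector<float> batch(4, static_cast<float>(i));
      std::unique_lock<std::mutex> lk(mu_);
      not_full_.wait(lk, [this] { return queue_.size() < capacity_; });
      queue_.push(std::move(batch));
      not_empty_.notify_one();
    }
  }

  size_t capacity_;
  std::queue<std::vector<float>> queue_;
  std::mutex mu_;
  std::condition_variable not_empty_, not_full_;
};

int main() {
  auto* reader = new DoubleBufferSketch(2);  // leaked on purpose: the detached thread outlives main
  for (int i = 0; i < 3; ++i) std::cout << reader->ReadNext()[0] << '\n';
  return 0;
}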
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) { void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear(); if (local_buffer_.payloads_.empty()) {
buffer_->Receive(out); buffer_->Receive(&local_buffer_);
}
*out = local_buffer_.payloads_;
local_buffer_.payloads_.clear();
if (local_buffer_.ctx_) {
local_buffer_.ctx_->Wait();
}
} }
void DoubleBufferReader::ReInit() { void DoubleBufferReader::ReInit() {
reader_->ReInit(); reader_->ReInit();
buffer_->Close(); buffer_->Close();
// The existing prefetch thread will terminate for the buffer_ is closed. start_thread();
buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize);
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
} }
void DoubleBufferReader::PrefetchThreadFunc() { void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts."; VLOG(5) << "A new prefetch thread starts.";
while (true) { size_t gpu_ctx_offset = 0;
std::vector<framework::LoDTensor> batch; while (reader_->HasNext()) {
reader_->ReadNext(&batch); Item batch;
if (batch.empty()) { reader_->ReadNext(&batch.payloads_);
// EOF if (platform::is_gpu_place(place_)) {
buffer_->Close(); std::vector<framework::LoDTensor> gpu_batch;
VLOG(5) << "Reached the end of the file. The prefetch thread terminates."; auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++];
break; gpu_ctx_offset %= this->ctxs_.size();
gpu_batch.resize(batch.payloads_.size());
for (size_t i = 0; i < batch.payloads_.size(); ++i) {
framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx,
&gpu_batch[i]);
gpu_batch[i].set_lod(batch.payloads_[i].lod());
}
batch.ctx_ = gpu_ctx.get();
std::swap(gpu_batch, batch.payloads_);
} }
if (!buffer_->Send(&batch)) { if (!buffer_->Send(&batch)) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread terminates."; "prefetch thread terminates.";
break; break;
} }
} }
buffer_->Close();
}
bool DoubleBufferReader::HasNext() const {
if (local_buffer_.payloads_.empty()) {
bool ok = buffer_->Receive(&local_buffer_);
return ok;
} else {
return true;
}
} }
} // namespace reader } // namespace reader
......
...@@ -19,11 +19,11 @@ namespace operators { ...@@ -19,11 +19,11 @@ namespace operators {
namespace reader { namespace reader {
template <typename T> template <typename T>
class RandomDataGenerator : public framework::FileReader { class RandomDataGenerator : public framework::ReaderBase {
public: public:
RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min, RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
float max) float max)
: FileReader(shapes), min_(min), max_(max) { : framework::ReaderBase(), min_(min), max_(max), shapes_(shapes) {
PADDLE_ENFORCE_LE( PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()(); unsigned int seed = std::random_device()();
...@@ -52,11 +52,14 @@ class RandomDataGenerator : public framework::FileReader { ...@@ -52,11 +52,14 @@ class RandomDataGenerator : public framework::FileReader {
void ReInit() override { return; } void ReInit() override { return; }
bool HasNext() const override { return true; }
private: private:
float min_; float min_;
float max_; float max_;
std::minstd_rand engine_; std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_; std::uniform_real_distribution<float> dist_;
std::vector<framework::DDim> shapes_;
}; };
template <typename T> template <typename T>
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
#include "paddle/fluid/recordio/scanner.h"
namespace paddle {
namespace operators {
namespace reader {
class RecordIOFileReader : public framework::FileReader {
public:
explicit RecordIOFileReader(const std::string& filename,
const std::vector<framework::DDim>& dims)
: FileReader(dims),
scanner_(filename),
dev_ctx_(*platform::DeviceContextPool::Instance().Get(
platform::CPUPlace())) {}
bool HasNext() const override { return scanner_.HasNext(); }
void ReInit() override { scanner_.Reset(); }
protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
*out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
}
private:
recordio::Scanner scanner_;
const platform::DeviceContext& dev_ctx_;
};
class CreateRecordIOReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
std::string filename = Attr<std::string>("filename");
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
new RecordIOFileReader(filename, RestoreShapes(shape_concat, ranks)));
}
};
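The shape_concat/ranks check above relies on the attribute encoding used by the file reader makers: shape_concat is all tensor dimensions flattened into one list, and ranks records how many dimensions belong to each tensor. A standalone sketch of the splitting that RestoreShapes performs (hypothetical Shape alias instead of framework::DDim):

#include <cassert>
#include <vector>

using Shape = std::vector<int>;  // stand-in for framework::DDim

std::vector<Shape> RestoreShapesSketch(const std::vector<int>& shape_concat,
                                       const std::vector<int>& ranks) {
  std::vector<Shape> shapes;
  size_t offset = 0;
  for (int rank : ranks) {
    shapes.emplace_back(shape_concat.begin() + offset,
                        shape_concat.begin() + offset + rank);
    offset += rank;
  }
  assert(offset == shape_concat.size());  // the condition enforced above
  return shapes;
}

int main() {
  auto shapes = RestoreShapesSketch({32, 784, 8, 3, 3}, {2, 3});
  assert(shapes.size() == 2 && shapes[0].size() == 2 && shapes[1].size() == 3);
  return 0;
}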
class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
public:
CreateRecordIOReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: FileReaderMakerBase(op_proto, op_checker) {
AddAttr<std::string>("filename", "The filename of record io reader");
AddComment(R"DOC(
CreateRecordIOReader Operator
Create a reader from a record io file
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace reader = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader,
reader::CreateRecordIOReaderOp,
reader::CreateRecordIOReaderOpMaker);
REGISTER_FILE_READER(recordio, reader::RecordIOFileReader);
...@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <random>
#include "glog/logging.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"

namespace paddle {
...@@ -20,43 +23,53 @@ namespace reader {
class ShuffleReader : public framework::DecoratedReader {
 public:
  ShuffleReader(ReaderBase* reader, size_t buffer_size, size_t seed = 0)
      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
    VLOG(10) << "Create shuffle reader of " << reader_;
    if (seed_ == 0) {
      std::random_device device;
      seed_ = device();
    }
    ReadIntoBuffers();
  }

  void ReadNext(std::vector<framework::LoDTensor>* out) override {
    if (iteration_pos_ >= buffer_.size()) {
      VLOG(10) << "Resetting shuffle buffer";
      ReadIntoBuffers();
    }
    *out = buffer_[iteration_pos_++];
  }

  bool HasNext() const override {
    return iteration_pos_ < buffer_.size() || reader_->HasNext();
  }

 private:
  void ReadIntoBuffers() {
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    iteration_pos_ = 0;
    PADDLE_ENFORCE(reader_->HasNext());
    for (size_t i = 0; i < buffer_size_; ++i) {
      if (!reader_->HasNext()) {
        break;
      }
      buffer_.emplace_back();
      reader_->ReadNext(&buffer_.back());
    }
    std::mt19937 g(seed_);
    std::shuffle(buffer_.begin(), buffer_.end(), g);
    seed_ = g();  // update seed_
    VLOG(10) << "random buffer size = " << buffer_.size();
  }

  size_t buffer_size_;
  std::vector<std::vector<framework::LoDTensor>> buffer_;

  size_t iteration_pos_;
  size_t seed_;
};

class CreateShuffleReaderOp : public framework::OperatorBase {
 public:
...@@ -67,10 +80,10 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
               const platform::Place& dev_place) const override {
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
    auto& var = detail::Ref(scope.FindVar(Output("Out")));
    var.GetMutable<framework::ReaderHolder>()->Reset(
        new ShuffleReader(underlying_reader.Get(),
                          static_cast<size_t>(Attr<int>("buffer_size"))));
  }
};
......
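The rewritten ShuffleReader above keeps a fixed-size buffer, refills it from the underlying reader when exhausted, and reshuffles each refill with a reseeded std::mt19937. A standalone sketch of the same buffer-and-shuffle idea over plain ints (illustrative only, no Paddle types):

#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

// Reads `buffer_size` items at a time from `source`, shuffles each buffer,
// and prints them in the shuffled order.
int main() {
  std::vector<int> source(10);
  std::iota(source.begin(), source.end(), 0);  // 0..9
  const size_t buffer_size = 4;

  std::random_device device;
  std::mt19937 gen(device());

  size_t pos = 0;
  while (pos < source.size()) {
    size_t end = std::min(pos + buffer_size, source.size());
    std::vector<int> buffer(source.begin() + pos, source.begin() + end);
    std::shuffle(buffer.begin(), buffer.end(), gen);  // same idea as ReadIntoBuffers
    for (int v : buffer) std::cout << v << ' ';
    pos = end;
  }
  std::cout << '\n';
}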
...@@ -31,11 +31,16 @@ std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
  return res;
}

std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
  static std::unordered_map<std::string, FileReaderCreator> regs;
  return regs;
}

FileReaderMakerBase::FileReaderMakerBase(
    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
    framework::OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(op_proto, op_checker) {
  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
  AddAttr<std::vector<int>>(
      "ranks",
...@@ -49,6 +54,10 @@ FileReaderMakerBase::FileReaderMakerBase(
}

void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(
      !ctx->IsRuntime(),
      "'FileReaderInferShape' should only be invoked during compile time.");

  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output file reader should not be null.");
  const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
...@@ -56,16 +65,14 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
  ctx->SetReaderDims("Out", shapes);

  const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
  PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
                    "The number of 'lod_levels'(%d) doesn't match the number "
                    "of 'shapes'(%d).",
                    lod_levels.size(), shapes.size());
  framework::VarDesc* reader =
      boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
  reader->SetLoDLevels(lod_levels);
}

void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
...@@ -77,19 +84,21 @@ void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,

void DecoratedReaderInferShape::operator()(
    framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE(!ctx->IsRuntime(),
                 "'DecoratedReaderInferShape' should only be invoked during "
                 "compile time.");

  PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
                 "Input(UnderlyingReader) should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"),
                 "The output decorated reader should not be null.");
  ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));

  framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
      ctx->GetInputVarPtrs("UnderlyingReader")[0]);
  framework::VarDesc* out_reader =
      boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
  out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}

void DecoratedReaderInferVarType::operator()(
    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
......
...@@ -21,6 +21,20 @@ namespace paddle {
namespace operators {
namespace reader {
using FileReaderCreator = std::function<framework::ReaderBase*(
const std::string&, const std::vector<framework::DDim>&)>;
std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
template <typename Reader>
int RegisterFileReader(const std::string& filetype) {
FileReaderRegistry()[filetype] = [](
const std::string& fn, const std::vector<paddle::framework::DDim>& dim) {
return new Reader(fn, dim);
};
return 0;
}
extern std::vector<framework::DDim> RestoreShapes(
    const std::vector<int>& shape_concat, const std::vector<int>& ranks);
...@@ -73,3 +87,15 @@ class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
      paddle::operators::reader::DecoratedReaderInferShape,      \
      paddle::framework::EmptyGradOpMaker,                       \
      paddle::operators::reader::DecoratedReaderInferVarType)
#define REGISTER_FILE_READER(_filetype, _reader) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
_reg_file_reader_##_filetype, \
"Must use REGISTER_FILE_READER in global namespace"); \
int TouchFileReader##_filetype() { return 0; } \
int _reg_file_reader_entry_##filetype = \
paddle::operators::reader::RegisterFileReader<_reader>(#_filetype)
#define USE_FILE_READER(filetype) \
extern int TouchFileReader##filetype(); \
static int _use_##filetype = TouchFileReader##filetype()
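The registry added above follows a common static-registration pattern: a function-local map from filetype to factory lambda, plus a macro that registers at namespace scope by initializing a dummy int. A self-contained sketch of that pattern (generic names, not the Paddle macros):

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Reader {
  virtual ~Reader() = default;
  virtual void Read() = 0;
};

using ReaderCreator = std::function<Reader*(const std::string&)>;

// A function-local static avoids the static initialization order problem.
std::unordered_map<std::string, ReaderCreator>& Registry() {
  static std::unordered_map<std::string, ReaderCreator> regs;
  return regs;
}

template <typename T>
int RegisterReader(const std::string& filetype) {
  Registry()[filetype] = [](const std::string& fn) { return new T(fn); };
  return 0;
}

// Registration happens through a namespace-scope int initializer, the same
// trick REGISTER_FILE_READER uses.
#define REGISTER_READER(filetype, cls) \
  static int _reg_reader_##cls = RegisterReader<cls>(#filetype)

struct TextReader : Reader {
  explicit TextReader(const std::string& fn) : fn_(fn) {}
  void Read() override { std::cout << "reading " << fn_ << '\n'; }
  std::string fn_;
};
REGISTER_READER(text, TextReader);

int main() {
  std::unique_ptr<Reader> r(Registry().at("text")("data.txt"));
  r->Read();
}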
...@@ -173,6 +173,15 @@ class ReduceMinOpMaker : public ReduceOpMaker {
  }
};
class ReduceProdOpMaker : public ReduceOpMaker {
public:
ReduceProdOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: ReduceOpMaker(proto, op_checker) {
SetComment("ReduceProd", "production");
AddComment(comment_);
}
};
}  // namespace operators
}  // namespace paddle
...@@ -190,6 +199,9 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
            ops::ReduceGradOp);
REGISTER_OP(reduce_prod, ops::ReduceOp, ops::ReduceProdOpMaker,
reduce_prod_grad, ops::ReduceGradOp);
#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)       \
  REGISTER_OP_CPU_KERNEL(reduce_type,                                        \
                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
......
...@@ -93,6 +93,22 @@ struct MaxOrMinGradFunctor {
  }
};
struct ProdFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
y.device(place) = x.prod(dim);
}
};
struct ProdGradFunctor {
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
const Dim& dim, int size) {
dx.device(place) = dy.broadcast(dim) * y.broadcast(dim) * x.inverse();
}
};
template <typename DeviceContext, typename T, typename Functor>
class ReduceKernel : public framework::OpKernel<T> {
 public:
...@@ -254,4 +270,5 @@ class ReduceGradKernel : public framework::OpKernel<T> {
  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor); \
__macro(reduce_prod, ProdFunctor, ProdGradFunctor);
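For reference, the ProdGradFunctor above relies on the identity d/dx_i of prod_j x_j = (prod_j x_j) / x_i, so dx = dy * y / x elementwise (assuming no zero entries). A small standalone check of that identity against a finite difference:

#include <iostream>
#include <vector>

double Prod(const std::vector<double>& x) {
  double p = 1.0;
  for (double v : x) p *= v;
  return p;
}

int main() {
  std::vector<double> x = {2.0, 3.0, 0.5};
  double y = Prod(x);
  double dy = 1.0;  // upstream gradient
  const double eps = 1e-6;

  for (size_t i = 0; i < x.size(); ++i) {
    double analytic = dy * y / x[i];  // same formula as ProdGradFunctor
    std::vector<double> xp = x;
    xp[i] += eps;
    double numeric = dy * (Prod(xp) - y) / eps;
    std::cout << "dx[" << i << "] analytic=" << analytic
              << " numeric=" << numeric << '\n';
  }
}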
...@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
                   "Input(Ids) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Updates"),
                   "Input(Updates) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ScatterOp should not be null.");

    auto updates_dims = ctx->GetInputDim("Updates");
    auto ref_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
                      "Update Ids should be 1-D.");
    PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
                      "Input(X) and Updates should have the same shape size");
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
                      ctx->GetInputDim("Ids")[0],
                      "Updates and Ids should have same batch-size.");
    framework::DDim data_dim(updates_dims);
    for (int i = 1; i < data_dim.size(); ++i) {
      PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
...@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        ctx.device_context());
  }
};
...@@ -64,14 +64,14 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("Updates"),
                      ctx->GetInputDim("Updates"));
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        ctx.device_context());
  }
};
...@@ -80,9 +80,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The source input of scatter op");
    AddInput("Ids", "The index input of scatter op where X will be updated");
    AddInput("Updates", "The updated value of updates op");
    AddOutput("Out", "The output of add op");
    AddComment(R"DOC(
...@@ -91,8 +90,8 @@ Scatter Operator.

This operator obtains output by updating the input on selected indices on the first axis:

$$
Out = X \\
Out[Ids] = X[Ids] + Updates
$$

)DOC");
......
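To make the renamed interface concrete: the op still computes Out = X and then Out[Ids[i]] += Updates[i] along the first axis. A plain C++ sketch of that semantics on row-major data (illustrative only, not the real ScatterAssign):

#include <iostream>
#include <vector>

// Out = X; then Out[Ids[i]] += Updates[i], where each "row" has `width` elements.
int main() {
  const int width = 2;
  std::vector<float> X = {1, 1, 2, 2, 3, 3};      // 3 rows
  std::vector<int> Ids = {2, 0};                  // rows to update
  std::vector<float> Updates = {10, 10, 20, 20};  // 2 rows

  std::vector<float> Out = X;  // In place output: Out = X
  for (size_t i = 0; i < Ids.size(); ++i) {
    for (int j = 0; j < width; ++j) {
      Out[Ids[i] * width + j] += Updates[i * width + j];
    }
  }
  for (float v : Out) std::cout << v << ' ';  // 21 21 2 2 13 13
  std::cout << '\n';
}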
...@@ -25,14 +25,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto *X = ctx.Input<Tensor>("X");
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *Updates = ctx.Input<Tensor>("Updates");
    auto *Out = ctx.Output<Tensor>("Out");

    Out->ShareDataWith(*X);

    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
};
...@@ -42,16 +42,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

    // In place gradient: dX = dO
    dX->ShareDataWith(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates = dO[Ids]
    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
  }
};
......
...@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");
    auto *X = ctx.Input<Tensor>("X");
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *Updates = ctx.Input<Tensor>("Updates");
    auto *Out = ctx.Output<Tensor>("Out");

    // In place output: Out = X, Out[Ids] += Updates
    Out->ShareDataWith(*X);
    // Apply ScatterUpdate: Out[index] += Updates[:]
    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
};
...@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
    auto *Ids = ctx.Input<Tensor>("Ids");
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));

    // In place gradient: dX = dO
    dX->ShareDataWith(*dOut);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates += dO[Ids]
    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
  }
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <boost/tokenizer.hpp>
#include <memory>
#include <thread>
#include <vector>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/concurrency/channel_util.h"
namespace paddle {
namespace operators {
static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block";
class SelectOp : public framework::OperatorBase {
public:
SelectOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {}
private:
enum class SelectOpCaseType {
DEFAULT = 0,
SEND = 1,
RECEIVE = 2,
};
struct SelectOpCase {
int caseIndex;
SelectOpCaseType caseType;
std::string channelName;
std::string varName;
SelectOpCase() {}
SelectOpCase(int caseIndex, SelectOpCaseType caseType,
std::string channelName, std::string varName)
: caseIndex(caseIndex),
caseType(caseType),
channelName(channelName),
varName(varName) {}
};
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
std::vector<std::string> casesConfigs =
Attr<std::vector<std::string>>(kCases);
framework::BlockDesc *casesBlock =
Attr<framework::BlockDesc *>(kCasesBlock);
framework::Scope &casesBlockScope = scope.NewScope();
std::string caseToExecuteVarName = Input(kCaseToExecute);
framework::Variable *caseToExecuteVar =
casesBlockScope.FindVar(caseToExecuteVarName);
// Construct cases from "conditional_block_op"(s) in the casesBlock
std::vector<std::shared_ptr<SelectOpCase>> cases =
ParseAndShuffleCases(&casesConfigs);
// Get all unique channels involved in select
std::set<framework::ChannelHolder *> channelsSet;
for (auto c : cases) {
if (!c->channelName.empty()) {
auto channelVar = scope.FindVar(c->channelName);
framework::ChannelHolder *ch =
channelVar->GetMutable<framework::ChannelHolder>();
if (channelsSet.find(ch) == channelsSet.end()) {
channelsSet.insert(ch);
}
}
}
// Order all channels by their pointer address
std::vector<framework::ChannelHolder *> channels(channelsSet.begin(),
channelsSet.end());
std::sort(channels.begin(), channels.end());
// Poll all cases
int32_t caseToExecute = pollCases(&scope, &cases, channels);
// At this point, the case to execute has already been determined,
// so we can proceed with executing the cases block
framework::LoDTensor *caseToExecuteTensor =
caseToExecuteVar->GetMutable<framework::LoDTensor>();
caseToExecuteTensor->data<int32_t>()[0] = caseToExecute;
// Execute the cases block, only one case will be executed since we set the
// case_to_execute value to the index of the case we want to execute
framework::Executor executor(dev_place);
framework::ProgramDesc *program = casesBlock->Program();
executor.Run(*program, &casesBlockScope, casesBlock->ID(),
false /*create_local_scope*/);
}
/**
* Goes through all operators in the casesConfigs and processes
* "conditional_block" operators. These operators are mapped to our
* SelectOpCase objects. We randomize the case orders, and set the
   * default case (if any exists) as the last case.
* @param casesBlock
* @return
*/
std::vector<std::shared_ptr<SelectOpCase>> ParseAndShuffleCases(
std::vector<std::string> *casesConfigs) const {
std::vector<std::shared_ptr<SelectOpCase>> cases;
std::shared_ptr<SelectOpCase> defaultCase;
if (casesConfigs != nullptr) {
boost::char_delimiters_separator<char> sep(false, ",", "");
for (std::vector<std::string>::iterator itr = casesConfigs->begin();
itr < casesConfigs->end(); ++itr) {
std::string caseConfig = *itr;
boost::tokenizer<> tokens(caseConfig, sep);
boost::tokenizer<>::iterator tok_iter = tokens.begin();
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index");
std::string caseIndexString = *tok_iter;
int caseIndex = std::stoi(caseIndexString);
++tok_iter;
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type");
std::string caseTypeString = *tok_iter;
SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString);
std::string caseChannel;
std::string caseChannelVar;
++tok_iter;
if (caseType != SelectOpCaseType::DEFAULT) {
PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel");
caseChannel = *tok_iter;
++tok_iter;
PADDLE_ENFORCE(tok_iter != tokens.end(),
"Cannot get case channel variable");
caseChannelVar = *tok_iter;
}
auto c = std::make_shared<SelectOpCase>(caseIndex, caseType,
caseChannel, caseChannelVar);
if (caseType == SelectOpCaseType::DEFAULT) {
PADDLE_ENFORCE(defaultCase == nullptr,
"Select can only contain one default case.");
defaultCase = c;
} else {
cases.push_back(c);
}
}
}
// Randomly sort cases, with default case being last
std::random_shuffle(cases.begin(), cases.end());
if (defaultCase != nullptr) {
cases.push_back(defaultCase);
}
return cases;
}
/**
   * This method will recursively poll the cases and determine if any case
   * condition is true.
   * If none of the case conditions are true (and there is no default case),
   * then block the thread. The thread may be woken up by a channel operation,
   * at which point we execute the case.
* @param scope
* @param cases
* @param channels
* @return
*/
int32_t pollCases(const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases,
std::vector<framework::ChannelHolder *> channels) const {
// Lock all involved channels
lockChannels(channels);
std::atomic<int> caseToExecute(-1);
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
switch (c->caseType) {
case SelectOpCaseType::SEND:
PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel");
if (ch->CanSend()) {
// We can send to channel directly, send the data to channel
// and execute case
auto chVar = scope->FindVar(c->varName);
concurrency::ChannelSend(ch, chVar);
caseToExecute = c->caseIndex;
}
break;
case SelectOpCaseType::RECEIVE:
if (ch->CanReceive()) {
// We can receive from channel directly, send the data to channel
// and execute case
auto chVar = scope->FindVar(c->varName);
concurrency::ChannelReceive(ch, chVar);
caseToExecute = c->caseIndex;
}
break;
case SelectOpCaseType::DEFAULT:
caseToExecute = c->caseIndex;
break;
}
if (caseToExecute != -1) {
// We found a case to execute, stop looking at other case statements
break;
}
++it;
}
if (caseToExecute == -1) {
// None of the cases are eligible to execute, enqueue current thread
// into all the sending/receiving queue of each involved channel
std::atomic<bool> completed(false);
std::recursive_mutex mutex;
std::unique_lock<std::recursive_mutex> lock{mutex};
// std::condition_variable_any selectCond;
auto selectCond = std::make_shared<std::condition_variable_any>();
std::recursive_mutex callbackMutex;
pushThreadOnChannelQueues(scope, cases, selectCond, caseToExecute,
completed, callbackMutex);
// TODO(thuan): Atomically unlock all channels and sleep current thread
unlockChannels(channels);
selectCond->wait(lock, [&completed]() { return completed.load(); });
// Select has been woken up by case operation
lockChannels(channels);
removeThreadOnChannelQueues(scope, cases);
if (caseToExecute == -1) {
// Recursively poll cases, since we were woken up by a channel close
// TODO(thuan): Need to test if this is a valid case
unlockChannels(channels);
return pollCases(scope, cases, channels);
}
}
// At this point, caseToExecute != -1, and we can proceed with executing
// the case block
unlockChannels(channels);
return caseToExecute;
}
void lockChannels(std::vector<framework::ChannelHolder *> chs) const {
std::vector<framework::ChannelHolder *>::iterator it = chs.begin();
while (it != chs.end()) {
framework::ChannelHolder *ch = *it;
ch->Lock();
++it;
}
}
void unlockChannels(std::vector<framework::ChannelHolder *> chs) const {
std::vector<framework::ChannelHolder *>::reverse_iterator it = chs.rbegin();
while (it != chs.rend()) {
framework::ChannelHolder *ch = *it;
ch->Unlock();
++it;
}
}
void pushThreadOnChannelQueues(
const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases,
std::shared_ptr<std::condition_variable_any> rCond,
std::atomic<int> &caseToExecute, std::atomic<bool> &completed,
std::recursive_mutex &callbackMutex) const {
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
std::function<bool(framework::ChannelAction channelAction)> cb =
[&caseToExecute, &completed, &callbackMutex,
c](framework::ChannelAction channelAction) {
std::lock_guard<std::recursive_mutex> lock{callbackMutex};
bool canProcess = false;
if (!completed) {
// If the channel wasn't closed, we set the caseToExecute index
// as this current case
if (channelAction != framework::ChannelAction::CLOSE) {
caseToExecute = c->caseIndex;
}
// This will allow our conditional variable to break out of wait
completed = true;
canProcess = true;
}
return canProcess;
};
switch (c->caseType) {
case SelectOpCaseType::SEND: {
auto chOutputVar = scope->FindVar(c->varName);
concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb);
break;
}
case SelectOpCaseType::RECEIVE: {
auto chOutputVar = scope->FindVar(c->varName);
concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb);
break;
}
default:
break;
}
++it;
}
}
void removeThreadOnChannelQueues(
const framework::Scope *scope,
std::vector<std::shared_ptr<SelectOpCase>> *cases) const {
std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
while (it != cases->end()) {
std::shared_ptr<SelectOpCase> c = *it;
auto chVar = scope->FindVar(c->channelName);
framework::ChannelHolder *ch =
chVar->GetMutable<framework::ChannelHolder>();
switch (c->caseType) {
case SelectOpCaseType::SEND: {
ch->RemoveFromSendQ(this);
break;
}
case SelectOpCaseType::RECEIVE: {
ch->RemoveFromReceiveQ(this);
break;
}
default:
break;
}
++it;
}
}
};
class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SelectOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kX,
"A set of variables, which are required by operators inside the "
"cases of Select Op")
.AsDuplicable();
AddInput(kCaseToExecute,
"(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of"
"all cases in the select op. Each"
"case is serialized as: "
"'<index>,<type>,<channel>,<value>'"
"where type is 0 for default, 1 for"
"send, and 2 for receive"
"No channel and values are needed for"
"default cases.");
AddAttr<framework::BlockDesc *>(kCasesBlock,
"The cases block inside select_op");
AddComment(R"DOC(
)DOC");
}
};
// TODO(thuan): Implement Gradient Operator for SELECT_OP
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(select, paddle::operators::SelectOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::SelectOpMaker);
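As the attribute comment above describes, each case reaches the op serialized as '<index>,<type>,<channel>,<value>'. A standalone sketch of parsing that format with a stringstream instead of boost::tokenizer (illustrative only, not the operator's actual parser):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct Case {
  int index = 0;
  int type = 0;  // 0 = default, 1 = send, 2 = receive
  std::string channel;
  std::string var;
};

Case ParseCase(const std::string& config) {
  std::stringstream ss(config);
  std::string token;
  std::vector<std::string> fields;
  while (std::getline(ss, token, ',')) fields.push_back(token);

  Case c;
  c.index = std::stoi(fields.at(0));
  c.type = std::stoi(fields.at(1));
  if (c.type != 0) {  // non-default cases carry a channel and a variable
    c.channel = fields.at(2);
    c.var = fields.at(3);
  }
  return c;
}

int main() {
  Case c = ParseCase("1,2,ch,x");
  std::cout << c.index << ' ' << c.type << ' ' << c.channel << ' ' << c.var << '\n';
}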
...@@ -88,6 +88,12 @@ class SendOp : public framework::OperatorBase {
        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
      }
      PADDLE_ENFORCE(rpc_client->Wait());
// tell pservers that current trainer have called fetch
for (auto& ep : endpoints) {
VLOG(3) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
    }
  }
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T>
class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<LoDTensor>("X");
auto* out = ctx.Output<LoDTensor>("Out");
auto lod = x->lod();
auto dims = x->dims();
const size_t level = lod.size() - 1;
PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
"The first dimension of Input(X) should be equal to the "
"sum of all sequences' lengths.");
PADDLE_ENFORCE_EQ(dims[0], x->numel(),
"The width of each timestep in Input(X) of "
"SequenceSoftmaxOp should be 1.");
out->mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
int start_pos = static_cast<int>(lod[level][i]);
int end_pos = static_cast<int>(lod[level][i + 1]);
Tensor x_i = x->Slice(start_pos, end_pos);
Tensor out_i = out->Slice(start_pos, end_pos);
// Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
framework::DDim dims_i =
// framework::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL});
framework::make_ddim({1UL, end_pos - start_pos});
x_i.Resize(dims_i);
out_i.Resize(dims_i);
math::SoftmaxCUDNNFunctor<T>()(
ctx.template device_context<platform::CUDADeviceContext>(), &x_i,
&out_i);
}
}
};
template <typename T>
class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<LoDTensor>("Out");
auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<LoDTensor>("X");
auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
auto lod = x->lod();
const size_t level = lod.size() - 1;
x_grad->mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
int start_pos = static_cast<int>(lod[level][i]);
int end_pos = static_cast<int>(lod[level][i + 1]);
Tensor out_i = out->Slice(start_pos, end_pos);
Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
// Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
out_i.Resize(dims_i);
out_grad_i.Resize(dims_i);
x_grad_i.Resize(dims_i);
math::SoftmaxGradCUDNNFunctor<T>()(
ctx.template device_context<platform::CUDADeviceContext>(), &out_i,
&out_grad_i, &x_grad_i);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxCUDNNKernel<float>,
ops::SequenceSoftmaxCUDNNKernel<double>)
REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::SequenceSoftmaxGradCUDNNKernel<float>,
ops::SequenceSoftmaxGradCUDNNKernel<double>)
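The kernel above applies softmax independently to each sequence, where the last LoD level stores the cumulative offsets of the sequences. A standalone CPU sketch of per-segment softmax over such offsets (plain std::vector, not the cuDNN functor):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

// Apply softmax to each [offsets[i], offsets[i+1]) slice of `x`.
void SegmentSoftmax(std::vector<float>* x, const std::vector<size_t>& offsets) {
  for (size_t i = 0; i + 1 < offsets.size(); ++i) {
    auto begin = x->begin() + offsets[i];
    auto end = x->begin() + offsets[i + 1];
    float max_v = *std::max_element(begin, end);  // for numerical stability
    float sum = 0.f;
    for (auto it = begin; it != end; ++it) {
      *it = std::exp(*it - max_v);
      sum += *it;
    }
    for (auto it = begin; it != end; ++it) *it /= sum;
  }
}

int main() {
  std::vector<float> x = {1, 2, 3, 1, 1};
  std::vector<size_t> offsets = {0, 3, 5};  // two sequences: [1,2,3] and [1,1]
  SegmentSoftmax(&x, offsets);
  for (float v : x) std::cout << v << ' ';
  std::cout << '\n';
}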
...@@ -29,6 +29,29 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Out");
  }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
}
#endif
framework::LibraryType library_ = framework::LibraryType::kPlain;
if (use_cudnn && runtime_cudnn_support) {
library_ = framework::LibraryType::kCUDNN;
}
std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::StringToDataLayout(data_format), library_);
}
};

class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -41,6 +64,17 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out",
              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
              "of length 1.");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddComment(R"DOC( AddComment(R"DOC(
Sequence Softmax Operator. Sequence Softmax Operator.
...@@ -91,6 +125,29 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { ...@@ -91,6 +125,29 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
}
#endif
framework::LibraryType library_ = framework::LibraryType::kPlain;
if (use_cudnn && runtime_cudnn_support) {
library_ = framework::LibraryType::kCUDNN;
}
std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::StringToDataLayout(data_format), library_);
}
};

}  // namespace operators
...@@ -102,7 +159,9 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
            ops::SequenceSoftmaxGradOp);
REGISTER_OP_CPU_KERNEL(
    sequence_softmax,
    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    sequence_softmax_grad,
    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -17,7 +17,10 @@ limitations under the License. */
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    sequence_softmax,
    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>)
REGISTER_OP_CUDA_KERNEL(
    sequence_softmax_grad,
    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out");
// allocate memory on device.
Out->mutable_data<T>(context.GetPlace());
math::SoftmaxCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), X, Out);
}
};
template <typename T>
class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* Out = context.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
// allocate memory on device.
dX->mutable_data<T>(context.GetPlace());
math::SoftmaxGradCUDNNFunctor<T>()(
context.template device_context<platform::CUDADeviceContext>(), Out,
dOut, dX);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace,
ops::SoftmaxCUDNNKernel<float>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>);
...@@ -33,6 +33,29 @@ class SoftmaxOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", x_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
}
#endif
framework::LibraryType library_ = framework::LibraryType::kPlain;
if (use_cudnn && runtime_cudnn_support) {
library_ = framework::LibraryType::kCUDNN;
}
std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::StringToDataLayout(data_format), library_);
}
};

class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -43,6 +66,17 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
             "The input tensor of softmax. "
             "2-D with shape [batch_size, input_feature_dimensions].");
    AddOutput("Out", "The normalized values with the same shape as X.");
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddComment(R"DOC( AddComment(R"DOC(
Softmax Operator. Softmax Operator.
...@@ -80,6 +114,29 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { ...@@ -80,6 +114,29 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
} }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// choose cudnn kernel if the runtime supported.
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
bool runtime_cudnn_support = false;
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
}
#endif
framework::LibraryType library_ = framework::LibraryType::kPlain;
if (use_cudnn && runtime_cudnn_support) {
library_ = framework::LibraryType::kCUDNN;
}
std::string data_format = ctx.Attr<std::string>("data_format");
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
framework::StringToDataLayout(data_format), library_);
}
};

}  // namespace operators
......
...@@ -23,21 +23,21 @@ using Tensor = framework::Tensor;
namespace {
template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
                                 const int batch_size, const int class_num) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
       i += blockDim.x * gridDim.x) {
    int idx = i * class_num + labels[i];
    logit_grad[idx] -= static_cast<T>(1.);
  }
}

template <typename T>
__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
                      const int class_num) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    logit_grad[i] *= loss_grad[i / class_num];
  }
}
...@@ -94,22 +94,22 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
    const int batch_size = logit_grad->dims()[0];
    const int class_num = logit_grad->dims()[1];
    int block = 512;
    auto stream = context.cuda_device_context().stream();

    if (context.Attr<bool>("soft_label")) {
      int grid = (batch_size * class_num + block - 1) / block;
      const T* label_data = labels->data<T>();
      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
          logit_grad_data, loss_grad_data, label_data, batch_size, class_num);
    } else {
      int grid = (batch_size + block - 1) / block;
      const int64_t* label_data = labels->data<int64_t>();
      CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
          logit_grad_data, label_data, batch_size, class_num);
      int num = batch_size * class_num;
      grid = (num + block - 1) / block;
      Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
                                           class_num);
    }
  }
};
......
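The two kernels above split the hard-label gradient into (a) subtracting 1 at each sample's label position and (b) scaling every entry by that sample's loss gradient. A sequential CPU sketch of the same computation, where logit_grad is assumed to already hold the softmax output, as in the CUDA kernel:

#include <iostream>
#include <vector>

int main() {
  const int batch_size = 2, class_num = 3;
  // logit_grad initially holds softmax(logits) for each sample.
  std::vector<float> logit_grad = {0.2f, 0.3f, 0.5f,
                                   0.1f, 0.8f, 0.1f};
  std::vector<long long> labels = {2, 1};
  std::vector<float> loss_grad = {1.0f, 0.5f};

  // Step 1: subtract one at the label position (CrossEntropyGrad kernel).
  for (int i = 0; i < batch_size; ++i) {
    logit_grad[i * class_num + labels[i]] -= 1.f;
  }
  // Step 2: scale every entry by its sample's loss gradient (Scale kernel).
  for (int i = 0; i < batch_size * class_num; ++i) {
    logit_grad[i] *= loss_grad[i / class_num];
  }
  for (float v : logit_grad) std::cout << v << ' ';
  std::cout << '\n';
}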
...@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
......
...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
namespace paddle { namespace paddle {
...@@ -80,6 +81,22 @@ enum class PoolingMode { ...@@ -80,6 +81,22 @@ enum class PoolingMode {
template <typename T> template <typename T>
class CudnnDataType; class CudnnDataType;
template <>
class CudnnDataType<float16> {
public:
static const cudnnDataType_t type = CUDNN_DATA_HALF;
// The scaling param type is float for HALF and FLOAT tensors
typedef const float ScalingParamType;
static ScalingParamType* kOne() {
static ScalingParamType v = 1.0;
return &v;
}
static ScalingParamType* kZero() {
static ScalingParamType v = 0.0;
return &v;
}
};
template <>
class CudnnDataType<float> {
 public:
...@@ -289,7 +306,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
  if (use_cudnn) {
    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
  }
#endif
......
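The float16 specialization above follows cuDNN's convention that half-precision tensors still use float alpha/beta scaling factors; the traits class bundles an enum tag with the scaling type and static kOne/kZero pointers. A cut-down sketch of the same traits pattern without any cuDNN dependency (names here are invented for illustration):

#include <cstdint>
#include <iostream>

enum class DataType { kHalf, kFloat, kDouble };

// Stand-in for a 16-bit float storage type.
struct half16 { uint16_t bits; };

template <typename T>
struct TypeTraits;

// Half tensors scale with float, mirroring CUDNN_DATA_HALF.
template <>
struct TypeTraits<half16> {
  static constexpr DataType type = DataType::kHalf;
  using ScalingParamType = const float;
  static ScalingParamType* kOne() { static ScalingParamType v = 1.0f; return &v; }
  static ScalingParamType* kZero() { static ScalingParamType v = 0.0f; return &v; }
};

template <>
struct TypeTraits<float> {
  static constexpr DataType type = DataType::kFloat;
  using ScalingParamType = const float;
  static ScalingParamType* kOne() { static ScalingParamType v = 1.0f; return &v; }
  static ScalingParamType* kZero() { static ScalingParamType v = 0.0f; return &v; }
};

int main() {
  std::cout << *TypeTraits<half16>::kOne() << ' '
            << static_cast<int>(TypeTraits<float>::type) << '\n';
}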
...@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
  SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device);
  multi_process = GetCUDAMultiProcessors(place_.device);
  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
...@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
  PADDLE_ENFORCE(cudaGetLastError());
}
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
  return multi_process * max_threads_per_mp;
}
......
...@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
  /*! \brief Return place in the device context. */
  Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */ /*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const; int GetMaxPhysicalThreadCount() const;
...@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext { ...@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int compute_capability;
int multi_process; int multi_process;
int max_threads_per_mp; int max_threads_per_mp;
}; };
......
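Note (a sketch, not part of this patch): the capability value cached above is encoded as major * 10 + minor, so feature checks become simple integer comparisons. SupportsFloat16Math below is a hypothetical helper illustrating the intended use:

#include "paddle/fluid/platform/device_context.h"

namespace paddle {
namespace platform {

// Compute capability 5.3 is the first with native float16 arithmetic, and it
// is reported as 53 under the major * 10 + minor encoding.
inline bool SupportsFloat16Math(const CUDADeviceContext& ctx) {
  return ctx.GetComputeCapability() >= 53;
}

}  // namespace platform
}  // namespace paddle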
...@@ -26,8 +26,14 @@ limitations under the License. */ ...@@ -26,8 +26,14 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace { namespace {
// Current thread's id. Note, we don't distinguish nested threads
// for now.
thread_local int cur_thread_id = 0;
// Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
thread_local std::deque<std::string> annotation_stack;
thread_local const char *cur_annotation = nullptr;
std::once_flag tracer_once_flag; std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr; DeviceTracer *tracer = nullptr;
} // namespace } // namespace
...@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -191,19 +197,19 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_[id] = anno; correlations_[id] = anno;
} }
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) { void AddCPURecords(const std::string &anno, uint64_t start_ns,
if (!anno) { uint64_t end_ns, int64_t device_id, int64_t thread_id) {
// TODO(panyx0718): Currently, it doesn't support nested situation if (anno.empty()) {
// Up-level can be cleared by low-level and therefore get nullptr VLOG(1) << "Empty timeline annotation.";
// here.
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0}); cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
} }
void AddMemRecords(const std::string &name, uint64_t start_ns, void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id, uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) { uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) { if (start_ns == 0 || end_ns == 0) {
...@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -215,8 +221,8 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
uint32_t stream_id, uint32_t correlation_id) { int64_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced"; VLOG(3) << correlation_id << " cannot be traced";
...@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -270,27 +276,30 @@ class DeviceTracerImpl : public DeviceTracer {
continue; continue;
} }
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
event->set_name(correlations_.at(r.correlation_id)); event->set_name(correlations_.at(r.correlation_id));
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
for (const CPURecord &r : cpu_records_) { for (const CPURecord &r : cpu_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
event->set_name(r.name); event->set_name(r.name);
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_stream_id(r.thread_id); event->set_sub_device_id(r.thread_id);
event->set_device_id(-1); event->set_device_id(r.device_id);
} }
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
event->set_name(r.name); event->set_name(r.name);
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
...@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -323,8 +332,9 @@ class DeviceTracerImpl : public DeviceTracer {
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) { if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno = const std::string anno = !annotation_stack.empty()
cur_annotation ? cur_annotation : cbInfo->symbolName; ? annotation_stack.back()
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno); tracer->AddAnnotation(cbInfo->correlationId, anno);
} }
} else { } else {
...@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer { ...@@ -351,14 +361,15 @@ class DeviceTracerDummy : public DeviceTracer {
void AddAnnotation(uint64_t id, const std::string &anno) {} void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {} void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
void AddMemRecords(const std::string &name, uint64_t start_ns, void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id, uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {} uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
uint32_t stream_id, uint32_t correlation_id) {} int64_t stream_id, uint32_t correlation_id) {}
bool IsEnabled() { return false; } bool IsEnabled() { return false; }
...@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() { ...@@ -384,11 +395,28 @@ DeviceTracer *GetDeviceTracer() {
return tracer; return tracer;
} }
void SetCurAnnotation(const char *anno) { cur_annotation = anno; } void SetCurAnnotation(const std::string &anno) {
annotation_stack.push_back(anno);
}
void ClearCurAnnotation() { annotation_stack.pop_back(); }
std::string CurAnnotation() {
if (annotation_stack.empty()) return "";
return annotation_stack.back();
}
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); }
void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
void ClearCurAnnotation() { cur_annotation = nullptr; } void ClearCurThread() { cur_thread_id = 0; }
const char *CurAnnotation() { return cur_annotation; } int CurThread() { return cur_thread_id; }
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
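Note (a sketch, not part of this patch): the thread_local stacks above replace the old single cur_annotation pointer so that nested RecordEvent scopes no longer clobber each other. A toy illustration of the push/pop semantics; Demo() is hypothetical and the header path is assumed to be paddle/fluid/platform/device_tracer.h:

#include "paddle/fluid/platform/device_tracer.h"

void Demo() {
  namespace plt = paddle::platform;
  plt::SetCurAnnotation("outer_op");    // push
  {
    plt::SetCurAnnotation("inner_op");  // push a nested scope
    // CUPTI callbacks fired here attribute kernels to "inner_op",
    // i.e. the back of the annotation stack.
    plt::ClearCurAnnotation();          // pop
  }
  // "outer_op" is visible again after the inner pop instead of being lost,
  // which was the limitation of the old nullptr-based implementation.
  plt::ClearCurAnnotation();
}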
...@@ -32,22 +32,23 @@ class DeviceTracer { ...@@ -32,22 +32,23 @@ class DeviceTracer {
struct KernelRecord { struct KernelRecord {
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
uint32_t device_id; int64_t device_id;
uint32_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord { struct CPURecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
uint64_t thread_id; int64_t device_id;
int64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
uint32_t device_id; int64_t device_id;
uint32_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
...@@ -64,18 +65,18 @@ class DeviceTracer { ...@@ -64,18 +65,18 @@ class DeviceTracer {
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns, virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint64_t end_ns, int64_t device_id,
uint32_t stream_id, uint32_t correlation_id, int64_t stream_id, uint32_t correlation_id,
uint64_t bytes) = 0; uint64_t bytes) = 0;
virtual void AddCPURecords(const char* anno, uint64_t start_ns, virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns) = 0; uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
uint32_t device_id, uint32_t stream_id, int64_t stream_id, uint32_t correlation_id) = 0;
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
...@@ -87,10 +88,18 @@ class DeviceTracer { ...@@ -87,10 +88,18 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer(); DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread. // Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno); void SetCurAnnotation(const std::string& anno);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread. // Current name of the operation being run in the thread.
const char* CurAnnotation(); std::string CurAnnotation();
void SetCurBlock(int block_id);
void ClearCurBlock();
int BlockDepth();
void SetCurThread(int thread_id);
void ClearCurThread();
int CurThread();
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -33,6 +33,15 @@ int GetCUDADeviceCount() { ...@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
return count; return count;
} }
int GetCUDAComputeCapability(int id) {
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
cudaDeviceProp device_prop;
PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id),
"cudaGetDeviceProperties failed in "
"paddle::platform::GetCUDAComputeCapability");
return device_prop.major * 10 + device_prop.minor;
}
int GetCUDAMultiProcessors(int id) { int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count; int count;
......
...@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse = ...@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system. //! Get the total number of GPU devices in system.
int GetCUDADeviceCount(); int GetCUDADeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetCUDAComputeCapability(int i);
//! Get the MultiProcessors of the ith GPU. //! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i); int GetCUDAMultiProcessors(int i);
......
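Note (worked example, not part of this patch): under this encoding a device with major version 6 and minor version 1 (for example a Pascal consumer GPU) reports 61, and the threshold of 53 used later in this change for float16 support corresponds to compute capability 5.3.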
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/device_vector.h>
#include <memory>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
static int dev_count = 0;
namespace paddle {
namespace platform {
TEST(NCCL, init) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
template <typename T>
struct PerThreadData {
thrust::device_vector<T> send_buff;
thrust::device_vector<T> recv_buff;
CUDADeviceContext dev_ctx;
T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }
T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
send_buff.resize(size);
for (size_t i = 0; i < size; ++i) {
send_buff[i] = static_cast<T>(i);
}
recv_buff.resize(size);
}
};
static constexpr int ELEM_COUNT = 10000;
TEST(NCCL, all_reduce) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
VLOG(1) << "ncclComm initialized";
VLOG(1) << "Creating thread data";
std::vector<std::unique_ptr<PerThreadData<double>>> data;
data.reserve(dev_count);
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Creating thread data for device " << i;
SetDeviceId(i);
data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
}
VLOG(1) << "Thread data created";
VLOG(1) << "Check send_buf data";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Check on device " << i;
SetDeviceId(i);
thrust::host_vector<double> tmp = data[i]->send_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
}
}
VLOG(1) << "Invoking ncclAllReduce";
dynload::ncclGroupStart();
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Invoking ncclAllReduce with device " << i;
SetDeviceId(i);
PADDLE_ENFORCE(dynload::ncclAllReduce(
data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
ncclSum, comms[i], data[i]->dev_ctx.stream()));
VLOG(1) << "Invoked ncclAllReduce for device " << i;
}
dynload::ncclGroupEnd();
VLOG(1) << "Invoked ncclAllReduce";
VLOG(1) << "Sync devices";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Sync device " << i;
SetDeviceId(i);
data[i]->dev_ctx.Wait();
}
VLOG(1) << "device synced";
for (int i = 0; i < dev_count; ++i) {
SetDeviceId(i);
VLOG(1) << "Checking vector on device " << i;
thrust::host_vector<double> tmp = data[i]->recv_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
auto elem = static_cast<double>(j);
elem *= dev_count;
ASSERT_NEAR(tmp[j], elem, 1e-4);
}
}
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
} // namespace platform
} // namespace paddle
int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
...@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) ...@@ -147,19 +147,48 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
name_ = name; name_ = name;
PushEvent(name_, dev_ctx_); PushEvent(name_, dev_ctx_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(name_.c_str()); SetCurAnnotation(name_);
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec()); tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
BlockDepth(), CurThread());
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
} }
RecordBlock::RecordBlock(int block_id) : start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
SetCurBlock(block_id);
name_ = string::Sprintf("block_%d", block_id);
}
RecordBlock::~RecordBlock() {
if (g_state == ProfilerState::kDisabled) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
    // We try to put all blocks at the same nested depth in the
    // same timeline lane, and distinguish them by thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
CurThread());
}
ClearCurBlock();
}
RecordThread::RecordThread(int thread_id) {
if (g_state == ProfilerState::kDisabled) return;
SetCurThread(thread_id);
}
RecordThread::~RecordThread() {
if (g_state == ProfilerState::kDisabled) return;
ClearCurThread();
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
......
...@@ -118,6 +118,20 @@ struct RecordEvent { ...@@ -118,6 +118,20 @@ struct RecordEvent {
std::string full_name_; std::string full_name_;
}; };
struct RecordBlock {
explicit RecordBlock(int block_id);
~RecordBlock();
private:
std::string name_;
uint64_t start_ns_;
};
struct RecordThread {
explicit RecordThread(int thread_id);
~RecordThread();
};
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
......
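Note (a sketch, not part of this patch): RecordBlock and RecordThread are RAII guards in the same spirit as RecordEvent. How an executor is expected to use them; RunBlock is a hypothetical function:

#include "paddle/fluid/platform/profiler.h"

void RunBlock(int thread_id, int block_id) {
  paddle::platform::RecordThread thread_guard(thread_id);
  paddle::platform::RecordBlock block_guard(block_id);
  // ... run each operator of the block; every op constructs its own
  // RecordEvent, and the CPU records it emits carry BlockDepth() and
  // CurThread(), so the timeline can group blocks of equal nesting depth
  // into one lane and separate them by thread.
}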
...@@ -18,12 +18,17 @@ package paddle.platform.proto; ...@@ -18,12 +18,17 @@ package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 1; } message MemCopy { optional uint64 bytes = 1; }
message Event { message Event {
enum EventType {
CPU = 0;
GPUKernel = 1;
}
optional EventType type = 8;
optional string name = 1; optional string name = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
// When positive, it represents gpu id. When -1, it represents CPU. // When positive, it represents gpu id. When -1, it represents CPU.
optional int64 device_id = 5; optional int64 device_id = 5;
optional uint32 stream_id = 6; optional int64 sub_device_id = 6;
optional MemCopy memcopy = 7; optional MemCopy memcopy = 7;
} }
......
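Note (a sketch, not part of this patch): with the new type and sub_device_id fields, a consumer of the serialized profile can tell CPU events (sub_device_id carries the thread id) from GPU kernel events (sub_device_id carries the stream id). A reader sketched under the assumption that the generated header is profiler.pb.h; DumpProfile is hypothetical:

#include <fstream>
#include <iostream>
#include <string>
#include "paddle/fluid/platform/profiler.pb.h"  // assumption: generated from profiler.proto

void DumpProfile(const std::string& path) {
  paddle::platform::proto::Profile profile;
  std::ifstream in(path, std::ios::binary);
  profile.ParseFromIstream(&in);
  for (const auto& e : profile.events()) {
    const char* kind =
        e.type() == paddle::platform::proto::Event::GPUKernel ? "gpu" : "cpu";
    // For CPU events sub_device_id holds the thread id; for GPU kernel and
    // memcpy events it holds the stream id.
    std::cout << kind << " " << e.name() << " device=" << e.device_id()
              << " sub=" << e.sub_device_id() << "\n";
  }
}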
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
......
...@@ -161,6 +161,8 @@ void BindBlockDesc(py::module &m) { ...@@ -161,6 +161,8 @@ void BindBlockDesc(py::module &m) {
py::return_value_policy::reference) py::return_value_policy::reference)
.def("prepend_op", &BlockDesc::PrependOp, .def("prepend_op", &BlockDesc::PrependOp,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("insert_op", &BlockDesc::InsertOp,
py::return_value_policy::reference)
.def("remove_op", &BlockDesc::RemoveOp) .def("remove_op", &BlockDesc::RemoveOp)
.def("var", .def("var",
[](BlockDesc &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
......
...@@ -26,16 +26,20 @@ limitations under the License. */ ...@@ -26,16 +26,20 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/cond_op.h" #include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/net_op.h" #include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/pybind/recordio.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -100,12 +104,14 @@ PYBIND11_PLUGIN(core) { ...@@ -100,12 +104,14 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<double>) .def("set", PyCPUTensorSetFromArray<double>)
.def("set", PyCPUTensorSetFromArray<int64_t>) .def("set", PyCPUTensorSetFromArray<int64_t>)
.def("set", PyCPUTensorSetFromArray<bool>) .def("set", PyCPUTensorSetFromArray<bool>)
.def("set", PyCPUTensorSetFromArray<uint16_t>)
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>) .def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>) .def("set", PyCUDATensorSetFromArray<int>)
.def("set", PyCUDATensorSetFromArray<double>) .def("set", PyCUDATensorSetFromArray<double>)
.def("set", PyCUDATensorSetFromArray<int64_t>) .def("set", PyCUDATensorSetFromArray<int64_t>)
.def("set", PyCUDATensorSetFromArray<bool>) .def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>)
#endif #endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>) .def("set_float_element", TensorSetElement<float>)
...@@ -217,8 +223,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -217,8 +223,18 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self) -> operators::NetOp * { [](Variable &self) -> operators::NetOp * {
return self.GetMutable<operators::NetOp>(); return self.GetMutable<operators::NetOp>();
}, },
py::return_value_policy::reference)
.def("get_reader",
[](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
return self.GetMutable<framework::ReaderHolder>();
},
py::return_value_policy::reference); py::return_value_policy::reference);
py::class_<framework::ReaderHolder>(m, "Reader", "")
.def("has_next", &framework::ReaderHolder::HasNext)
.def("reset", &framework::ReaderHolder::ReInit);
py::class_<Scope>(m, "Scope", "") py::class_<Scope>(m, "Scope", "")
.def("var", .def("var",
[](Scope &self, const std::string &name) -> Variable * { [](Scope &self, const std::string &name) -> Variable * {
...@@ -302,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -302,7 +318,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
}); });
// clang-format on // clang-format on
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
...@@ -410,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -410,6 +425,12 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", &framework::InitDevices); m.def("init_devices", &framework::InitDevices);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
#ifdef PADDLE_WITH_CUDA
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
// Only GPUs with Compute Capability >= 53 support float16
return platform::GetCUDAComputeCapability(place.device) >= 53;
});
#endif
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
...@@ -474,6 +495,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -474,6 +495,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("enable_profiler", platform::EnableProfiler); m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
BindRecordIOWriter(m);
return m.ptr(); return m.ptr();
} }
} // namespace pybind } // namespace pybind
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/recordio.h"
#include <fstream>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/writer.h"
namespace paddle {
namespace pybind {
class RecordIOWriter {
public:
RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
size_t max_num_record)
: stream_(filename), writer_(&stream_, compressor, max_num_record) {}
void AppendTensor(const framework::LoDTensor& tensor) {
tensors_.push_back(tensor);
}
void CompleteAppendTensor() {
auto& ctx =
*platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
framework::WriteToRecordIO(writer_, tensors_, ctx);
tensors_.clear();
}
void Close() {
PADDLE_ENFORCE(tensors_.empty());
writer_.Flush();
stream_.close();
}
private:
std::vector<framework::LoDTensor> tensors_;
std::ofstream stream_;
recordio::Writer writer_;
};
void BindRecordIOWriter(py::module& m) {
py::class_<RecordIOWriter> writer(m, "RecordIOWriter", "");
py::enum_<recordio::Compressor>(writer, "Compressor", "")
.value("Snappy", recordio::Compressor::kSnappy)
.value("NoCompress", recordio::Compressor::kNoCompress);
writer
.def("__init__",
[](RecordIOWriter& self, const std::string& filename,
recordio::Compressor compressor, size_t max_num_record) {
new (&self) RecordIOWriter(filename, compressor, max_num_record);
})
.def("append_tensor", &RecordIOWriter::AppendTensor)
.def("complete_append_tensor", &RecordIOWriter::CompleteAppendTensor)
.def("close", &RecordIOWriter::Close);
}
} // namespace pybind
} // namespace paddle
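Note (a sketch, not part of this patch): the binding above batches tensors in memory and only serializes them when complete_append_tensor is called. The equivalent C++-side call, sketched under the assumption that WriteToRecordIO keeps the signature used above; WriteBatch is hypothetical:

#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/recordio/writer.h"

void WriteBatch(const std::vector<paddle::framework::LoDTensor>& batch,
                paddle::recordio::Writer* writer,
                const paddle::platform::DeviceContext& ctx) {
  // A single WriteToRecordIO call turns the whole batch into one record,
  // mirroring RecordIOWriter::CompleteAppendTensor above.
  paddle::framework::WriteToRecordIO(*writer, batch, ctx);
}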
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
extern void BindRecordIOWriter(py::module& m);
} // namespace pybind
} // namespace paddle
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -71,27 +72,39 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -71,27 +72,39 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
paddle::platform::GpuMemcpyAsync( paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream()); cudaMemcpyDeviceToHost, dev_ctx->stream());
dev_ctx->Wait();
#else #else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif #endif
} else if (paddle::platform::is_cpu_place(tensor.place())) { } else if (paddle::platform::is_cpu_place(tensor.place())) {
dst_tensor = tensor; dst_tensor = tensor;
} }
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
py::format_descriptor<CUR_TYPE>::format(), if (std::type_index(typeid(CUR_TYPE)) ==
(size_t)framework::arity(dst_tensor.dims()), std::type_index(typeid(platform::float16))) {
dims_outside, strides); return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
"e", /* np.dtype('e') == np.float16 */
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
} else {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
py::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
}
} else { } else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value; constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor); return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
} }
} }
}; };
} // namespace details } // namespace details
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) { inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
auto buffer_info = auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()( details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
tensor); platform::float16>()(tensor);
return buffer_info; return buffer_info;
} }
...@@ -136,6 +149,22 @@ void PyCPUTensorSetFromArray( ...@@ -136,6 +149,22 @@ void PyCPUTensorSetFromArray(
std::memcpy(dst, array.data(), sizeof(T) * array.size()); std::memcpy(dst, array.data(), sizeof(T) * array.size());
} }
template <>
void PyCPUTensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CPUPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
...@@ -157,6 +186,28 @@ void PyCUDATensorSetFromArray( ...@@ -157,6 +186,28 @@ void PyCUDATensorSetFromArray(
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
} }
template <>
void PyCUDATensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CUDAPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(),
sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
}
#endif #endif
} // namespace pybind } // namespace pybind
......
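Note (a sketch, not part of this patch): numpy exposes no C++-visible half type, so the bindings above accept float16 arrays as uint16 (numpy dtype 'e' has the same 2-byte layout) and reinterpret the bits into platform::float16 storage. A standalone illustration of that carrier-type trick; FromUint16 is hypothetical:

#include <cstdint>
#include <cstring>
#include <vector>
#include "paddle/fluid/platform/float16.h"

std::vector<paddle::platform::float16> FromUint16(
    const std::vector<uint16_t>& raw) {
  static_assert(sizeof(paddle::platform::float16) == sizeof(uint16_t),
                "float16 must be 2 bytes for the bitwise copy to be valid");
  std::vector<paddle::platform::float16> out(raw.size());
  // Bytes are copied verbatim: 0x3C00 becomes float16(1.0), and so on.
  std::memcpy(out.data(), raw.data(), raw.size() * sizeof(uint16_t));
  return out;
}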
...@@ -3,4 +3,7 @@ cc_library(header SRCS header.cc) ...@@ -3,4 +3,7 @@ cc_library(header SRCS header.cc)
cc_test(header_test SRCS header_test.cc DEPS header) cc_test(header_test SRCS header_test.cc DEPS header)
cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib) cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk) cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
cc_library(recordio DEPS chunk header) cc_library(writer SRCS writer.cc DEPS chunk)
cc_library(scanner SRCS scanner.cc DEPS chunk)
cc_test(writer_scanner_test SRCS writer_scanner_test.cc DEPS writer scanner)
cc_library(recordio DEPS chunk header writer scanner)
...@@ -24,33 +24,52 @@ namespace paddle { ...@@ -24,33 +24,52 @@ namespace paddle {
namespace recordio { namespace recordio {
constexpr size_t kMaxBufSize = 1024; constexpr size_t kMaxBufSize = 1024;
/**
* Read Stream by a fixed sized buffer.
* @param in input stream
* @param limit read at most `limit` bytes from input stream. 0 means no limit
* @param callback A function object with (const char* buf, size_t size) -> void
* as its type.
*/
template <typename Callback> template <typename Callback>
static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) { static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
char buf[kMaxBufSize]; char buf[kMaxBufSize];
std::streamsize actual_size; std::streamsize actual_size;
size_t counter = 0; size_t counter = 0;
do { size_t actual_max;
auto actual_max = while (!in.eof() ||
limit > 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize; (limit != 0 && counter >= limit)) { // End of file or reach limit
actual_size = in.readsome(buf, actual_max); actual_max =
limit != 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize;
in.read(buf, actual_max);
actual_size = in.gcount();
if (actual_size == 0) { if (actual_size == 0) {
break; break;
} }
callback(buf, actual_size); callback(buf, actual_size);
if (limit > 0) { if (limit != 0) {
counter += actual_size; counter += actual_size;
} }
} while (actual_size == kMaxBufSize); }
in.clear(); // unset eof state
} }
/**
* Copy stream in to another stream
*/
static void PipeStream(std::istream& in, std::ostream& os) { static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf( ReadStreamByBuf(
in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); }); in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); });
} }
static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
auto crc = crc32(0, nullptr, 0); /**
* Calculate CRC32 from an input stream.
*/
static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) { ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = crc32(crc, reinterpret_cast<const Bytef*>(buf), len); crc = static_cast<uint32_t>(crc32(
crc, reinterpret_cast<const Bytef*>(buf), static_cast<uInt>(len)));
}); });
return crc; return crc;
} }
...@@ -85,28 +104,29 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const { ...@@ -85,28 +104,29 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const {
compressed_stream.reset(); compressed_stream.reset();
} }
auto end_pos = sout.tellg(); sout.seekg(0, std::ios::end);
uint32_t len = static_cast<uint32_t>(sout.tellg());
sout.seekg(0, std::ios::beg); sout.seekg(0, std::ios::beg);
uint32_t len = static_cast<uint32_t>(end_pos - sout.tellg());
uint32_t crc = Crc32Stream(sout); uint32_t crc = Crc32Stream(sout);
sout.seekg(0, std::ios::beg);
Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len); Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
hdr.Write(os); hdr.Write(os);
sout.seekg(0, std::ios::beg);
sout.clear();
PipeStream(sout, os); PipeStream(sout, os);
return true; return true;
} }
void Chunk::Parse(std::istream& sin) { bool Chunk::Parse(std::istream& sin) {
Header hdr; Header hdr;
hdr.Parse(sin); bool ok = hdr.Parse(sin);
if (!ok) {
return ok;
}
auto beg_pos = sin.tellg(); auto beg_pos = sin.tellg();
auto crc = Crc32Stream(sin, hdr.CompressSize()); uint32_t crc = Crc32Stream(sin, hdr.CompressSize());
PADDLE_ENFORCE_EQ(hdr.Checksum(), crc); PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
Clear(); Clear();
sin.seekg(beg_pos, sin.beg);
sin.seekg(beg_pos, std::ios::beg);
std::unique_ptr<std::istream> compressed_stream; std::unique_ptr<std::istream> compressed_stream;
switch (hdr.CompressType()) { switch (hdr.CompressType()) {
case Compressor::kNoCompress: case Compressor::kNoCompress:
...@@ -126,8 +146,10 @@ void Chunk::Parse(std::istream& sin) { ...@@ -126,8 +146,10 @@ void Chunk::Parse(std::istream& sin) {
std::string buf; std::string buf;
buf.resize(rec_len); buf.resize(rec_len);
stream.read(&buf[0], rec_len); stream.read(&buf[0], rec_len);
PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
Add(buf); Add(buf);
} }
return true;
} }
} // namespace recordio } // namespace recordio
......
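Note (a sketch, not part of this patch): the checksum convention that Chunk::Write and Chunk::Parse agree on is a CRC32 over the (possibly compressed) record payload only, excluding the header. A standalone version of that computation; PayloadCrc32 is hypothetical and requires zlib:

#include <cstdint>
#include <string>
#include <zlib.h>

uint32_t PayloadCrc32(const std::string& payload) {
  // Seed with the zlib initial value, then fold in the payload bytes,
  // exactly as Crc32Stream does buffer by buffer.
  uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
  return static_cast<uint32_t>(
      crc32(crc, reinterpret_cast<const Bytef*>(payload.data()),
            static_cast<uInt>(payload.size())));
}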
...@@ -26,9 +26,9 @@ namespace recordio { ...@@ -26,9 +26,9 @@ namespace recordio {
class Chunk { class Chunk {
public: public:
Chunk() : num_bytes_(0) {} Chunk() : num_bytes_(0) {}
void Add(std::string buf) { void Add(const std::string& buf) {
records_.push_back(buf);
num_bytes_ += buf.size(); num_bytes_ += buf.size();
records_.emplace_back(buf);
} }
// dump the chunk into w, and clears the chunk and makes it ready for // dump the chunk into w, and clears the chunk and makes it ready for
// the next add invocation. // the next add invocation.
...@@ -37,10 +37,15 @@ public: ...@@ -37,10 +37,15 @@ public:
records_.clear(); records_.clear();
num_bytes_ = 0; num_bytes_ = 0;
} }
void Parse(std::istream& sin);
size_t NumBytes() { return num_bytes_; } // returns true if ok, false if eof
bool Parse(std::istream& sin);
size_t NumBytes() const { return num_bytes_; }
size_t NumRecords() const { return records_.size(); }
const std::string& Record(int i) const { return records_[i]; } const std::string& Record(int i) const { return records_[i]; }
bool Empty() const { return records_.empty(); }
private: private:
std::vector<std::string> records_; std::vector<std::string> records_;
// sum of record lengths in bytes. // sum of record lengths in bytes.
......
...@@ -26,7 +26,7 @@ TEST(Chunk, SaveLoad) { ...@@ -26,7 +26,7 @@ TEST(Chunk, SaveLoad) {
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
std::stringstream ss; std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress); ch.Write(ss, Compressor::kNoCompress);
ch.Clear(); ss.seekg(0);
ch.Parse(ss); ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U); ASSERT_EQ(ch.NumBytes(), 10U);
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/recordio/header.h" #include "paddle/fluid/recordio/header.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
...@@ -26,23 +27,33 @@ Header::Header() ...@@ -26,23 +27,33 @@ Header::Header()
Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs) Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
: num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {} : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {}
void Header::Parse(std::istream& is) { bool Header::Parse(std::istream& is) {
uint32_t magic;
size_t read_size =
is.readsome(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
if (read_size < sizeof(uint32_t)) {
return false;
}
PADDLE_ENFORCE_EQ(magic, kMagicNumber);
is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t)) is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t)) .read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t)) .read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t)); .read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t));
return true;
} }
void Header::Write(std::ostream& os) const { void Header::Write(std::ostream& os) const {
os.write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t)) os.write(reinterpret_cast<const char*>(&kMagicNumber), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t)) .write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t)) .write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t)); .write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t));
} }
std::ostream& operator<<(std::ostream& os, Header h) { std::ostream& operator<<(std::ostream& os, Header h) {
os << h.NumRecords() << h.Checksum() os << "Header: " << h.NumRecords() << ", " << h.Checksum() << ", "
<< static_cast<uint32_t>(h.CompressType()) << h.CompressSize(); << static_cast<uint32_t>(h.CompressType()) << ", " << h.CompressSize();
return os; return os;
} }
......
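Note (a sketch, not part of this patch): after this change every chunk starts with a magic number, which is what lets Parse return false cleanly at end of file. The resulting on-disk header is five 32-bit fields written in host byte order; the struct below only documents that layout and is not a type defined in the code:

#include <cstdint>

struct RawChunkHeader {    // 20 bytes, written field by field by Header::Write
  uint32_t magic;          // kMagicNumber (0x01020304), detects EOF/corruption
  uint32_t num_records;    // number of records stored in the chunk
  uint32_t checksum;       // CRC32 of the (compressed) payload that follows
  uint32_t compressor;     // Compressor enum value
  uint32_t compress_size;  // payload size in bytes
};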
...@@ -19,8 +19,6 @@ ...@@ -19,8 +19,6 @@
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
// Default ChunkSize
constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024;
// MagicNumber for memory checking // MagicNumber for memory checking
constexpr uint32_t kMagicNumber = 0x01020304; constexpr uint32_t kMagicNumber = 0x01020304;
...@@ -44,7 +42,9 @@ public: ...@@ -44,7 +42,9 @@ public:
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs); Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
void Write(std::ostream& os) const; void Write(std::ostream& os) const;
void Parse(std::istream& is);
// returns true if OK, false if eof
bool Parse(std::istream& is);
uint32_t NumRecords() const { return num_records_; } uint32_t NumRecords() const { return num_records_; }
uint32_t Checksum() const { return checksum_; } uint32_t Checksum() const { return checksum_; }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace recordio {
Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
: stream_(std::move(stream)) {
Reset();
}
Scanner::Scanner(const std::string &filename) {
stream_.reset(new std::ifstream(filename));
Reset();
}
void Scanner::Reset() {
stream_->seekg(0, std::ios::beg);
ParseNextChunk();
}
std::string Scanner::Next() {
PADDLE_ENFORCE(!eof_, "StopIteration");
auto rec = cur_chunk_.Record(offset_++);
if (offset_ == cur_chunk_.NumRecords()) {
ParseNextChunk();
}
return rec;
}
void Scanner::ParseNextChunk() {
eof_ = !cur_chunk_.Parse(*stream_);
offset_ = 0;
}
bool Scanner::HasNext() const { return !eof_; }
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <memory>
#include "paddle/fluid/recordio/chunk.h"
namespace paddle {
namespace recordio {
class Scanner {
public:
explicit Scanner(std::unique_ptr<std::istream>&& stream);
explicit Scanner(const std::string& filename);
void Reset();
std::string Next();
bool HasNext() const;
private:
std::unique_ptr<std::istream> stream_;
Chunk cur_chunk_;
size_t offset_;
bool eof_;
void ParseNextChunk();
};
} // namespace recordio
} // namespace paddle
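Note (a sketch, not part of this patch): a typical consumer iterates until HasNext() turns false, which happens once ParseNextChunk hits the end of the stream. A minimal usage example; "data.recordio" is a placeholder path:

#include <iostream>
#include <string>
#include "paddle/fluid/recordio/scanner.h"

int main() {
  paddle::recordio::Scanner scanner("data.recordio");  // placeholder file name
  while (scanner.HasNext()) {
    std::string record = scanner.Next();
    std::cout << "record of " << record.size() << " bytes\n";
  }
  return 0;
}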
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/writer.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace recordio {
void Writer::Write(const std::string& record) {
cur_chunk_.Add(record);
if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
Flush();
}
}
void Writer::Flush() {
cur_chunk_.Write(stream_, compressor_);
cur_chunk_.Clear();
}
Writer::~Writer() {
  PADDLE_ENFORCE(cur_chunk_.Empty(), "Writer must be flushed before being destroyed.");
}
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/recordio/chunk.h"
namespace paddle {
namespace recordio {
class Writer {
public:
Writer(std::ostream* sout,
Compressor compressor,
size_t max_num_records_in_chunk = 1000)
: stream_(*sout),
max_num_records_in_chunk_(max_num_records_in_chunk),
compressor_(compressor) {}
void Write(const std::string& record);
void Flush();
~Writer();
private:
std::ostream& stream_;
size_t max_num_records_in_chunk_;
Chunk cur_chunk_;
Compressor compressor_;
};
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <sstream>
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
TEST(WriterScanner, Normal) {
std::stringstream* stream = new std::stringstream();
{
paddle::recordio::Writer writer(stream,
paddle::recordio::Compressor::kSnappy);
writer.Write("ABC");
writer.Write("BCD");
writer.Write("CDE");
writer.Flush();
}
{
stream->seekg(0, std::ios::beg);
std::unique_ptr<std::istream> stream_ptr(stream);
paddle::recordio::Scanner scanner(std::move(stream_ptr));
ASSERT_TRUE(scanner.HasNext());
ASSERT_EQ(scanner.Next(), "ABC");
ASSERT_EQ("BCD", scanner.Next());
ASSERT_TRUE(scanner.HasNext());
ASSERT_EQ("CDE", scanner.Next());
ASSERT_FALSE(scanner.HasNext());
}
}
TEST(WriterScanner, TinyChunk) {
std::stringstream* stream = new std::stringstream();
{
paddle::recordio::Writer writer(
stream, paddle::recordio::Compressor::kNoCompress, 2 /*max chunk num*/);
writer.Write("ABC");
writer.Write("BCD");
writer.Write("CDE");
writer.Write("DEFG");
writer.Flush();
}
{
stream->seekg(0, std::ios::beg);
std::unique_ptr<std::istream> stream_ptr(stream);
paddle::recordio::Scanner scanner(std::move(stream_ptr));
ASSERT_TRUE(scanner.HasNext());
ASSERT_EQ(scanner.Next(), "ABC");
ASSERT_EQ(scanner.Next(), "BCD");
ASSERT_EQ(scanner.Next(), "CDE");
ASSERT_EQ(scanner.Next(), "DEFG");
ASSERT_FALSE(scanner.HasNext());
}
}
\ No newline at end of file
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py) file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py) file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
set(PY_FILES paddle/__init__.py set(PY_FILES paddle/__init__.py
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
${UTILS_PY_FILES} ${UTILS_PY_FILES}
${V2_PY_FILES}
${FLUID_PY_FILES}) ${FLUID_PY_FILES})
add_custom_target(copy_paddle_master) if(NOT WITH_FLUID)
file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
set(PY_FILES ${PY_FILES}
${TRAINER_PY_FILES}
${HELPERS_PY_FILES}
${V2_PY_FILES})
SET(COPY_PADDLE_MASTER "") add_custom_target(copy_paddle_master)
if(WITH_GOLANG)
SET(COPY_PADDLE_MASTER "copy_paddle_master") SET(COPY_PADDLE_MASTER "")
add_custom_command(TARGET ${COPY_PADDLE_MASTER} if(WITH_GOLANG)
COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/ SET(COPY_PADDLE_MASTER "copy_paddle_master")
) add_custom_command(TARGET ${COPY_PADDLE_MASTER}
add_dependencies(copy_paddle_master paddle_master) COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
endif(WITH_GOLANG) )
add_dependencies(copy_paddle_master paddle_master)
endif(WITH_GOLANG)
endif()
set(MKL_SHARED_LIBS "") set(MKL_SHARED_LIBS "")
set(MKL_DEPENDS "") set(MKL_DEPENDS "")
...@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ...@@ -59,23 +61,28 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
if(WITH_SWIG_PY) if(NOT WITH_FLUID)
list(APPEND paddle_python_deps python_api_wheel) set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
if(WITH_SWIG_PY)
list(APPEND paddle_python_deps python_api_wheel)
endif()
endif() endif()
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
if (WITH_TESTING) if (WITH_TESTING)
add_subdirectory(paddle/trainer_config_helpers/tests) if(NOT WITH_FLUID)
if (WITH_SWIG_PY) add_subdirectory(paddle/trainer_config_helpers/tests)
# enable v2 API unittest only when paddle swig api is compiled if (WITH_SWIG_PY)
add_subdirectory(paddle/v2/tests) # enable v2 API unittest only when paddle swig api is compiled
add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/plot/tests) add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/fluid/tests) add_subdirectory(paddle/v2/plot/tests)
endif()
endif() endif()
add_subdirectory(paddle/fluid/tests)
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
DESTINATION opt/paddle/share/wheels DESTINATION opt/paddle/share/wheels
......
...@@ -35,11 +35,12 @@ from core import LoDTensor, CPUPlace, CUDAPlace ...@@ -35,11 +35,12 @@ from core import LoDTensor, CPUPlace, CUDAPlace
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close) channel_close, Select)
import clip import clip
from memory_optimization_transpiler import memory_optimize from memory_optimization_transpiler import memory_optimize, release_memory
import profiler import profiler
import unique_name import unique_name
import recordio_writer
Tensor = LoDTensor Tensor = LoDTensor
...@@ -63,8 +64,10 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ ...@@ -63,8 +64,10 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'SimpleDistributeTranspiler', 'SimpleDistributeTranspiler',
'DistributeTranspiler', 'DistributeTranspiler',
'memory_optimize', 'memory_optimize',
'release_memory',
'profiler', 'profiler',
'unique_name', 'unique_name',
'recordio_writer',
] ]
......
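The block above only changes what the fluid package re-exports. A quick, hedged smoke check of the newly reachable names (nothing here goes beyond what the import list itself shows):

    import paddle.fluid as fluid

    # names made reachable at the fluid namespace level by this change
    assert hasattr(fluid, 'Select')            # concurrency select construct
    assert callable(fluid.release_memory)      # eager memory release transpiler
    assert callable(fluid.recordio_writer.convert_reader_to_recordio_file)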
...@@ -248,12 +248,15 @@ def _callback_lookup_(op): ...@@ -248,12 +248,15 @@ def _callback_lookup_(op):
if o_argu in self.param_grad_names: if o_argu in self.param_grad_names:
allreduce_out_name = o_argu + "__nccl_all_reduce__" allreduce_out_name = o_argu + "__nccl_all_reduce__"
op_desc = _create_op_desc_( op_desc = _create_op_desc_(
"ncclAllReduce", { "ncclReduce",
{
"X": [o_argu], "X": [o_argu],
"Communicator": "Communicator":
['nccl_com__do_not_change_'] ['nccl_com__do_not_change_']
}, {"Out": [allreduce_out_name]}, },
{"reduction": "ncclSum"}) {"Out": [allreduce_out_name]},
{"reduction": "ncclSum",
"root": 0}, )
block.desc.append_op().copy_from(op_desc) block.desc.append_op().copy_from(op_desc)
op_desc = _create_op_desc_( op_desc = _create_op_desc_(
...@@ -457,7 +460,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, ...@@ -457,7 +460,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
"Out": [_append_grad_suffix_(loss.name)] "Out": [_append_grad_suffix_(loss.name)]
}, {"shape": [1], }, {"shape": [1],
"value": 1.0, "value": 1.0,
"dtype": loss.dtype}) "dtype": loss.dtype,
"force_cpu": False})
root_block.desc.append_op().copy_from(op_desc) root_block.desc.append_op().copy_from(op_desc)
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
......
...@@ -12,17 +12,14 @@ ...@@ -12,17 +12,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from layers.control_flow import BlockGuard from layers.control_flow import BlockGuard, Select
from layer_helper import LayerHelper, unique_name from layer_helper import LayerHelper, unique_name
from layers import fill_constant from layers import fill_constant
import core import core
__all__ = [ __all__ = [
'Go', 'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
'make_channel', 'Select'
'channel_send',
'channel_recv',
'channel_close',
] ]
...@@ -198,7 +195,7 @@ def channel_recv(channel, return_value): ...@@ -198,7 +195,7 @@ def channel_recv(channel, return_value):
ch = fluid.make_channel(dtype='int32', capacity=10) ch = fluid.make_channel(dtype='int32', capacity=10)
with fluid.Go(): with fluid.Go():
returned_value = fluid.channel_recv(ch, 'int32') returned_value, return_status = fluid.channel_recv(ch, 'int32')
# Code to send data through the channel. # Code to send data through the channel.
""" """
......
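channel_recv now hands back a (value, status) pair, so callers can tell whether the receive actually completed. A minimal sketch assembled from the ping-pong unit test further down in this change; variable names are illustrative and the program is only constructed, not run:

    import paddle.fluid as fluid
    import paddle.fluid.core as core
    from paddle.fluid import framework
    from paddle.fluid.layers import fill_constant

    with framework.program_guard(framework.Program()):
        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
        result = framework.default_main_program().current_block().create_var(
            name='recv_result', type=core.VarDesc.VarType.LOD_TENSOR,
            dtype=core.VarDesc.VarType.FP64)
        value = fill_constant(shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
        fluid.channel_send(ch, value)
        # the added second return value reports whether the receive succeeded
        result, status = fluid.channel_recv(ch, result)
        fluid.channel_close(ch)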
...@@ -16,6 +16,7 @@ import sys ...@@ -16,6 +16,7 @@ import sys
import re import re
from graphviz import GraphPreviewGenerator from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2 import proto.framework_pb2 as framework_pb2
import paddle.fluid.core as core
_vartype2str_ = [ _vartype2str_ = [
"UNK", "UNK",
...@@ -52,9 +53,11 @@ reprtpl = "{ttype} {name} ({reprs})" ...@@ -52,9 +53,11 @@ reprtpl = "{ttype} {name} ({reprs})"
def repr_lodtensor(proto): def repr_lodtensor(proto):
if not proto.lod_tensor: return if proto.type.type != framework_pb2.VarType.LOD_TENSOR:
level = proto.lod_tensor.lod_level return
reprs = repr_tensor(proto.lod_tensor.tensor)
level = proto.type.lod_tensor.lod_level
reprs = repr_tensor(proto.type.lod_tensor.tensor)
return reprtpl.format( return reprtpl.format(
ttype="LoDTensor" if level > 0 else "Tensor", ttype="LoDTensor" if level > 0 else "Tensor",
name=proto.name, name=proto.name,
...@@ -62,20 +65,24 @@ def repr_lodtensor(proto): ...@@ -62,20 +65,24 @@ def repr_lodtensor(proto):
def repr_selected_rows(proto): def repr_selected_rows(proto):
if not proto.selected_rows: return if proto.type.type != framework_pb2.VarType.SELECTED_ROWS:
return
return reprtpl.format( return reprtpl.format(
ttype="SelectedRows", ttype="SelectedRows",
name=proto.name, name=proto.name,
reprs=repr_tensor(proto.selected_rows)) reprs=repr_tensor(proto.type.selected_rows))
def repr_tensor_array(proto): def repr_tensor_array(proto):
if not proto.tensor_array: return if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY:
return
return reprtpl.format( return reprtpl.format(
ttype="TensorArray", ttype="TensorArray",
name=proto.name, name=proto.name,
reprs="level=%d, %s" % (proto.tensor_array.lod_level, reprs="level=%d, %s" % (proto.type.tensor_array.lod_level,
repr_tensor(proto.lod_tensor))) repr_tensor(proto.type.lod_tensor.tensor)))
type_handlers = [ type_handlers = [
...@@ -119,6 +126,7 @@ def pprint_block_codes(block_desc, show_backward=False): ...@@ -119,6 +126,7 @@ def pprint_block_codes(block_desc, show_backward=False):
def is_var_backward(var_desc): def is_var_backward(var_desc):
return "@GRAD" in var_desc.name return "@GRAD" in var_desc.name
#print(type(block_desc))
if type(block_desc) is not framework_pb2.BlockDesc: if type(block_desc) is not framework_pb2.BlockDesc:
block_desc = framework_pb2.BlockDesc.FromString( block_desc = framework_pb2.BlockDesc.FromString(
block_desc.serialize_to_string()) block_desc.serialize_to_string())
......
...@@ -250,6 +250,8 @@ class DistributeTranspiler: ...@@ -250,6 +250,8 @@ class DistributeTranspiler:
def get_trainer_program(self): def get_trainer_program(self):
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
self.program.global_block().delete_ops(self.optimize_ops) self.program.global_block().delete_ops(self.optimize_ops)
        # FIXME(typhoonzero): serializing the program once fixes an error that occurs when cloning.
self.program.__str__()
return self.program return self.program
def get_pserver_program(self, endpoint): def get_pserver_program(self, endpoint):
...@@ -309,7 +311,8 @@ class DistributeTranspiler: ...@@ -309,7 +311,8 @@ class DistributeTranspiler:
for _, opt_op in enumerate(opt_op_on_pserver): for _, opt_op in enumerate(opt_op_on_pserver):
if ufind.is_connected(op, opt_op): if ufind.is_connected(op, opt_op):
if self._is_opt_op(op): if self._is_opt_op(op):
self._append_pserver_ops(optimize_block, op, endpoint) self._append_pserver_ops(optimize_block, op, endpoint,
default_main_program())
else: else:
self._append_pserver_non_opt_ops(optimize_block, op) self._append_pserver_non_opt_ops(optimize_block, op)
break break
...@@ -520,7 +523,8 @@ class DistributeTranspiler: ...@@ -520,7 +523,8 @@ class DistributeTranspiler:
orig_var_name = varname[:suff_idx] orig_var_name = varname[:suff_idx]
return orig_var_name return orig_var_name
def _append_pserver_ops(self, optimize_block, opt_op, endpoint): def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
origin_program):
program = optimize_block.program program = optimize_block.program
pserver_block = program.global_block() pserver_block = program.global_block()
new_inputs = dict() new_inputs = dict()
...@@ -576,7 +580,17 @@ class DistributeTranspiler: ...@@ -576,7 +580,17 @@ class DistributeTranspiler:
elif key == "LearningRate": elif key == "LearningRate":
# learning rate variable has already been created by non-optimize op, # learning rate variable has already been created by non-optimize op,
# don't create it once again. # don't create it once again.
new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]] lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname):
new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
else:
origin_var = origin_program.global_block().vars[lr_varname]
tmpvar = pserver_block.create_var(
name=origin_var.name,
persistable=origin_var.persistable,
dtype=origin_var.dtype,
shape=origin_var.shape)
new_inputs[key] = tmpvar
for key in opt_op.input_names: for key in opt_op.input_names:
new_shape = None new_shape = None
......
...@@ -487,7 +487,7 @@ class Operator(object): ...@@ -487,7 +487,7 @@ class Operator(object):
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'load_combine', 'ncclInit', 'channel_create', 'channel_close',
'channel_send', 'channel_recv' 'channel_send', 'channel_recv', 'select'
} }
if type not in no_kernel_op_set: if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
......
...@@ -47,7 +47,7 @@ def is_parameter(var): ...@@ -47,7 +47,7 @@ def is_parameter(var):
def is_persistable(var): def is_persistable(var):
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST: var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
return False return False
return var.persistable return var.persistable
......
...@@ -16,7 +16,7 @@ import contextlib ...@@ -16,7 +16,7 @@ import contextlib
from layer_function_generator import autodoc from layer_function_generator import autodoc
from tensor import assign, fill_constant from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator from ..framework import Program, Variable, Operator, Block
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
...@@ -29,6 +29,7 @@ __all__ = [ ...@@ -29,6 +29,7 @@ __all__ = [
'WhileGuard', 'WhileGuard',
'While', 'While',
'Switch', 'Switch',
'Select',
'lod_rank_table', 'lod_rank_table',
'max_sequence_len', 'max_sequence_len',
'topk', 'topk',
...@@ -1211,6 +1212,186 @@ class Switch(object): ...@@ -1211,6 +1212,186 @@ class Switch(object):
return True return True
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else (self.DEFAULT)
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
# TODO(varunarora): Figure out if defining output is needed.
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
class IfElseBlockGuard(object): class IfElseBlockGuard(object):
def __init__(self, is_true, ifelse): def __init__(self, is_true, ifelse):
if not isinstance(ifelse, IfElse): if not isinstance(ifelse, IfElse):
......
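Select builds one conditional_block per case and then emits a single select op whose 'cases' attribute records which channel action each branch waits on. A minimal sketch of the intended front-end usage, mirroring test_select further down in this change (values are illustrative; an Executor run would actually drive the op):

    import paddle.fluid as fluid
    import paddle.fluid.core as core
    from paddle.fluid.layers import fill_constant

    ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
    value = fill_constant(shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)

    with fluid.Select() as select:
        with select.case(fluid.channel_send, ch, value):
            pass  # body executed when the send case is chosen
        with select.default():
            pass  # fallback when no channel operation is ready

    fluid.channel_close(ch)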
...@@ -129,8 +129,13 @@ def detection_output(loc, ...@@ -129,8 +129,13 @@ def detection_output(loc,
target_box=loc, target_box=loc,
code_type='decode_center_size') code_type='decode_center_size')
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) old_shape = scores.shape
scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
scores = nn.softmax(input=scores)
scores = ops.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
inputs={'Scores': scores, inputs={'Scores': scores,
...@@ -561,16 +566,16 @@ def multi_box_head(inputs, ...@@ -561,16 +566,16 @@ def multi_box_head(inputs,
base_size, base_size,
num_classes, num_classes,
aspect_ratios, aspect_ratios,
min_ratio, min_ratio=None,
max_ratio, max_ratio=None,
min_sizes=None, min_sizes=None,
max_sizes=None, max_sizes=None,
steps=None, steps=None,
step_w=None, step_w=None,
step_h=None, step_h=None,
offset=0.5, offset=0.5,
variance=[0.1, 0.1, 0.1, 0.1], variance=[0.1, 0.1, 0.2, 0.2],
flip=False, flip=True,
clip=False, clip=False,
kernel_size=1, kernel_size=1,
pad=0, pad=0,
...@@ -613,7 +618,7 @@ def multi_box_head(inputs, ...@@ -613,7 +618,7 @@ def multi_box_head(inputs,
the inputs[i] will be automatically calculated. Default: None. the inputs[i] will be automatically calculated. Default: None.
offset(float): Prior boxes center offset. Default: 0.5 offset(float): Prior boxes center offset. Default: 0.5
variance(list|tuple): the variances to be encoded in prior boxes. variance(list|tuple): the variances to be encoded in prior boxes.
Default:[0.1, 0.1, 0.1, 0.1]. Default:[0.1, 0.1, 0.2, 0.2].
flip(bool): Whether to flip aspect ratios. Default:False. flip(bool): Whether to flip aspect ratios. Default: True.
clip(bool): Whether to clip out-of-boundary boxes. Default: False. clip(bool): Whether to clip out-of-boundary boxes. Default: False.
kernel_size(int): The kernel size of conv2d. Default: 1. kernel_size(int): The kernel size of conv2d. Default: 1.
...@@ -667,6 +672,19 @@ def multi_box_head(inputs, ...@@ -667,6 +672,19 @@ def multi_box_head(inputs,
helper = LayerHelper("prior_box", **locals()) helper = LayerHelper("prior_box", **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
attrs = {
'min_sizes': min_sizes,
'aspect_ratios': aspect_ratios,
'variances': variance,
'flip': flip,
'clip': clip,
'step_w': step_w,
'step_h': step_h,
'offset': offset
}
if len(max_sizes) > 0 and max_sizes[0] > 0:
attrs['max_sizes'] = max_sizes
box = helper.create_tmp_variable(dtype) box = helper.create_tmp_variable(dtype)
var = helper.create_tmp_variable(dtype) var = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
...@@ -675,17 +693,7 @@ def multi_box_head(inputs, ...@@ -675,17 +693,7 @@ def multi_box_head(inputs,
"Image": image}, "Image": image},
outputs={"Boxes": box, outputs={"Boxes": box,
"Variances": var}, "Variances": var},
attrs={ attrs=attrs, )
'min_sizes': min_sizes,
'max_sizes': max_sizes,
'aspect_ratios': aspect_ratios,
'variances': variance,
'flip': flip,
'clip': clip,
'step_w': step_w,
'step_h': step_h,
'offset': offset
})
return box, var return box, var
def _reshape_with_axis_(input, axis=1): def _reshape_with_axis_(input, axis=1):
...@@ -713,7 +721,7 @@ def multi_box_head(inputs, ...@@ -713,7 +721,7 @@ def multi_box_head(inputs,
if num_layer <= 2: if num_layer <= 2:
assert min_sizes is not None and max_sizes is not None assert min_sizes is not None and max_sizes is not None
assert len(min_sizes) == num_layer and len(max_sizes) == num_layer assert len(min_sizes) == num_layer and len(max_sizes) == num_layer
else: elif min_sizes is None and max_sizes is None:
min_sizes = [] min_sizes = []
max_sizes = [] max_sizes = []
step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2))) step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
...@@ -758,9 +766,6 @@ def multi_box_head(inputs, ...@@ -758,9 +766,6 @@ def multi_box_head(inputs,
min_size = [min_size] min_size = [min_size]
if not _is_list_or_tuple_(max_size): if not _is_list_or_tuple_(max_size):
max_size = [max_size] max_size = [max_size]
if not (len(max_size) == len(min_size)):
raise ValueError(
'the length of max_size and min_size should be equal.')
aspect_ratio = [] aspect_ratio = []
if aspect_ratios is not None: if aspect_ratios is not None:
...@@ -778,7 +783,7 @@ def multi_box_head(inputs, ...@@ -778,7 +783,7 @@ def multi_box_head(inputs,
num_boxes = box.shape[2] num_boxes = box.shape[2]
# get box_loc # get loc
num_loc_output = num_boxes * 4 num_loc_output = num_boxes * 4
mbox_loc = nn.conv2d( mbox_loc = nn.conv2d(
input=input, input=input,
...@@ -795,7 +800,7 @@ def multi_box_head(inputs, ...@@ -795,7 +800,7 @@ def multi_box_head(inputs,
mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
mbox_locs.append(mbox_loc_flatten) mbox_locs.append(mbox_loc_flatten)
# get conf_loc # get conf
num_conf_output = num_boxes * num_classes num_conf_output = num_boxes * num_classes
conf_loc = nn.conv2d( conf_loc = nn.conv2d(
input=input, input=input,
......
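detection_output now normalizes the confidence scores with a softmax before NMS: the scores are flattened so the class dimension is last, softmaxed, and reshaped back. A small NumPy sketch of the same reshape-softmax-reshape pattern, independent of fluid and purely illustrative:

    import numpy as np

    def softmax_over_classes(scores):
        # scores: [N, num_priors, num_classes]; normalize over the last axis
        old_shape = scores.shape
        flat = scores.reshape(-1, old_shape[-1])
        flat = np.exp(flat - flat.max(axis=1, keepdims=True))
        flat /= flat.sum(axis=1, keepdims=True)
        return flat.reshape(old_shape)

    scores = np.random.rand(2, 8, 21).astype('float32')
    probs = softmax_over_classes(scores)
    assert np.allclose(probs.sum(axis=-1), 1.0, atol=1e-5)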
...@@ -13,11 +13,16 @@ ...@@ -13,11 +13,16 @@
# limitations under the License. # limitations under the License.
from .. import core from .. import core
from ..layer_helper import LayerHelper from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program
from ..unique_name import generate as unique_name
from control_flow import BlockGuard from control_flow import BlockGuard
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..executor import global_scope
__all__ = ['data', 'BlockGuardServ', 'ListenAndServ', 'Send'] __all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
'read_file', 'create_shuffle_reader', 'create_double_buffer_reader'
]
def data(name, def data(name,
...@@ -224,3 +229,101 @@ def Recv(endpoints, get_vars): ...@@ -224,3 +229,101 @@ def Recv(endpoints, get_vars):
outputs={"Out": get_vars}, outputs={"Out": get_vars},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
def monkey_patch_reader_methods(reader):
def __get_reader__():
scope = global_scope()
var = scope.find_var(reader.name)
return var.get_reader()
def eof():
return not __get_reader__().has_next()
def reset():
return __get_reader__().reset()
reader.eof = eof
reader.reset = reset
reader.stop_gradient = True
reader.persistable = True
return reader
def _copy_reader_var_(block, var):
new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER)
new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True
return monkey_patch_reader_methods(new_var)
def open_recordio_file(filename, shapes, lod_levels, dtypes):
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
shape_concat = []
ranks = []
for shape in shapes:
shape_concat.extend(shape)
ranks.append(len(shape))
var_name = unique_name('open_recordio_file')
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op(
type='create_recordio_file_reader',
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'filename': filename,
'ranks': ranks
})
startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(),
startup_var)
def __create_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name)
startup_blk.append_op(
type=op_type,
inputs={'UnderlyingReader': reader},
outputs={'Out': [startup_var]},
attrs=attrs)
startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(),
startup_var)
def create_shuffle_reader(reader, buffer_size):
return __create_decorated_reader__('create_shuffle_reader', reader,
{'buffer_size': int(buffer_size)})
def create_double_buffer_reader(reader, place=None):
attrs = dict()
if place is not None:
attrs['place'] = str(place).upper()
return __create_decorated_reader__('create_double_buffer_reader', reader,
attrs)
def read_file(file_obj):
helper = LayerHelper('read_file')
out = [
helper.create_tmp_variable(
stop_gradient=True, dtype='float32')
for _ in range(len(file_obj.desc.shapes()))
]
helper.append_op(
type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out})
if len(out) == 1:
return out[0]
else:
return out
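open_recordio_file registers a create_recordio_file_reader op in the startup program, copies the resulting reader variable into the main program, and monkey-patches eof()/reset() onto it; read_file then yields one output variable per declared shape. A hedged wiring sketch, assuming these names are re-exported through fluid.layers like the other layers in this file (file name and shapes are placeholders):

    import paddle.fluid as fluid

    reader = fluid.layers.open_recordio_file(
        filename='./sample.recordio',        # hypothetical file written beforehand
        shapes=[[-1, 784], [-1, 1]],
        lod_levels=[0, 0],
        dtypes=['float32', 'int64'])
    reader = fluid.layers.create_shuffle_reader(reader, buffer_size=64)
    img, label = fluid.layers.read_file(reader)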
...@@ -16,10 +16,7 @@ import cStringIO ...@@ -16,10 +16,7 @@ import cStringIO
import functools import functools
import warnings import warnings
from .. import proto from ..proto import framework_pb2
framework_pb2 = proto.framework_pb2
from ..framework import OpProtoHolder, Variable from ..framework import OpProtoHolder, Variable
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
......
...@@ -39,6 +39,8 @@ __all__ = [ ...@@ -39,6 +39,8 @@ __all__ = [
'sequence_conv', 'sequence_conv',
'conv2d', 'conv2d',
'sequence_pool', 'sequence_pool',
'sequence_softmax',
'softmax',
'pool2d', 'pool2d',
'batch_norm', 'batch_norm',
'beam_search_decode', 'beam_search_decode',
...@@ -49,6 +51,7 @@ __all__ = [ ...@@ -49,6 +51,7 @@ __all__ = [
'reduce_mean', 'reduce_mean',
'reduce_max', 'reduce_max',
'reduce_min', 'reduce_min',
'reduce_prod',
'sequence_first_step', 'sequence_first_step',
'sequence_last_step', 'sequence_last_step',
'dropout', 'dropout',
...@@ -85,13 +88,12 @@ def fc(input, ...@@ -85,13 +88,12 @@ def fc(input,
**Fully Connected Layer** **Fully Connected Layer**
The fully connected layer can take multiple tensors as its inputs. It The fully connected layer can take multiple tensors as its inputs. It
creates a variable (one for each input tensor) called weights for each creates a variable called weights for each input tensor, which represents
input tensor, which represents a fully connected weight matrix from a fully connected weight matrix from each input unit to each output unit.
each input unit to each output unit. The fully connected layer The fully connected layer multiplies each input tensor with its corresponding
multiplies each input tensor with its corresponding weight to produce weight to produce an output Tensor. If multiple input tensors are given,
an output Tensor. If multiple input tensors are given, the results of the results of multiple multiplications will be summed up. If bias_attr is
multiple multiplications will be summed up. If bias_attr is not None, not None, a bias variable will be created and added to the output. Finally,
a biases variable will be created and added to the output. Finally,
if activation is not None, it will be applied to the output as well. if activation is not None, it will be applied to the output as well.
This process can be formulated as follows: This process can be formulated as follows:
...@@ -110,44 +112,27 @@ def fc(input, ...@@ -110,44 +112,27 @@ def fc(input,
* :math:`Out`: The output tensor. * :math:`Out`: The output tensor.
Args: Args:
input(Variable|list): The input tensor(s) to the fully connected layer. input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
size(int): The number of output units in the fully connected layer. the input tensor(s) is at least 2.
num_flatten_dims(int): The fc layer can accept an input tensor with more size(int): The number of output units in this layer.
than two dimensions. If this happens, the num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
multidimensional tensor will first be flattened two dimensions. If this happens, the multidimensional tensor will first be flattened
into a 2-dimensional matrix. The parameter into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
`num_flatten_dims` determines how the input tensor tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
is flattened: the first `num_flatten_dims` dimensions will be flatten to form the first dimension of the final matrix (height of
(inclusive, index starts from 1) dimensions will the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
be flatten to form the first dimension of the form the second dimension of the final matrix (width of the matrix). For example, suppose
final matrix (height of the matrix), and the rest `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
`rank(X) - num_flatten_dims` dimensions are Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
flattened to form the second dimension of the param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
final matrix (width of the matrix). For example, parameters/weights of this layer.
suppose `X` is a 6-dimensional tensor with a shape bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
[2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, of this layer. If it is set to None, no bias will be added to the output units.
the flattened matrix will have a shape act (str, default None): Activation to be applied to the output of this layer.
[2 x 3 x 4, 5 x 6] = [24, 30]. By default, name (str, default None): The name of this layer.
`num_flatten_dims` is set to 1.
param_attr(ParamAttr|list): The parameter attribute for learnable
parameters/weights of the fully connected
layer.
param_initializer(ParamAttr|list): The initializer used for the
weight/parameter. If set None,
XavierInitializer() will be used.
bias_attr(ParamAttr|list): The parameter attribute for the bias parameter
for this layer. If set None, no bias will be
added to the output units.
bias_initializer(ParamAttr|list): The initializer used for the bias.
If set None, then ConstantInitializer()
will be used.
act(str): Activation to be applied to the output of the fully connected
layer.
name(str): Name/alias of the fully connected layer.
Returns: Returns:
Variable: The output tensor variable. A tensor variable storing the transformation result.
Raises: Raises:
ValueError: If rank of the input tensor is less than 2. ValueError: If rank of the input tensor is less than 2.
...@@ -1103,6 +1088,30 @@ def sequence_conv(input, ...@@ -1103,6 +1088,30 @@ def sequence_conv(input,
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
helper = LayerHelper('sequence_softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="sequence_softmax",
inputs={"X": input},
outputs={"Out": softmax_out},
attrs={"use_cudnn": use_cudnn})
return softmax_out
def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
helper = LayerHelper('softmax', **locals())
dtype = helper.input_dtype()
softmax_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="softmax",
inputs={"X": input},
outputs={"Out": softmax_out},
attrs={"use_cudnn": use_cudnn})
return softmax_out
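Both wrappers simply forward to their ops and expose use_cudnn. A minimal usage sketch, assuming the usual fluid.layers re-export (sizes are illustrative):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[10], dtype='float32')
    hidden = fluid.layers.fc(input=x, size=5)
    probs = fluid.layers.softmax(input=hidden, use_cudnn=False)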
def conv2d(input, def conv2d(input,
num_filters, num_filters,
filter_size, filter_size,
...@@ -2203,6 +2212,53 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): ...@@ -2203,6 +2212,53 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
return out return out
def reduce_prod(input, dim=None, keep_dim=False, name=None):
"""
Computes the product of tensor elements over the given dimension.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the product is performed. If
:attr:`None`, multiply all elements of :attr:`input` and return a
Tensor variable with a single element, otherwise must be in the
range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
the dimension to reduce is :math:`rank + dim`.
keep_dim (bool|False): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true.
name(str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
Returns:
Variable: The reduced Tensor variable.
Examples:
.. code-block:: python
# x is a Tensor variable with following elements:
# [[0.2, 0.3, 0.5, 0.9]
# [0.1, 0.2, 0.6, 0.7]]
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_prod(x) # [0.0002268]
fluid.layers.reduce_prod(x, dim=0) # [0.02, 0.06, 0.3, 0.63]
fluid.layers.reduce_prod(x, dim=-1) # [0.027, 0.0084]
fluid.layers.reduce_prod(x, dim=1,
keep_dim=True) # [[0.027], [0.0084]]
"""
helper = LayerHelper('reduce_prod', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='reduce_prod',
inputs={'X': input},
outputs={'Out': out},
attrs={
'dim': dim if dim != None else 0,
'keep_dim': keep_dim,
'reduce_all': True if dim == None else False
})
return out
def split(input, num_or_sections, dim=-1, name=None): def split(input, num_or_sections, dim=-1, name=None):
""" """
Split the input tensor into multiple sub-tensors. Split the input tensor into multiple sub-tensors.
......
...@@ -58,8 +58,6 @@ __all__ = [ ...@@ -58,8 +58,6 @@ __all__ = [
'elementwise_pow', 'elementwise_pow',
'clip', 'clip',
'clip_by_norm', 'clip_by_norm',
'softmax',
'sequence_softmax',
'logical_and', 'logical_and',
'logical_or', 'logical_or',
'logical_xor', 'logical_xor',
...@@ -69,6 +67,7 @@ __all__ = [ ...@@ -69,6 +67,7 @@ __all__ = [
'gaussian_random', 'gaussian_random',
'gaussian_random_batch_size_like', 'gaussian_random_batch_size_like',
'cumsum', 'cumsum',
'scatter',
] + __activations__ ] + __activations__
for _OP in set(__all__): for _OP in set(__all__):
......
...@@ -29,7 +29,10 @@ dtype_to_size = { ...@@ -29,7 +29,10 @@ dtype_to_size = {
core.VarDesc.VarType.BOOL: 1 core.VarDesc.VarType.BOOL: 1
} }
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"] sub_block_ops = [
"while", "while_grad", "parallel_do", "parallel_do_grad",
"conditional_block", "conditional_block_grad"
]
PRINT_LOG = False PRINT_LOG = False
...@@ -122,36 +125,80 @@ class ControlFlowGraph(object): ...@@ -122,36 +125,80 @@ class ControlFlowGraph(object):
else: else:
return block_desc.find_var_recursive(str(var_name)) return block_desc.find_var_recursive(str(var_name))
def memory_optimize(self): def _check_var_validity(self, block_desc, x, is_forward):
def check_var_validity(block_desc, x, is_forward): if str(x) == "@EMPTY@":
if str(x) == "@EMPTY@": return False
return False if not self._has_var(block_desc, x, is_forward):
if not self._has_var(block_desc, x, is_forward): return False
return False if self._find_var(block_desc, x, is_forward).persistable():
if self._find_var(block_desc, x, is_forward).persistable(): return False
return False if self._find_var(block_desc, x,
if self._find_var( is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
block_desc, x, return False
is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: if x in self._skip_opt:
return False return False
if x in self._skip_opt: if not self._find_var(block_desc, x, is_forward).shape():
return False return False
if not self._find_var(block_desc, x, is_forward).shape(): return True
return False
return True
self._build_graph() def _update_skip_opt_set(self):
for i in range(self.op_size):
op = self._ops[i]
if op.type() == "fill_constant" and op.attr("force_cpu") == True:
self._skip_opt.update(op.output_arg_names())
def release_memory(self):
self._dataflow_analyze() self._dataflow_analyze()
self._update_skip_opt_set()
fwd_id = 0
bwd_id = 0
for i in range(self.op_size):
op = self._ops[i]
if op.type() in sub_block_ops:
continue
block_desc = op.block()
is_forward = i < self._forward_num
in_diff, out_diff = self._get_diff(self._live_in[i],
self._live_out[i])
can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff)
if can_optimize:
index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
delete_op = block_desc.insert_op(index)
delete_op.set_type("delete_var")
delete_op.set_input("X", can_optimize)
if is_forward:
fwd_id += 1
else:
bwd_id += 1
def memory_optimize(self, level=0):
def compare_shape(x_shape, cache_shape, opt_level):
if opt_level == 0:
return x_shape == cache_shape
if opt_level == 1:
if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
return False
x_size = abs(reduce(lambda x, y: x * y, x_shape))
cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
if x_size <= cache_size:
return True
return False
self._dataflow_analyze()
self._update_skip_opt_set()
self.pool = [] self.pool = []
for i in range(self.op_size): for i in range(self.op_size):
op = self._ops[i] op = self._ops[i]
if op.type() in sub_block_ops: if op.type() in sub_block_ops:
continue continue
block_desc = op.block() block_desc = op.block()
self.current_block_desc = block_desc
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
defs_can_optimize = filter( defs_can_optimize = filter(
lambda x: check_var_validity(block_desc, x, is_forward), lambda x: self._check_var_validity(block_desc, x, is_forward),
self._defs[i]) self._defs[i])
out_pair = [ out_pair = [
(x, self._find_var(block_desc, x, is_forward).shape()) (x, self._find_var(block_desc, x, is_forward).shape())
...@@ -164,7 +211,7 @@ class ControlFlowGraph(object): ...@@ -164,7 +211,7 @@ class ControlFlowGraph(object):
for index, cache_pair in enumerate(self.pool): for index, cache_pair in enumerate(self.pool):
cache_var = cache_pair[0] cache_var = cache_pair[0]
cache_shape = cache_pair[1] cache_shape = cache_pair[1]
if x_shape == cache_shape: if compare_shape(x_shape, cache_shape, level):
if self._has_var(block_desc, cache_var, is_forward): if self._has_var(block_desc, cache_var, is_forward):
x_dtype = self._find_var(block_desc, x, x_dtype = self._find_var(block_desc, x,
is_forward).dtype() is_forward).dtype()
...@@ -196,7 +243,7 @@ class ControlFlowGraph(object): ...@@ -196,7 +243,7 @@ class ControlFlowGraph(object):
in_diff, out_diff = self._get_diff(self._live_in[i], in_diff, out_diff = self._get_diff(self._live_in[i],
self._live_out[i]) self._live_out[i])
can_optimize = filter( can_optimize = filter(
lambda x: check_var_validity(block_desc, x, is_forward), lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff) in_diff)
if can_optimize: if can_optimize:
for var_name in can_optimize: for var_name in can_optimize:
...@@ -270,7 +317,8 @@ def _get_cfgs(input_program): ...@@ -270,7 +317,8 @@ def _get_cfgs(input_program):
([block_desc.op(i) for i in range(op_size)], op_size, set())) ([block_desc.op(i) for i in range(op_size)], op_size, set()))
sub_block_pair = [("while", "while_grad"), ("parallel_do", sub_block_pair = [("while", "while_grad"), ("parallel_do",
"parallel_do_grad")] "parallel_do_grad"),
("conditional_block", "conditional_block_grad")]
ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair)) ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
...@@ -281,9 +329,15 @@ def _get_cfgs(input_program): ...@@ -281,9 +329,15 @@ def _get_cfgs(input_program):
return cfgs return cfgs
def memory_optimize(input_program, print_log=False): def memory_optimize(input_program, print_log=False, level=0):
global PRINT_LOG global PRINT_LOG
PRINT_LOG = print_log PRINT_LOG = print_log
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize() cfg.memory_optimize(level)
def release_memory(input_program):
cfgs = _get_cfgs(input_program)
for cfg in cfgs:
cfg.release_memory()
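memory_optimize now takes a level argument (level=0 keeps the exact-shape match, level=1 only requires the cached variable to be large enough), and the new release_memory entry point inserts delete_var ops instead of reusing buffers. A hedged sketch of calling the two transpilers on a tiny program, in line with the book examples further down in this change:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1)
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program(), print_log=False, level=0)
    # or free dead variables eagerly instead of reusing them:
    # fluid.release_memory(fluid.default_main_program())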
...@@ -92,7 +92,10 @@ class Optimizer(object): ...@@ -92,7 +92,10 @@ class Optimizer(object):
# create learning rate variable for every parameter # create learning rate variable for every parameter
param = param_and_grad[0] param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] param_lr = param.optimize_attr['learning_rate']
return self.global_learning_rate() * param_lr if param_lr == 1.0:
return self.global_learning_rate()
else:
return self.global_learning_rate() * param_lr
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters """Create all accumulators needed by the parameters
...@@ -220,6 +223,8 @@ class Optimizer(object): ...@@ -220,6 +223,8 @@ class Optimizer(object):
params_grads = append_backward(loss, parameter_list, no_grad_set, params_grads = append_backward(loss, parameter_list, no_grad_set,
[error_clip_callback]) [error_clip_callback])
params_grads = sorted(params_grads, key=lambda x: x[0].name)
params_grads = append_gradient_clip_ops(params_grads) params_grads = append_gradient_clip_ops(params_grads)
# Add regularization if any # Add regularization if any
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import core
import contextlib
__all__ = ['convert_reader_to_recordio_file']
@contextlib.contextmanager
def create_recordio_writer(filename,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000):
writer = core.RecordIOWriter(filename, compressor, max_num_records)
yield writer
writer.close()
def convert_reader_to_recordio_file(
filename,
reader_creator,
feeder,
compressor=core.RecordIOWriter.Compressor.Snappy,
max_num_records=1000,
feed_order=None):
if feed_order is None:
feed_order = feeder.feed_names
counter = 0
with create_recordio_writer(filename, compressor,
max_num_records) as writer:
for batch in reader_creator():
res = feeder.feed(batch)
for each in feed_order:
writer.append_tensor(res[each])
writer.complete_append_tensor()
counter += 1
return counter
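convert_reader_to_recordio_file pushes every batch from a reader through a DataFeeder and appends the resulting tensors to a RecordIO file, returning the number of batches written. A hedged usage sketch with a tiny in-memory reader (the DataFeeder wiring and all sizes are assumptions, not part of this file):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.recordio_writer import convert_reader_to_recordio_file

    place = fluid.CPUPlace()
    img = fluid.layers.data(name='img', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    def batch_reader():
        # stand-in for a real batched reader such as paddle.batch(...)
        for _ in range(4):
            yield [(np.random.rand(784).astype('float32'), [0])
                   for _ in range(32)]

    num_batches = convert_reader_to_recordio_file(
        './sample.recordio', batch_reader, feeder)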
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import framework import framework
from . import core
__all__ = [ __all__ = [
'append_regularization_ops', 'append_regularization_ops',
...@@ -43,16 +44,20 @@ def append_regularization_ops(parameters_and_grads, regularization=None): ...@@ -43,16 +44,20 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
""" """
params_and_grads = [] params_and_grads = []
for param, grad in parameters_and_grads: for param, grad in parameters_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
params_and_grads.append((param, grad))
continue
regularization_term = None regularization_term = None
if param.regularizer is not None: if param.regularizer is not None:
# Add variable for regularization term in grad block # Add variable for regularization term in grad block
regularization_term = param.regularizer(param, grad.block) regularization_term = param.regularizer(param, grad, grad.block)
elif regularization is not None: elif regularization is not None:
regularization_term = regularization(param, grad.block) regularization_term = regularization(param, grad, grad.block)
# If no gradient or no regularization specified, # If no regularization specified, then we don't need to do anything
# then we don't need to do anything if regularization_term is None:
if grad is None or regularization_term is None:
params_and_grads.append((param, grad)) params_and_grads.append((param, grad))
continue continue
...@@ -82,7 +87,7 @@ class WeightDecayRegularizer(object): ...@@ -82,7 +87,7 @@ class WeightDecayRegularizer(object):
def __init__(self): def __init__(self):
pass pass
def __call__(self, param, block): def __call__(self, param, grad, block):
"""Add corresponding weight decay operations to the network """Add corresponding weight decay operations to the network
""" """
raise NotImplementedError() raise NotImplementedError()
...@@ -102,7 +107,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): ...@@ -102,7 +107,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
super(L2DecayRegularizer, self).__init__() super(L2DecayRegularizer, self).__init__()
self._regularization_coeff = regularization_coeff self._regularization_coeff = regularization_coeff
def __call__(self, param, block): def __call__(self, param, grad, block):
"""Add L2 weight decay ops to network """Add L2 weight decay ops to network
Adds L2 weight decay ops. Adds L2 weight decay ops.
...@@ -117,8 +122,23 @@ class L2DecayRegularizer(WeightDecayRegularizer): ...@@ -117,8 +122,23 @@ class L2DecayRegularizer(WeightDecayRegularizer):
""" """
assert isinstance(param, framework.Parameter) assert isinstance(param, framework.Parameter)
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
decay = block.create_var( decay = block.create_var(
dtype="float32", shape=param.shape, lod_level=param.lod_level) dtype="float32", shape=param.shape, lod_level=param.lod_level)
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
decay = block.create_var(
dtype="float32",
shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS)
block.append_op(
type='lookup_table',
inputs={'W': param,
'Ids': grad},
outputs={'Out': decay},
attrs={'is_sparse': True})
param = decay
# Append Op to calculate decay # Append Op to calculate decay
block.append_op( block.append_op(
type='scale', type='scale',
...@@ -141,7 +161,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -141,7 +161,7 @@ class L1DecayRegularizer(WeightDecayRegularizer):
super(L1DecayRegularizer, self).__init__() super(L1DecayRegularizer, self).__init__()
self._regularization_coeff = regularization_coeff self._regularization_coeff = regularization_coeff
def __call__(self, param, block): def __call__(self, param, grad, block):
"""Add L1 weight decay ops to network """Add L1 weight decay ops to network
Adds L1 weight decay ops. Adds L1 weight decay ops.
...@@ -158,6 +178,19 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -158,6 +178,19 @@ class L1DecayRegularizer(WeightDecayRegularizer):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
decay = block.create_var( decay = block.create_var(
dtype="float32", shape=param.shape, lod_level=param.lod_level) dtype="float32", shape=param.shape, lod_level=param.lod_level)
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
decay = block.create_var(
dtype="float32",
shape=param.shape,
type=core.VarDesc.VarType.SELECTED_ROWS)
block.append_op(
type='lookup_table',
inputs={'W': param,
'Ids': grad},
outputs={'Out': decay},
attrs={'is_sparse': True})
# Append sign op # Append sign op
block.append_op( block.append_op(
type='sign', inputs={"X": param}, outputs={"Out": decay}) type='sign', inputs={"X": param}, outputs={"Out": decay})
......
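The regularizers now receive the gradient as well, so a SELECTED_ROWS (sparse) gradient can be turned into a row-wise decay through a lookup_table op before scaling. Attaching a regularizer is unchanged from the caller's side; a sketch mirroring the machine-translation example just below:

    import paddle.fluid as fluid

    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
    # optimizer.minimize(avg_cost) then appends the decay ops per parameter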
...@@ -181,7 +181,10 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -181,7 +181,10 @@ def train_main(use_cuda, is_sparse, is_local=True):
cost = pd.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
avg_cost = pd.mean(cost) avg_cost = pd.mean(cost)
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(
learning_rate=1e-4,
regularization=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=0.1))
optimize_ops, params_grads = optimizer.minimize(avg_cost) optimize_ops, params_grads = optimizer.minimize(avg_cost)
train_data = paddle.batch( train_data = paddle.batch(
......
...@@ -50,6 +50,7 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) ...@@ -50,6 +50,7 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program(), print_log=True) fluid.memory_optimize(fluid.default_main_program(), print_log=True)
# fluid.release_memory(fluid.default_main_program())
BATCH_SIZE = 200 BATCH_SIZE = 200
...@@ -69,8 +70,6 @@ exe.run(fluid.default_startup_program()) ...@@ -69,8 +70,6 @@ exe.run(fluid.default_startup_program())
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
avg_loss_value, = exe.run(fluid.default_main_program(), avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
......
...@@ -125,9 +125,10 @@ opts = optimizer.minimize(avg_cost) ...@@ -125,9 +125,10 @@ opts = optimizer.minimize(avg_cost)
batch_size = fluid.layers.create_tensor(dtype='int64') batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size) batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
fluid.memory_optimize(fluid.default_main_program()) # fluid.memory_optimize(fluid.default_main_program(), level=0)
fluid.release_memory(fluid.default_main_program())
BATCH_SIZE = 128 BATCH_SIZE = 16
PASS_NUM = 1 PASS_NUM = 1
# fix the order of training data # fix the order of training data
...@@ -159,8 +160,7 @@ for pass_id in range(PASS_NUM): ...@@ -159,8 +160,7 @@ for pass_id in range(PASS_NUM):
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc)) pass_acc))
# this model is slow, so if we can train two mini-batches, we think it works properly. # this model is slow, so if we can train two mini-batches, we think it works properly.
if i > 0:
if i > 2:
exit(0) exit(0)
if math.isnan(float(loss)): if math.isnan(float(loss)):
sys.exit("got NaN loss, training failed.") sys.exit("got NaN loss, training failed.")
......
...@@ -105,7 +105,8 @@ def main(): ...@@ -105,7 +105,8 @@ def main():
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program()) # fluid.memory_optimize(fluid.default_main_program())
fluid.release_memory(fluid.default_main_program())
# fix the order of training data # fix the order of training data
train_data = paddle.batch( train_data = paddle.batch(
......
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid import framework, unique_name from paddle.fluid import framework, unique_name, layer_helper
from paddle.fluid.executor import Executor from paddle.fluid.executor import Executor
from paddle.fluid.layers import fill_constant from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print
class TestRoutineOp(unittest.TestCase): class TestRoutineOp(unittest.TestCase):
...@@ -86,8 +86,7 @@ class TestRoutineOp(unittest.TestCase): ...@@ -86,8 +86,7 @@ class TestRoutineOp(unittest.TestCase):
self.assertEqual(leftmost_data[0][0], n + 1) self.assertEqual(leftmost_data[0][0], n + 1)
def _create_one_dim_tensor(self, value): def _create_one_dim_tensor(self, value):
one_dim_tensor = fill_constant( one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value)
shape=[1], dtype=core.VarDesc.VarType.INT64, value=value)
one_dim_tensor.stop_gradient = True one_dim_tensor.stop_gradient = True
return one_dim_tensor return one_dim_tensor
...@@ -95,6 +94,180 @@ class TestRoutineOp(unittest.TestCase): ...@@ -95,6 +94,180 @@ class TestRoutineOp(unittest.TestCase):
return framework.default_main_program().current_block().create_var( return framework.default_main_program().current_block().create_var(
name=unique_name.generate(name), type=type, dtype=dtype) name=unique_name.generate(name), type=type, dtype=dtype)
def _create_persistable_tensor(self, name, type, dtype):
return framework.default_main_program().current_block().create_var(
name=unique_name.generate(name),
type=type,
dtype=dtype,
persistable=True)
def test_select(self):
with framework.program_guard(framework.Program()):
ch1 = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
result1 = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
input_value = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
with fluid.Select() as select:
with select.case(fluid.channel_send, ch1, input_value):
# Execute something.
pass
with select.default():
pass
# This should not block because we are using a buffered channel.
result1, status = fluid.channel_recv(ch1, result1)
fluid.channel_close(ch1)
cpu = core.CPUPlace()
exe = Executor(cpu)
result = exe.run(fetch_list=[result1])
self.assertEqual(result[0][0], 10)
def test_fibonacci(self):
"""
Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5
"""
with framework.program_guard(framework.Program()):
quit_ch_input_var = self._create_persistable_tensor(
'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT32)
quit_ch_input = fill_constant(
shape=[1],
dtype=core.VarDesc.VarType.INT32,
value=0,
out=quit_ch_input_var)
result = self._create_persistable_tensor(
'result', core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.INT32)
fill_constant(
shape=[1],
dtype=core.VarDesc.VarType.INT32,
value=0,
out=result)
x = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
y = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
while_cond = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
while_false = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
x_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
def fibonacci(channel, quit_channel):
while_op = While(cond=while_cond)
with while_op.block():
result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
x_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
# TODO(abhinav): Need to perform copy when doing a channel send.
# Once this is complete, we can remove these lines
assign(input=x, output=x_to_send_tmp)
with fluid.Select() as select:
with select.case(fluid.channel_send, channel,
x_to_send_tmp):
assign(input=x, output=x_tmp)
assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y)
with select.case(fluid.channel_recv, quit_channel,
result2):
# Quit
helper = layer_helper.LayerHelper('assign')
helper.append_op(
type='assign',
inputs={'X': [while_false]},
outputs={'Out': [while_cond]})
ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
for i in xrange(10):
fluid.channel_recv(ch1, result)
Print(result)
fluid.channel_send(quit_ch, quit_ch_input)
fibonacci(ch1, quit_ch)
fluid.channel_close(ch1)
fluid.channel_close(quit_ch)
cpu = core.CPUPlace()
exe = Executor(cpu)
exe_result = exe.run(fetch_list=[result])
self.assertEqual(exe_result[0][0], 34)
def test_ping_pong(self):
"""
Mimics Ping Pong example: https://gobyexample.com/channel-directions
"""
with framework.program_guard(framework.Program()):
result = self._create_tensor('return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
ping_result = self._create_tensor('ping_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
pong_result = self._create_tensor('pong_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message):
message_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=0)
assign(input=message, output=message_to_send_tmp)
fluid.channel_send(ch, message_to_send_tmp)
def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result)
assign(input=ping_result, output=pong_result)
fluid.channel_send(ch2, pong_result)
pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
pongs = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
msg = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.FP64, value=9)
ping(pings, msg)
pong(pings, pongs)
fluid.channel_recv(pongs, result)
fluid.channel_close(pings)
fluid.channel_close(pongs)
cpu = core.CPUPlace()
exe = Executor(cpu)
exe_result = exe.run(fetch_list=[result])
self.assertEqual(exe_result[0][0], 9)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -11,7 +11,6 @@ list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com ...@@ -11,7 +11,6 @@ list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com
list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
list(REMOVE_ITEM TEST_OPS test_detection_output_op) # FIXME: detection_output_op will be rewritten. This unittest should be
list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
......
...@@ -469,6 +469,28 @@ class OpTest(unittest.TestCase): ...@@ -469,6 +469,28 @@ class OpTest(unittest.TestCase):
tensor.set_lod(lod) tensor.set_lod(lod)
return tensor return tensor
@staticmethod
def np_dtype_to_fluid_dtype(input):
"""Change the dtype of float16 numpy array
numpy float16 is binded to paddle::platform::float16
in tensor_py.h via the help of uint16 data type since
the internal memory representation of float16 is
uint16_t in paddle and np.uint16 in numpy, which are
themselves binded together by pybind.
Args:
input: input numpy array
Returns:
input: The dtype of input will be changed to np.uint16 if
it is originally np.float16, such that the internal memory
of input will be reinterpreted as of dtype np.uint16.
"""
if input.dtype == np.float16:
input.dtype = np.uint16
return input
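# Illustrative sketch (plain numpy, not part of the original helper): a
# same-width dtype reinterpretation relabels the buffer without touching its
# bytes, e.g.
#   x = np.random.random(4).astype(np.float16)
#   original = x.copy()
#   y = OpTest.np_dtype_to_fluid_dtype(x)  # y.dtype is now np.uint16
#   assert np.array_equal(y.view(np.float16), original)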
def _get_gradient(self, input_to_check, place, output_names, no_grad_set): def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
prog = Program() prog = Program()
block = prog.global_block() block = prog.global_block()
......
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
class TestCastOp(op_test.OpTest): class TestCastOp1(op_test.OpTest):
def setUp(self): def setUp(self):
ipt = np.random.random(size=[10, 10]) ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')} self.inputs = {'X': ipt.astype('float32')}
...@@ -36,5 +36,36 @@ class TestCastOp(op_test.OpTest): ...@@ -36,5 +36,36 @@ class TestCastOp(op_test.OpTest):
self.check_grad(['X'], ['Out']) self.check_grad(['X'], ['Out'])
class TestCastOp2(op_test.OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
# numpy float16 is bound to fluid float16 via uint16
self.inputs = {'X': ipt.astype('float16').view(np.uint16)}
self.outputs = {'Out': ipt.astype('float32')}
self.attrs = {
'in_dtype': int(core.VarDesc.VarType.FP16),
'out_dtype': int(core.VarDesc.VarType.FP32)
}
self.op_type = 'cast'
def test_check_output(self):
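# float16 keeps only a 10-bit mantissa (roughly 3 decimal digits of
# precision), so the cast result is checked with a loose absolute tolerance
# rather than exact equality.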
self.check_output(atol=1e-3)
class TestCastOp3(op_test.OpTest):
def setUp(self):
ipt = np.random.random(size=[10, 10])
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.astype('float16')}
self.attrs = {
'in_dtype': int(core.VarDesc.VarType.FP32),
'out_dtype': int(core.VarDesc.VarType.FP16)
}
self.op_type = 'cast'
def test_check_output(self):
self.check_output(atol=1e-3)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -63,9 +63,11 @@ def conv2d_forward_naive(input, filter, group, conv_param): ...@@ -63,9 +63,11 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest): class TestConv2dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False self.use_cudnn = False
self.use_mkldnn = False self.use_mkldnn = False
self.init_op_type() self.dtype = np.float32
self.init_kernel_type()
self.init_group() self.init_group()
self.init_dilation() self.init_dilation()
self.init_test_case() self.init_test_case()
...@@ -75,12 +77,16 @@ class TestConv2dOp(OpTest): ...@@ -75,12 +77,16 @@ class TestConv2dOp(OpTest):
'pad': self.pad, 'pad': self.pad,
'dilation': self.dilations 'dilation': self.dilations
} }
input = np.random.random(self.input_size).astype("float32")
filter = np.random.random(self.filter_size).astype("float32") input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype)
output = conv2d_forward_naive(input, filter, self.groups, output = conv2d_forward_naive(input, filter, self.groups,
conv2d_param).astype('float32') conv2d_param).astype(self.dtype)
self.inputs = {'Input': input, 'Filter': filter} self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
'paddings': self.pad, 'paddings': self.pad,
...@@ -99,6 +105,8 @@ class TestConv2dOp(OpTest): ...@@ -99,6 +105,8 @@ class TestConv2dOp(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
if self.use_cudnn: if self.use_cudnn:
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -111,6 +119,8 @@ class TestConv2dOp(OpTest): ...@@ -111,6 +119,8 @@ class TestConv2dOp(OpTest):
set(['Input', 'Filter']), 'Output', max_relative_error=0.02) set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16:
return
if self.use_cudnn: if self.use_cudnn:
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -126,6 +136,8 @@ class TestConv2dOp(OpTest): ...@@ -126,6 +136,8 @@ class TestConv2dOp(OpTest):
no_grad_set=set(['Filter'])) no_grad_set=set(['Filter']))
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16:
return
if self.use_cudnn: if self.use_cudnn:
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -154,8 +166,8 @@ class TestConv2dOp(OpTest): ...@@ -154,8 +166,8 @@ class TestConv2dOp(OpTest):
def init_group(self): def init_group(self):
self.groups = 1 self.groups = 1
def init_op_type(self): def init_kernel_type(self):
self.op_type = "conv2d" pass
class TestWithPad(TestConv2dOp): class TestWithPad(TestConv2dOp):
...@@ -227,39 +239,105 @@ class TestWithInput1x1Filter1x1(TestConv2dOp): ...@@ -227,39 +239,105 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
#----------------Conv2dCUDNN---------------- #----------------Conv2dCUDNN----------------
class TestCUDNN(TestConv2dOp): class TestCUDNN(TestConv2dOp):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNN(TestConv2dOp):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithPad(TestWithPad): class TestCUDNNWithPad(TestWithPad):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNNWithPad(TestWithPad):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithStride(TestWithStride): class TestCUDNNWithStride(TestWithStride):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNNWithStride(TestWithStride):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithGroup(TestWithGroup): class TestCUDNNWithGroup(TestWithGroup):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNNWithGroup(TestWithGroup):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWith1x1(TestWith1x1): class TestCUDNNWith1x1(TestWith1x1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "conv2d"
class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-2)
class TestDepthwiseConv(TestConv2dOp): class TestDepthwiseConv(TestConv2dOp):
...@@ -295,21 +373,18 @@ class TestDepthwiseConv2(TestConv2dOp): ...@@ -295,21 +373,18 @@ class TestDepthwiseConv2(TestConv2dOp):
#----------------Conv2dMKLDNN---------------- #----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp): class TestMKLDNN(TestConv2dOp):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithPad(TestWithPad): class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithStride(TestWithStride): class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "conv2d"
if __name__ == '__main__': if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import debuger
from paddle.fluid.framework import Program
class TestDebugger(unittest.TestCase):
def test_debug_str(self):
p = Program()
b = p.current_block()
#selected_rows
b.create_var(
name='selected_rows',
dtype="float32",
shape=[5, 10],
type=core.VarDesc.VarType.SELECTED_ROWS)
#tensor array
b.create_var(
name='tensor_array',
shape=[5, 10],
type=core.VarDesc.VarType.LOD_TENSOR_ARRAY)
#operator
mul_x = b.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
mul_y = b.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = b.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
b.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
print(debuger.pprint_program_codes(p.desc))
if __name__ == '__main__':
unittest.main()
...@@ -166,8 +166,6 @@ class TestDetectionMAPOp(OpTest): ...@@ -166,8 +166,6 @@ class TestDetectionMAPOp(OpTest):
elif not difficult: elif not difficult:
label_count[label] += 1 label_count[label] += 1
true_pos = collections.defaultdict(list)
false_pos = collections.defaultdict(list)
for (label, score, tp, fp) in tf_pos: for (label, score, tp, fp) in tf_pos:
true_pos[label].append([score, tp]) true_pos[label].append([score, tp])
false_pos[label].append([score, fp]) false_pos[label].append([score, fp])
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestUnpoolOp(OpTest):
def setUp(self):
self.op_type = "detection_output"
self.init_test_case()
#loc.shape ((1, 4, 4, 1, 1))
#conf.shape ((1, 4, 2, 1, 1))
loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]],
[[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
priorbox = np.array([
0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
0.8, 0.8, 0.1, 0.1, 0.2, 0.2
])
output = np.array([
0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
])
self.inputs = {
'Loc': loc.astype('float32'),
'Conf': conf.astype('float32'),
'PriorBox': priorbox.astype('float32')
}
self.attrs = {
'num_classes': self.num_classes,
'top_k': self.top_k,
'nms_top_k': self.nms_top_k,
'background_label_id': self.background_label_id,
'nms_threshold': self.nms_threshold,
'confidence_threshold': self.confidence_threshold,
}
self.outputs = {'Out': output.astype('float32')}
def test_check_output(self):
self.check_output()
def init_test_case(self):
self.num_classes = 2
self.top_k = 10
self.nms_top_k = 20
self.background_label_id = 0
self.nms_threshold = 0.01
self.confidence_threshold = 0.01
if __name__ == '__main__':
unittest.main()
...@@ -220,7 +220,7 @@ class TestBook(unittest.TestCase): ...@@ -220,7 +220,7 @@ class TestBook(unittest.TestCase):
seq_data = layers.data( seq_data = layers.data(
name='seq_data', shape=[10, 10], dtype='float32', lod_level=1) name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
seq = layers.fc(input=seq_data, size=20) seq = layers.fc(input=seq_data, size=20)
self.assertIsNotNone(layers.sequence_softmax(x=seq)) self.assertIsNotNone(layers.sequence_softmax(seq))
print(str(program)) print(str(program))
def test_softmax(self): def test_softmax(self):
...@@ -228,7 +228,7 @@ class TestBook(unittest.TestCase): ...@@ -228,7 +228,7 @@ class TestBook(unittest.TestCase):
with program_guard(program): with program_guard(program):
data = layers.data(name='data', shape=[10], dtype='float32') data = layers.data(name='data', shape=[10], dtype='float32')
hid = layers.fc(input=data, size=20) hid = layers.fc(input=data, size=20)
self.assertIsNotNone(layers.softmax(x=hid)) self.assertIsNotNone(layers.softmax(hid))
print(str(program)) print(str(program))
def test_get_places(self): def test_get_places(self):
......
...@@ -98,6 +98,9 @@ class TestLearningRateDecay(unittest.TestCase): ...@@ -98,6 +98,9 @@ class TestLearningRateDecay(unittest.TestCase):
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
fluid.memory_optimize(fluid.default_main_program())
for step in range(10): for step in range(10):
lr_val, = exe.run(fluid.default_main_program(), lr_val, = exe.run(fluid.default_main_program(),
feed={}, feed={},
......
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class TestLookupTableOp(OpTest): class TestLookupTableOp(OpTest):
...@@ -47,5 +49,52 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): ...@@ -47,5 +49,52 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
pass pass
class TestLookupTableIdsIsSelectedRows(OpTest):
def check_with_place(self, place):
scope = core.Scope()
# create and initialize Variable
height = 10
rows = [0, 4, 4, 7]
row_numel = 12
# create and initialize W Variable
W = scope.var('W').get_tensor()
W_array = np.full((height, row_numel), 1.0).astype("float32")
for i in range(height):
W_array[i] *= i
W.set(W_array, place)
# create and initialize Ids Variable
ids_selected_rows = scope.var('Ids').get_selected_rows()
ids_selected_rows.set_height(len(rows))
ids_selected_rows.set_rows(rows)
np_array = np.ones((len(rows), row_numel)).astype("float32")
ids_tensor = ids_selected_rows.get_tensor()
ids_tensor.set(np_array, place)
# create Out Variable
Out = scope.var('Out').get_selected_rows()
# create and run lookup_table operator
concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
concat_rows_op.run(scope, place)
# get result from Out
Out_tensor = Out.get_tensor()
result_array = np.array(Out_tensor)
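# Illustration of the expected result: W_array[i] == [i] * row_numel for every
# row i, and rows == [0, 4, 4, 7], so result_array should equal
# [[0] * 12, [4] * 12, [4] * 12, [7] * 12].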
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(rows):
assert (row == result_array[idx]).all()
def test_concat_rows(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -41,7 +41,7 @@ class TestLRNOp(OpTest): ...@@ -41,7 +41,7 @@ class TestLRNOp(OpTest):
mid.fill(self.k) mid.fill(self.k)
for m in range(0, self.N): for m in range(0, self.N):
for i in range(0, self.C): for i in range(0, self.C):
for c in range(start, end + 1): for c in range(start, end):
ch = i + c ch = i + c
if ch < 0 or ch >= self.C: if ch < 0 or ch >= self.C:
continue continue
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import unittest import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core
from op_test import OpTest from op_test import OpTest
...@@ -69,5 +70,42 @@ class TestMulOp2(OpTest): ...@@ -69,5 +70,42 @@ class TestMulOp2(OpTest):
['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
class TestFP16MulOp1(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((32, 84)).astype("float16")
y = np.random.random((84, 100)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.outputs = {'Out': np.dot(x, y)}
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-1)
class TestFP16MulOp2(OpTest):
def setUp(self):
self.op_type = "mul"
x = np.random.random((15, 4, 12, 10)).astype("float16")
y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.attrs = {
'x_num_col_dims': 2,
'y_num_col_dims': 2,
}
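# With x_num_col_dims=2 and y_num_col_dims=2, the mul op flattens X to shape
# (15 * 4, 12 * 10) and Y to shape (4 * 30, 8 * 2 * 9) before multiplying,
# which is exactly what the reference result below reproduces with np.dot.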
result = np.dot(
x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
result = result.reshape(15, 4, 8, 2, 9)
self.outputs = {'Out': result}
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=2e-1)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -21,31 +21,43 @@ from paddle.fluid.backward import append_backward ...@@ -21,31 +21,43 @@ from paddle.fluid.backward import append_backward
class TestOptimizer(unittest.TestCase): class TestOptimizer(unittest.TestCase):
def test_sgd_optimizer(self): def test_sgd_optimizer(self):
init_program = framework.Program() def check_sgd_optimizer(optimizer_attr):
program = framework.Program() init_program = framework.Program()
block = program.global_block() program = framework.Program()
mul_x = block.create_parameter( block = program.global_block()
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") mul_x = block.create_parameter(
mul_y = block.create_var( dtype="float32",
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") shape=[5, 10],
mul_out = block.create_var( lod_level=0,
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") name="mul.x",
mean_out = block.create_var( optimize_attr=optimizer_attr)
dtype="float32", shape=[1], lod_level=0, name="mean.out") mul_y = block.create_var(
block.append_op( dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
type="mul", mul_out = block.create_var(
inputs={"X": mul_x, dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
"Y": mul_y}, mean_out = block.create_var(
outputs={"Out": mul_out}, dtype="float32", shape=[1], lod_level=0, name="mean.out")
attrs={"x_num_col_dims": 1}) block.append_op(
block.append_op( type="mul",
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) inputs={"X": mul_x,
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) "Y": mul_y},
opts, _ = sgd_optimizer.minimize(mean_out, init_program) outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
opts, _ = sgd_optimizer.minimize(mean_out, init_program)
return opts
opts = check_sgd_optimizer({'learning_rate': 1.1})
self.assertEqual(len(opts), 3) self.assertEqual(len(opts), 3)
self.assertEqual([op.type for op in opts], self.assertEqual([op.type for op in opts],
["fill_constant", "elementwise_mul", "sgd"]) ["fill_constant", "elementwise_mul", "sgd"])
opts = check_sgd_optimizer({'learning_rate': 1.0})
self.assertEqual(len(opts), 1)
self.assertEqual([op.type for op in opts], ["sgd"])
class TestMomentumOptimizer(unittest.TestCase): class TestMomentumOptimizer(unittest.TestCase):
class MockMomentum(optimizer.MomentumOptimizer): class MockMomentum(optimizer.MomentumOptimizer):
...@@ -60,7 +72,11 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -60,7 +72,11 @@ class TestMomentumOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
...@@ -110,7 +126,11 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -110,7 +126,11 @@ class TestMomentumOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
...@@ -169,7 +189,11 @@ class TestAdagradOptimizer(unittest.TestCase): ...@@ -169,7 +189,11 @@ class TestAdagradOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
...@@ -229,7 +253,11 @@ class TestAdamOptimizer(unittest.TestCase): ...@@ -229,7 +253,11 @@ class TestAdamOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
...@@ -292,7 +320,11 @@ class TestAdamaxOptimizer(unittest.TestCase): ...@@ -292,7 +320,11 @@ class TestAdamaxOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
...@@ -352,7 +384,11 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): ...@@ -352,7 +384,11 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
program = framework.Program() program = framework.Program()
block = program.global_block() block = program.global_block()
mul_x = block.create_parameter( mul_x = block.create_parameter(
dtype="float32", shape=[5, 10], lod_level=0, name="mul.x") dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr={'learning_rate': 1.1})
mul_y = block.create_var( mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var( mul_out = block.create_var(
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import numpy import numpy
...@@ -60,20 +61,23 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -60,20 +61,23 @@ class BaseParallelForTest(unittest.TestCase):
feed=feed, feed=feed,
fetch=fetch, fetch=fetch,
place=gpu, place=gpu,
use_parallel=False) use_parallel=False,
use_gpu=True)
result_gpu_parallel = self._run_test_impl_( result_gpu_parallel = self._run_test_impl_(
callback=callback, callback=callback,
feed=feed, feed=feed,
fetch=fetch, fetch=fetch,
place=gpu, place=gpu,
use_parallel=True) use_parallel=True,
use_gpu=True)
result_gpu_nccl = self._run_test_impl_( result_gpu_nccl = self._run_test_impl_(
callback=callback, callback=callback,
feed=feed, feed=feed,
fetch=fetch, fetch=fetch,
place=gpu, place=gpu,
use_parallel=True, use_parallel=True,
use_nccl=True) use_nccl=True,
use_gpu=True)
self._assert_same_(fetch, result_cpu, result_cpu_parallel, self._assert_same_(fetch, result_cpu, result_cpu_parallel,
result_gpu, result_gpu_parallel, result_gpu_nccl) result_gpu, result_gpu_parallel, result_gpu_nccl)
else: else:
...@@ -85,7 +89,8 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -85,7 +89,8 @@ class BaseParallelForTest(unittest.TestCase):
fetch, fetch,
place, place,
use_parallel=False, use_parallel=False,
use_nccl=False): use_nccl=False,
use_gpu=False):
""" """
Run a single test, returns the fetch values Run a single test, returns the fetch values
Args: Args:
...@@ -132,7 +137,12 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -132,7 +137,12 @@ class BaseParallelForTest(unittest.TestCase):
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup) exe.run(startup)
return exe.run(main, feed=feed, fetch_list=fetch) if use_gpu:
profile_type = 'GPU'
else:
profile_type = 'CPU'
with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
return exe.run(main, feed=feed, fetch_list=fetch)
def _assert_same_(self, fetch, *args): def _assert_same_(self, fetch, *args):
""" """
......
...@@ -78,20 +78,22 @@ def avg_pool2D_forward_naive(x, ...@@ -78,20 +78,22 @@ def avg_pool2D_forward_naive(x,
class TestPool2d_Op(OpTest): class TestPool2d_Op(OpTest):
def setUp(self): def setUp(self):
self.op_type = "pool2d"
self.use_cudnn = False self.use_cudnn = False
self.use_mkldnn = False self.use_mkldnn = False
self.dtype = np.float32
self.init_test_case() self.init_test_case()
self.init_global_pool() self.init_global_pool()
self.init_op_type() self.init_kernel_type()
self.init_pool_type() self.init_pool_type()
self.init_ceil_mode() self.init_ceil_mode()
if self.global_pool: if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))] self.paddings = [0 for _ in range(len(self.paddings))]
input = np.random.random(self.shape).astype("float32") input = np.random.random(self.shape).astype(self.dtype)
output = self.pool2D_forward_naive(input, self.ksize, self.strides, output = self.pool2D_forward_naive(input, self.ksize, self.strides,
self.paddings, self.global_pool, self.paddings, self.global_pool,
self.ceil_mode).astype("float32") self.ceil_mode).astype(self.dtype)
self.inputs = {'X': input} self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
self.attrs = { self.attrs = {
'strides': self.strides, 'strides': self.strides,
...@@ -105,7 +107,7 @@ class TestPool2d_Op(OpTest): ...@@ -105,7 +107,7 @@ class TestPool2d_Op(OpTest):
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fixed later 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fixed later
} }
self.outputs = {'Out': output.astype('float32')} self.outputs = {'Out': output}
def test_check_output(self): def test_check_output(self):
if self.use_cudnn: if self.use_cudnn:
...@@ -115,6 +117,8 @@ class TestPool2d_Op(OpTest): ...@@ -115,6 +117,8 @@ class TestPool2d_Op(OpTest):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16:
return
if self.use_cudnn and self.pool_type != "max": if self.use_cudnn and self.pool_type != "max":
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
...@@ -128,8 +132,8 @@ class TestPool2d_Op(OpTest): ...@@ -128,8 +132,8 @@ class TestPool2d_Op(OpTest):
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self): def init_kernel_type(self):
self.op_type = "pool2d" pass
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
...@@ -149,9 +153,6 @@ class TestCase1(TestPool2d_Op): ...@@ -149,9 +153,6 @@ class TestCase1(TestPool2d_Op):
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive self.pool2D_forward_naive = avg_pool2D_forward_naive
...@@ -167,9 +168,6 @@ class TestCase2(TestPool2d_Op): ...@@ -167,9 +168,6 @@ class TestCase2(TestPool2d_Op):
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [1, 1] self.paddings = [1, 1]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "avg" self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive self.pool2D_forward_naive = avg_pool2D_forward_naive
...@@ -179,27 +177,18 @@ class TestCase2(TestPool2d_Op): ...@@ -179,27 +177,18 @@ class TestCase2(TestPool2d_Op):
class TestCase3(TestPool2d_Op): class TestCase3(TestPool2d_Op):
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
class TestCase4(TestCase1): class TestCase4(TestCase1):
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
class TestCase5(TestCase2): class TestCase5(TestCase2):
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self): def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
...@@ -207,39 +196,105 @@ class TestCase5(TestCase2): ...@@ -207,39 +196,105 @@ class TestCase5(TestCase2):
#--------------------test pool2d-------------------- #--------------------test pool2d--------------------
class TestCUDNNCase1(TestPool2d_Op): class TestCUDNNCase1(TestPool2d_Op):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase1(TestPool2d_Op):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase2(TestCase1): class TestCUDNNCase2(TestCase1):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase2(TestCase1):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase3(TestCase2): class TestCUDNNCase3(TestCase2):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase3(TestCase2):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase4(TestCase3): class TestCUDNNCase4(TestCase3):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase4(TestCase3):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase5(TestCase4): class TestCUDNNCase5(TestCase4):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase5(TestCase4):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCUDNNCase6(TestCase5): class TestCUDNNCase6(TestCase5):
def init_op_type(self): def init_kernel_type(self):
self.use_cudnn = True self.use_cudnn = True
self.op_type = "pool2d"
class TestFP16CUDNNCase6(TestCase5):
def init_kernel_type(self):
self.use_cudnn = True
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestCeilModeCase1(TestCUDNNCase1): class TestCeilModeCase1(TestCUDNNCase1):
...@@ -264,39 +319,33 @@ class TestCeilModeCase4(TestCase2): ...@@ -264,39 +319,33 @@ class TestCeilModeCase4(TestCase2):
#--------------------test pool2d MKLDNN-------------------- #--------------------test pool2d MKLDNN--------------------
class TestMKLDNNCase1(TestPool2d_Op): class TestMKLDNNCase1(TestPool2d_Op):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase2(TestCase1): class TestMKLDNNCase2(TestCase1):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase3(TestCase2): class TestMKLDNNCase3(TestCase2):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase4(TestCase3): class TestMKLDNNCase4(TestCase3):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase5(TestCase4): class TestMKLDNNCase5(TestCase4):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase6(TestCase5): class TestMKLDNNCase6(TestCase5):
def init_op_type(self): def init_kernel_type(self):
self.use_mkldnn = True self.use_mkldnn = True
self.op_type = "pool2d"
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase): ...@@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase):
with fluid.program_guard(main_program, startup_program): with fluid.program_guard(main_program, startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32') image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu') hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') i = layers.zeros(shape=[1], dtype='int64')
counter = fluid.layers.zeros(
shape=[1], dtype='int64', force_cpu=True)
until = layers.fill_constant([1], dtype='int64', value=10)
data_arr = layers.array_write(hidden1, i)
cond = fluid.layers.less_than(x=counter, y=until)
while_op = fluid.layers.While(cond=cond)
with while_op.block():
hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
layers.array_write(hidden_n, i, data_arr)
fluid.layers.increment(x=counter, value=1, in_place=True)
layers.less_than(x=counter, y=until, cond=cond)
hidden_n = layers.array_read(data_arr, i)
hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
label = fluid.layers.data(name='y', shape=[1], dtype='int64') label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle.v2.dataset.mnist as mnist
class TestRecordIO(unittest.TestCase):
def setUp(self):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
'./mnist.recordio', reader, feeder)
def test_main(self, decorator_callback=None):
# use new program
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_file = fluid.layers.open_recordio_file(
'./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
if decorator_callback is not None:
data_file = decorator_callback(data_file)
img, label = fluid.layers.read_file(data_file)
hidden = fluid.layers.fc(input=img, size=100, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
if fluid.core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
avg_loss_np = []
# train a pass
batch_id = 0
while not data_file.eof():
tmp, = exe.run(fetch_list=[avg_loss])
avg_loss_np.append(tmp)
batch_id += 1
data_file.reset()
self.assertEqual(batch_id, self.num_batches)
self.assertLess(avg_loss_np[-1], avg_loss_np[0])
def test_shuffle_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200))
def test_double_buffer_reader(self):
self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader,
place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
...@@ -70,6 +70,19 @@ class TestMinOp(OpTest): ...@@ -70,6 +70,19 @@ class TestMinOp(OpTest):
self.check_output() self.check_output()
class TestProdOp(OpTest):
def setUp(self):
self.op_type = "reduce_prod"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestKeepDimReduce(OpTest): class TestKeepDimReduce(OpTest):
def setUp(self): def setUp(self):
self.op_type = "reduce_sum" self.op_type = "reduce_sum"
......
...@@ -25,7 +25,7 @@ class TestScatterOp(OpTest): ...@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
updates_np = np.random.random((2, 3)).astype("float32") updates_np = np.random.random((2, 3)).astype("float32")
output_np = np.copy(ref_np) output_np = np.copy(ref_np)
output_np[index_np] = updates_np output_np[index_np] = updates_np
self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np} self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
self.outputs = {'Out': output_np} self.outputs = {'Out': output_np}
def test_check_output(self): def test_check_output(self):
......
...@@ -16,11 +16,15 @@ import unittest ...@@ -16,11 +16,15 @@ import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
from test_softmax_op import stable_softmax from test_softmax_op import stable_softmax
import paddle.fluid.core as core
class TestSequenceSoftmaxOp(OpTest): class TestSequenceSoftmaxOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sequence_softmax" self.op_type = "sequence_softmax"
self.use_cudnn = False
self.init_op_type()
x = np.random.uniform(0.1, 1, (11, 1)).astype("float32") x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
lod = [[0, 4, 5, 8, 11]] lod = [[0, 4, 5, 8, 11]]
...@@ -34,12 +38,31 @@ class TestSequenceSoftmaxOp(OpTest): ...@@ -34,12 +38,31 @@ class TestSequenceSoftmaxOp(OpTest):
self.inputs = {"X": (x, lod)} self.inputs = {"X": (x, lod)}
self.outputs = {"Out": out} self.outputs = {"Out": out}
self.attrs = {'use_cudnn': self.use_cudnn, }
def init_op_type(self):
pass
def test_check_output(self): def test_check_output(self):
self.check_output() if self.use_cudnn:
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
else:
self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out", max_relative_error=0.01) if self.use_cudnn:
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ["X"], "Out", max_relative_error=0.01)
else:
self.check_grad(["X"], "Out", max_relative_error=0.01)
# ----------------cudnn Sequencesoftmax----------------
class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
def init_op_type(self):
self.use_cudnn = True
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
def stable_softmax(x): def stable_softmax(x):
...@@ -27,18 +28,37 @@ def stable_softmax(x): ...@@ -27,18 +28,37 @@ def stable_softmax(x):
class TestSoftmaxOp(OpTest): class TestSoftmaxOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softmax" self.op_type = "softmax"
self.use_cudnn = False
self.inputs = { self.inputs = {
'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32") 'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
} }
self.outputs = { self.outputs = {
'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) 'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
} }
self.attrs = {'use_cudnn': self.use_cudnn, }
def init_op_type(self):
pass
def test_check_output(self): def test_check_output(self):
self.check_output() if self.use_cudnn:
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
else:
self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out') if self.use_cudnn:
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ["X"], "Out", max_relative_error=0.01)
else:
self.check_grad(["X"], "Out", max_relative_error=0.01)
class TestSoftmaxCUDNNOp(TestSoftmaxOp):
def init_op_type(self):
self.use_cudnn = True
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -26,7 +26,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): ...@@ -26,7 +26,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softmax_with_cross_entropy" self.op_type = "softmax_with_cross_entropy"
batch_size = 2 batch_size = 41
class_num = 37 class_num = 37
logits = np.random.uniform(0.1, 1.0, logits = np.random.uniform(0.1, 1.0,
...@@ -59,7 +59,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest): ...@@ -59,7 +59,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
def setUp(self): def setUp(self):
self.op_type = "softmax_with_cross_entropy" self.op_type = "softmax_with_cross_entropy"
batch_size = 2 batch_size = 41
class_num = 37 class_num = 37
logits = np.random.uniform(0.1, 1.0, logits = np.random.uniform(0.1, 1.0,
......
...@@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') ...@@ -62,20 +62,22 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
packages=['paddle', packages=['paddle',
'paddle.proto',
'paddle.trainer',
'paddle.trainer_config_helpers',
'paddle.utils', 'paddle.utils',
'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader',
'paddle.v2.master',
'paddle.v2.plot',
'paddle.fluid', 'paddle.fluid',
'paddle.fluid.proto', 'paddle.fluid.proto',
'paddle.fluid.proto.profiler', 'paddle.fluid.proto.profiler',
'paddle.fluid.layers', 'paddle.fluid.layers']
'py_paddle']
if '${WITH_FLUID}'== 'OFF':
packages+=['paddle.proto',
'paddle.trainer',
'paddle.trainer_config_helpers',
'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader',
'paddle.v2.master',
'paddle.v2.plot',
'py_paddle']
with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
setup_requires = f.read().splitlines() setup_requires = f.read().splitlines()
...@@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: ...@@ -84,11 +86,29 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
setup_requires+=['opencv-python'] setup_requires+=['opencv-python']
# the prefix is sys.prefix which should always be usr # the prefix is sys.prefix which should always be usr
paddle_bin_dir = 'opt/paddle/bin' paddle_bins = ''
paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', if '${WITH_FLUID}'== 'OFF':
'${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', paddle_bin_dir = 'opt/paddle/bin'
'${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main', paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
'${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
'${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
package_data={'paddle.fluid': ['core.so']}
if '${WITH_FLUID}'== 'OFF':
package_data['paddle.v2.master']=['libpaddle_master.so']
package_data['py_paddle']=['*.py','_swig_paddle.so']
package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.fluid.proto modules are generated while compiling,
# so those packages point to the build directory instead.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
}
if '${WITH_FLUID}'== 'OFF':
package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
paddle_rt_lib_dir = 'lib' paddle_rt_lib_dir = 'lib'
paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
...@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}', ...@@ -101,19 +121,8 @@ setup(name='${PACKAGE_NAME}',
install_requires=setup_requires, install_requires=setup_requires,
packages=packages, packages=packages,
ext_modules=[Extension('_foo', ['stub.cc'])], ext_modules=[Extension('_foo', ['stub.cc'])],
package_data={ package_data=package_data,
'paddle.v2.master': ['libpaddle_master.so'], package_dir=package_dir,
'paddle.fluid': ['core.so'],
'py_paddle':['*.py','_swig_paddle.so']
},
package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}',
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
},
scripts=paddle_bins, scripts=paddle_bins,
data_files=[(paddle_rt_lib_dir, paddle_rt_libs)] data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
) )
...@@ -121,27 +121,34 @@ class Timeline(object): ...@@ -121,27 +121,34 @@ class Timeline(object):
def _allocate_pids(self): def _allocate_pids(self):
for event in self._profile_pb.events: for event in self._profile_pb.events:
if event.device_id not in self._devices: if event.type == profiler_pb2.Event.CPU:
pid = self._allocate_pid() if (event.device_id, "CPU") not in self._devices:
self._devices[event.device_id] = pid pid = self._allocate_pid()
if event.device_id >= 0: self._devices[(event.device_id, "CPU")] = pid
self._chrome_trace.emit_pid("gpu:%s:stream:%d" % self._chrome_trace.emit_pid("cpu:block:%d" %
(pid, event.stream_id), pid) (event.device_id), pid)
elif event.device_id == -1: elif event.type == profiler_pb2.Event.GPUKernel:
self._chrome_trace.emit_pid("cpu:thread_hash:%d" % if (event.device_id, "GPUKernel") not in self._devices:
event.stream_id, pid) pid = self._allocate_pid()
self._devices[(event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("gpu:%d" % (event.device_id),
pid)
def _allocate_events(self): def _allocate_events(self):
for event in self._profile_pb.events: for event in self._profile_pb.events:
pid = self._devices[event.device_id] if event.type == profiler_pb2.Event.CPU:
type = "CPU"
elif event.type == profiler_pb2.Event.GPUKernel:
type = "GPUKernel"
pid = self._devices[(event.device_id, type)]
args = {'name': event.name} args = {'name': event.name}
if event.memcopy.bytes > 0: if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes} args = {'mem_bytes': event.memcopy.bytes}
# TODO(panyx0718): Chrome tracing only handles ms. However, some # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops take micro-seconds. Hence, we keep the ns here. # ops take micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(event.start_ns, self._chrome_trace.emit_region(
(event.end_ns - event.start_ns) / event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
1.0, pid, 0, 'Op', event.name, args) event.sub_device_id, 'Op', event.name, args)
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
......