提交 c1931468 编写于 作者: L Luo Tao

Merge branch 'develop' into ProtoDataProvider

set -e set -e
function train() { function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
topology=$1 topology=$1
layer_num=$2 layer_num=$2
bs=$3 bs=$3
...@@ -14,8 +12,6 @@ function train() { ...@@ -14,8 +12,6 @@ function train() {
elif [ $4 == "False" ]; then elif [ $4 == "False" ]; then
thread=`nproc` thread=`nproc`
# each trainer_count use only 1 core to avoid conflict # each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else else
echo "Wrong input $3, use True or False." echo "Wrong input $3, use True or False."
......
...@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform") ...@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS # Set the architecture for iOS
if(NOT DEFINED IOS_ARCH) if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS") if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future set(IOS_ARCH "armv7;armv7s;arm64")
set(IOS_ARCH "arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR") elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
# FIXME(liuyiqun): support "i386;x86_64" future set(IOS_ARCH "i386;x86_64")
set(IOS_ARCH "x86_64")
endif() endif()
endif() endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS") set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
...@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_ ...@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibility is required for cxx on iOS # Hidden visibility is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags") set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags") set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first") set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
......
...@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE) CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}") SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
IF(CMAKE_CROSSCOMPILING) IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER}) SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
...@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF() ENDIF()
ELSEIF(IOS) ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}") SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64") SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX}) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF() ENDIF()
ELSEIF(RPI) ELSEIF(RPI)
# use hardfp # use hardfp
......
...@@ -12,6 +12,10 @@ ...@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
digraph G { digraph G {
rnn [label="1-th level RNN" shape=box] rnn [label="1st level RNN" shape=box]
subgraph cluster0 { subgraph cluster0 {
label = "time step 0" label = "time step 0"
...@@ -8,7 +8,7 @@ digraph G { ...@@ -8,7 +8,7 @@ digraph G {
sent0 [label="sentence"] sent0 [label="sentence"]
sent1 [label="sentence"] sent1 [label="sentence"]
rnn1 [label="2-th level RNN" shape=box] rnn1 [label="2nd level RNN" shape=box]
sent0 -> rnn1 sent0 -> rnn1
sent1 -> rnn1 sent1 -> rnn1
...@@ -20,7 +20,7 @@ digraph G { ...@@ -20,7 +20,7 @@ digraph G {
sent2 [label="sentence"] sent2 [label="sentence"]
sent3 [label="sentence"] sent3 [label="sentence"]
rnn2 [label="2-th level RNN" shape=box] rnn2 [label="2nd level RNN" shape=box]
sent2 -> rnn2 sent2 -> rnn2
sent3 -> rnn2 sent3 -> rnn2
...@@ -32,7 +32,7 @@ digraph G { ...@@ -32,7 +32,7 @@ digraph G {
sent4 [label="sentence"] sent4 [label="sentence"]
sent5 [label="sentence"] sent5 [label="sentence"]
rnn3 [label="2-th level RNN" shape=box] rnn3 [label="2nd level RNN" shape=box]
sent4 -> rnn3 sent4 -> rnn3
sent5 -> rnn3 sent5 -> rnn3
......
# RNNOp design # RNNOp design
This document is about an RNN operator which requires that instances in a mini-batch have the same length. We will have a more flexible RNN operator. This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
## RNN Algorithm Implementation ## RNN Algorithm Implementation
<p aligh="center"> <p align="center">
<img src="./images/rnn.jpg"/> <img src="./images/rnn.jpg"/>
</p> </p>
The above diagram shows an RNN unrolled into a full network. The above diagram shows an RNN unrolled into a full network.
There are several important concepts: There are several important concepts here:
- *step-net*: the sub-graph to run at each step, - *step-net*: the sub-graph that runs at each step.
- *memory*, $h_t$, the state of the current step, - *memory*, $h_t$, the state of the current step.
- *ex-memory*, $h_{t-1}$, the state of the previous step, - *ex-memory*, $h_{t-1}$, the state of the previous step.
- *initial memory value*, the ex-memory of the first step. - *initial memory value*, the memory of the first (initial) step.
### Step-scope ### Step-scope
There could be local variables defined in step-nets. PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step. There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
<p aligh="center"> <p align="center">
<img src="./images/rnn.png"/><br/> <img src="./images/rnn.png"/><br/>
Figure 2 the RNN's data flow Figure 2 illustrates the RNN's data flow
</p> </p>
Please be aware that all steps run the same step-net. Each step Please be aware that every step runs the same step-net. Each step does the following:
1. creates the step-scope, 1. Creates the step-scope.
2. realizes local variables, including step-outputs, in the step-scope, and 2. Initializes the local variables including step-outputs, in the step-scope.
3. runs the step-net, which could use these variables. 3. Runs the step-net, which uses the above mentioned variables.
The RNN operator will compose its output from step outputs in step scopes. The RNN operator will compose its output from step outputs in each of the step scopes.
### Memory and Ex-memory ### Memory and Ex-memory
Let's give more details about memory and ex-memory via a simply example: Let's give more details about memory and ex-memory using a simple example:
$$ $$
h_t = U h_{t-1} + W x_t h_t = U h_{t-1} + W x_t
$$, $$,
where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively. where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step, In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
or copy the value of the previous memory value to the current ex-memory variable. or copy the memory value of the previous step to the current ex-memory variable.
### Usage in Python ### Usage in Python
For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
We can define an RNN's step-net using Block: We can define an RNN's step-net using a Block:
```python ```python
import paddle as pd import paddle as pd
X = some_op() # x is some operator's output, and is a LoDTensor X = some_op() # x is some operator's output and is a LoDTensor
a = some_op() a = some_op()
# declare parameters # declare parameters
...@@ -68,7 +68,7 @@ with rnn.stepnet(): ...@@ -68,7 +68,7 @@ with rnn.stepnet():
x = rnn.add_input(X) x = rnn.add_input(X)
# declare a memory (rnn's step) # declare a memory (rnn's step)
h = rnn.add_memory(init=a) h = rnn.add_memory(init=a)
# h.pre_state() means previous memory of rnn # h.pre_state(), the previous memory of rnn
new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
# update current memory # update current memory
h.update(new_state) h.update(new_state)
...@@ -80,19 +80,19 @@ out = rnn() ...@@ -80,19 +80,19 @@ out = rnn()
Python API functions in the above example: Python API functions in the above example:
- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs. - `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
- `rnn.add_memory` creates a variable used as the memory. - `rnn.add_memory`: creates a variable used as the memory.
- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output. - `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
### Nested RNN and LoDTensor ### Nested RNN and LoDTensor
An RNN whose step-net includes other RNN operators is known as a *nested RNN*. An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text. The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
<p aligh="center"> <p align="center">
<img src="./images/2_level_rnn.png"/> <img src="./images/2_level_rnn.png"/>
</p> </p>
...@@ -110,7 +110,7 @@ a = some_op() ...@@ -110,7 +110,7 @@ a = some_op()
# chapter_data is a set of 128-dim word vectors # chapter_data is a set of 128-dim word vectors
# the first level of LoD is sentence # the first level of LoD is sentence
# the second level of LoD is chapter # the second level of LoD is a chapter
chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
def lower_level_rnn(paragraph): def lower_level_rnn(paragraph):
...@@ -138,14 +138,14 @@ with top_level_rnn.stepnet(): ...@@ -138,14 +138,14 @@ with top_level_rnn.stepnet():
pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
top_level_rnn.add_outputs(h) top_level_rnn.add_outputs(h)
# just output the last step # output the last step
chapter_out = top_level_rnn(output_all_steps=False) chapter_out = top_level_rnn(output_all_steps=False)
``` ```
in above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
By default, the `RNNOp` will concatenate the outputs from all the time steps, By default, the `RNNOp` will concatenate the outputs from all the time steps.
if the `output_all_steps` set to False, it will only output the final time step. If the `output_all_steps` is set to False, it will only output the final time step.
<p align="center"> <p align="center">
......
# Design: Sequence Decoder Generating LoDTensors # Design: Sequence Decoder Generating LoDTensors
In tasks such as machine translation and image to text, In tasks such as machine translation and visual captioning,
a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences. a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
This documentation describes how to implement the sequence decoder as an operator. This documentation describes how to implement the sequence decoder as an operator.
## Beam Search based Decoder ## Beam Search based Decoder
The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users.
due to the complexity, the implementation relays on a lot of special data structures,
quite trivial and hard to be customized by users.
There are a lot of heuristic tricks in the sequence generation tasks, There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
so the flexibility of sequence decoder is very important to users.
During PaddlePaddle's refactoring work, During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage,
and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** .
For example, the RNN sates, candidates IDs and probabilities of beam search can be represented as `LoDTensors`; For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
## Changing LoD's absolute offset to relative offsets ## Changing LoD's absolute offset to relative offsets
The current `LoDTensor` is designed to store levels of variable-length sequences, The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
it stores several arrays of integers each represents a level.
The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clear. let's call this format the **absolute-offset LoD** for clarity.
The absolute-offset LoD can fast retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python ```python
[[0, 3, 9] [[0, 3, 9]
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
...@@ -41,8 +34,7 @@ The first level tells that there are two sequences: ...@@ -41,8 +34,7 @@ The first level tells that there are two sequences:
while on the second level, there are several empty sequences that both begin and end at `3`. while on the second level, there are several empty sequences that both begin and end at `3`.
It is impossible to tell how many empty second-level sequences exist in the first-level sequences. It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
There are many scenarios that relay on empty sequence representation, There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
So let's introduce another format of LoD, So let's introduce another format of LoD,
it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
...@@ -60,13 +52,12 @@ their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. ...@@ -60,13 +52,12 @@ their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
The second level is the same with the relative offset example because the lower level is a tensor. The second level is the same with the relative offset example because the lower level is a tensor.
It is easy to find out the second sequence in the first-level LoD has two empty sequences. It is easy to find out the second sequence in the first-level LoD has two empty sequences.
The following demos are based on relative-offset LoD. The following examples are based on relative-offset LoD.
## Usage in a simple machine translation model ## Usage in a simple machine translation model
Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it. Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
The model has an encoder that learns the semantic vector from a sequence, The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
and a decoder which uses the sequence decoder to generate new sentences.
**Encoder** **Encoder**
```python ```python
...@@ -154,34 +145,33 @@ def generate(): ...@@ -154,34 +145,33 @@ def generate():
translation_ids, translation_scores = decoder() translation_ids, translation_scores = decoder()
``` ```
The `decoder.beam_search` is a operator that given the candidates and the scores of translations including the candidates, The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
return the result of the beam search algorithm. returns the result of the beam search algorithm.
In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes In this way, users can customize anything on the input or output of beam search, for example:
1. meke the correspondind elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. 1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
2. remove some specific candidate in `selected_ids` 2. Remove some specific candidate in `selected_ids`.
3. get the final `translation_ids`, remove the translation sequence in it. 3. Get the final `translation_ids`, remove the translation sequence in it.
The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
Both of them are two-level `LoDTensors` Both of them are two-level `LoDTensors`:
- the first level represents `batch_size` of (source) sentences; - The first level represents `batch_size` of (source) sentences.
- the second level represents the candidate ID sets for translation prefix. - The second level represents the candidate ID sets for translation prefix.
for example, 3 source sentences to translate, and has 2, 3, 1 candidates. For example, 3 source sentences to translate, and has 2, 3, 1 candidates.
Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
For example, the previous state For example, the previous state:
* LoD is `[0, 1, 3][0, 2, 5, 6]` * LoD is `[0, 1, 3][0, 2, 5, 6]`
* content of tensor is `a1 a2 b1 b2 b3 c1` * content of tensor is `a1 a2 b1 b2 b3 c1`
the current state stored in `encoder_ctx_expanded` the current state is stored in `encoder_ctx_expanded`:
* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` * LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
* the content is * the content is
...@@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded` ...@@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded`
- b3 b3 - b3 b3
- None (c1 has 0 candidates, so c1 is dropped) - None (c1 has 0 candidates, so c1 is dropped)
Benefit from the relative offset LoD, empty candidate set can be represented naturally. The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
```python ```python
decoder.output(selected_ids) decoder.output(selected_ids)
decoder.output(selected_generation_scores) decoder.output(selected_generation_scores)
``` ```
the `selected_ids` is the candidate ids for the prefixes, The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
the first level represents the source sequences,
the second level represents generated sequences.
Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations. Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
## LoD and shape changes during decoding ## LoD and shape changes during decoding
<p align="center"> <p align="center">
<img src="./images/LOD-and-shape-changes-during-decoding.jpg"/> <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
</p> </p>
According the image above, the only phrase to change LoD is beam search. According to the image above, the only phase that changes the LoD is beam search.
## Beam search design ## Beam search design
The beam search algorthm will be implemented as one method of the sequence decoder, it has 3 inputs The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
1. `topk_ids`, top K candidate ids for each prefix. 1. `topk_ids`, the top K candidate ids for each prefix.
2. `topk_scores`, the corresponding scores for `topk_ids` 2. `topk_scores`, the corresponding scores for `topk_ids`
3. `generated_scores`, the score of the prefixes. 3. `generated_scores`, the score of the prefixes.
All of the are LoDTensors, so that the sequence affilication is clear. All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
It will return three variables It will return three variables:
1. `selected_ids`, the final candidate beam search function selected for the next step. 1. `selected_ids`, the final candidate beam search function selected for the next step.
2. `selected_scores`, the scores for the candidates. 2. `selected_scores`, the scores for the candidates.
3. `generated_scores`, the updated scores for each prefixes (with the new candidates appended). 3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` ## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors, The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
and they exist in each time step,
so it is natural to store them in arrays. so it is natural to store them in arrays.
Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors, Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
the results of beam search are better to store in a `TensorArray`.
The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
It needs some extensions to support pack or unpack an array of `LoDTensors`. It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
# 构建Android平台上的PaddlePaddle库 # Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库: 用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
- 基于Docker容器的编译方式 - 基于Docker容器的编译方式
......
# 构建iOS平台上的PaddlePaddle库 # iOS平台编译指南
交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。 交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。
## 准备交叉编译环境 ## 准备交叉编译环境
...@@ -25,7 +25,7 @@ iOS平台可选配置参数: ...@@ -25,7 +25,7 @@ iOS平台可选配置参数:
- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS` - `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`
- `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。 - `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。 - `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示: - `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构
<table class="docutils"> <table class="docutils">
<colgroup> <colgroup>
...@@ -41,11 +41,11 @@ iOS平台可选配置参数: ...@@ -41,11 +41,11 @@ iOS平台可选配置参数:
<tbody valign="top"> <tbody valign="top">
<tr class="row-even"> <tr class="row-even">
<td>OS</td> <td>OS</td>
<td>armv7, armv7s, arm64 (默认)</td> <td>armv7, armv7s, arm64 </td>
</tr> </tr>
<tr class="row-odd"> <tr class="row-odd">
<td>SIMULATOR</td> <td>SIMULATOR</td>
<td>i386, x86_64 (默认)</td> <td>i386, x86_64 </td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
...@@ -66,7 +66,7 @@ iOS平台可选配置参数: ...@@ -66,7 +66,7 @@ iOS平台可选配置参数:
```bash ```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \ cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \ -DIOS_PLATFORM=OS \
-DIOS_ARCH="arm64" \ -DIOS_ARCH="armv7;arm64" \
-DIOS_ENABLE_BITCODE=ON \ -DIOS_ENABLE_BITCODE=ON \
-DIOS_USE_VECLIB_FOR_BLAS=ON \ -DIOS_USE_VECLIB_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \ -DCMAKE_INSTALL_PREFIX=your/path/to/install \
...@@ -112,6 +112,6 @@ $ make install ...@@ -112,6 +112,6 @@ $ make install
- `lib`目录,其中包含PaddlePaddle的C-API静态库 - `lib`目录,其中包含PaddlePaddle的C-API静态库
- `third_party`目录,其中包含所依赖的所有第三方库 - `third_party`目录,其中包含所依赖的所有第三方库
注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。 注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。
自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。 自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。
# 构建Raspberry Pi平台上的PaddlePaddle库 # Raspberry Pi平台编译指南
通常有两个方法来构建基于 Raspberry Pi 的版本: 通常有两个方法来构建基于 Raspberry Pi 的版本:
......
...@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) { ...@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
extern "C" { extern "C" {
paddle_error paddle_init(int argc, char** argv) { paddle_error paddle_init(int argc, char** argv) {
static bool isInit = false;
if (isInit) return kPD_NO_ERROR;
std::vector<char*> realArgv; std::vector<char*> realArgv;
realArgv.reserve(argc + 1); realArgv.reserve(argc + 1);
realArgv.push_back(strdup("")); realArgv.push_back(strdup(""));
...@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) { ...@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
} }
initPaddle(argc + 1, realArgv.data()); initPaddle(argc + 1, realArgv.data());
free(realArgv[0]); free(realArgv[0]);
isInit = true;
return kPD_NO_ERROR; return kPD_NO_ERROR;
} }
} }
...@@ -25,7 +25,9 @@ limitations under the License. */ ...@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h" #include "hl_matrix.h"
#include "hl_sequence.h" #include "hl_sequence.h"
#include "hl_sparse.h" #include "hl_sparse.h"
#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h" #include "hl_warpctc_wrap.h"
#endif
#ifdef HPPL_STUB_FUNC #ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h" #include "stub/hl_aggregate_stub.h"
......
...@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names, ...@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
return false; return false;
} }
} }
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "All input {";
for (auto& name : names) {
sout << name << ",";
}
sout << "} is in {";
for (auto& name : set) {
sout << name << ",";
}
sout << "}";
VLOG(10) << sout.str();
}
return true; return true;
} }
...@@ -290,14 +303,12 @@ static void CreateGradVarInBlock( ...@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
auto ops = block_desc->AllOps(); auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size(); for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) { ++op_index) {
bool need_infer_shape = false;
std::unordered_set<std::string> new_vars; std::unordered_set<std::string> new_vars;
ForEachVarName(ops[op_index]->Outputs(), ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) { [&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) { if (block_desc->HasVar(grad_var_name)) {
return false; return false;
} }
need_infer_shape = true;
auto var = block_desc->Var(grad_var_name); auto var = block_desc->Var(grad_var_name);
new_vars.insert(var->Name()); new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name); auto it = param_name_map.find(grad_var_name);
...@@ -311,7 +322,6 @@ static void CreateGradVarInBlock( ...@@ -311,7 +322,6 @@ static void CreateGradVarInBlock(
grad_record.op_idx_ = static_cast<int>(op_index); grad_record.op_idx_ = static_cast<int>(op_index);
return false; /* not break */ return false; /* not break */
}); });
if (need_infer_shape) {
ops[op_index]->InferVarType(block_desc); ops[op_index]->InferVarType(block_desc);
for (auto& arg : ops[op_index]->OutputArgumentNames()) { for (auto& arg : ops[op_index]->OutputArgumentNames()) {
if (new_vars.find(arg) == new_vars.end()) { if (new_vars.find(arg) == new_vars.end()) {
...@@ -328,7 +338,6 @@ static void CreateGradVarInBlock( ...@@ -328,7 +338,6 @@ static void CreateGradVarInBlock(
} }
ops[op_index]->InferShape(*block_desc); ops[op_index]->InferShape(*block_desc);
} }
}
} }
std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
...@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx, ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDescBind*> op_descs = cur_block->AllOps(); std::vector<OpDescBind*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops; std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
...@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::vector<std::unique_ptr<OpDescBind>> backward_descs; std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDescBind>> op_grads; std::vector<std::unique_ptr<OpDescBind>> op_grads;
if ((*it)->Type() == "recurrent") { if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("step_block"); int step_block_idx = (*it)->GetBlockAttr("step_block");
BlockDescBind* backward_block = CreateStepBlock( BlockDescBind* backward_block = CreateStepBlock(
program_desc, no_grad_vars, grad_to_var, step_block_idx); program_desc, no_grad_vars, grad_to_var, step_block_idx);
...@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
} }
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "Made ";
for (auto& op_grad : op_grads) {
sout << op_grad->Type() << " ";
}
VLOG(10) << sout.str();
}
for (const auto& desc : op_grads) { for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) { for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) { if (out_name.find("@GRAD") == std::string::npos) {
...@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); }); [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
} }
VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once // Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops; std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) { for (const auto& dup : dup_out_ops) {
...@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
const std::vector<size_t> dup_op = dup.second; const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) { if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs; std::vector<std::string> sum_op_inputs;
std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) { for (size_t i = 0; i < dup_op.size(); ++i) {
VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
<< " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i); std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->Rename(out_name, new_name); backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name); sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
} }
std::unique_ptr<OpDescBind> sum_op(new OpDescBind( std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
} }
} }
pending_sum_ops.sort( pending_sum_ops.sort(
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a, [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) { const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
...@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::move(p.second)); std::move(p.second));
} }
VLOG(5) << "MakeBlockBackward Finished";
return backward_descs; return backward_descs;
} }
......
...@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) { ...@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::INT32; return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) { } else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64; return DataType::INT64;
} else if (typeid(bool).hash_code() == type.hash_code()) {
return DataType::BOOL;
} else { } else {
PADDLE_THROW("Not supported"); PADDLE_THROW("Not supported");
} }
......
...@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) { ...@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
ddim = make_dim<9>(dims); ddim = make_dim<9>(dims);
break; break;
default: default:
throw std::invalid_argument( PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
"Dynamic dimensions must have between [1, 9] dimensions.");
} }
} }
......
...@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, ...@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(10) << op->DebugString();
op->Run(*local_scope, *device); op->Run(*local_scope, *device);
} }
if (create_local_scope) { if (create_local_scope) {
......
...@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name, ...@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
need_update_ = true; need_update_ = true;
} }
void OpDescBind::RenameOutput(const std::string &old_name,
const std::string &new_name) {
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
need_update_ = true;
}
void OpDescBind::RenameInput(const std::string &old_name,
const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
need_update_ = true;
}
struct SetAttrDescVisitor : public boost::static_visitor<void> { struct SetAttrDescVisitor : public boost::static_visitor<void> {
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_; mutable OpDesc::Attr *attr_;
...@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs( ...@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name); auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
try {
return framework::make_ddim(var->Shape()); return framework::make_ddim(var->Shape());
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
} }
void CompileTimeInferShapeContext::SetDim(const std::string &name, void CompileTimeInferShapeContext::SetDim(const std::string &name,
......
...@@ -73,6 +73,10 @@ class OpDescBind { ...@@ -73,6 +73,10 @@ class OpDescBind {
void Rename(const std::string &old_name, const std::string &new_name); void Rename(const std::string &old_name, const std::string &new_name);
void RenameOutput(const std::string &old_name, const std::string &new_name);
void RenameInput(const std::string &old_name, const std::string &new_name);
// Only be used in C++ // Only be used in C++
const AttributeMap &GetAttrMap() const; const AttributeMap &GetAttrMap() const;
......
...@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope, void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const { const platform::DeviceContext& dev_ctx) const {
if (VLOG_IS_ON(1)) {
auto inputs = this->InputVars();
auto outputs = this->OutputVars(true);
std::ostringstream sout;
sout << "Run operator " << this->Type() << " From [";
std::ostream_iterator<std::string> out_it(sout, ",");
std::copy(inputs.begin(), inputs.end(), out_it);
sout << "] to [";
std::copy(outputs.begin(), outputs.end(), out_it);
sout << "]";
VLOG(1) << sout.str();
}
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
......
...@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const { ...@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
Variable* Scope::Var(const std::string& name) { Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name); auto iter = vars_.find(name);
if (iter != vars_.end()) { if (iter != vars_.end()) {
VLOG(3) << "Get existing variable " << name;
return iter->second; return iter->second;
} }
Variable* v = new Variable(); Variable* v = new Variable();
vars_[name] = v; vars_[name] = v;
VLOG(3) << "Create variable " << name << " on scope"; VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first); v->name_ = &(vars_.find(name)->first);
return v; return v;
} }
......
...@@ -53,6 +53,10 @@ class InferShapeContext { ...@@ -53,6 +53,10 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0; virtual bool IsRuntime() const = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
protected: protected:
virtual framework::DDim GetDim(const std::string &name) const = 0; virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0; virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
...@@ -60,9 +64,6 @@ class InferShapeContext { ...@@ -60,9 +64,6 @@ class InferShapeContext {
std::vector<framework::DDim> GetDims( std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes( std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const; const std::vector<std::string> &names) const;
......
...@@ -17,9 +17,13 @@ limitations under the License. */ ...@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h" #include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size, DEFINE_int32(pool_limit_size,
536870912, 536870912,
"maximum memory size managed by a memory pool, default is 512M"); "maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle { namespace paddle {
......
# Region-based Heterogeneous Memory Management # Region-based Heterogeneous Memory Management
## Design
Please check out the [design documentation](http://gangliao.me) to find out more details about ### Usage
buddy memory allocator for both CPU and GPU.
To allocate 4KB CPU memory:
```cpp
p = memory::Alloc(platform::CPUPlace(), 4*1024);
```
To allocate 4KB memory on the 3rd GPU:
```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
```
To free memory and check the so-far used amount of memory on a place:
```cpp
auto pl = platform::GPUPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
```
### API
In `paddle/memory/memory.h` we have:
```cpp
namespace memory {
template <typename Place> void* Alloc(Place, size_t);
template <typename Place> void Free(Place, void*);
template <typename Place> size_t Used(Place);
} // namespace memory
```
These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
```cpp
template<>
void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
return GetCPUBuddyAllocator()->Alloc(size);
}
```
and
```cpp
template<>
void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
Similar specializations exist for `Free` and `Used`.
### Implementation
`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
```cpp
BuddyAllocator* GetCPUBuddyAllocator() {
static BuddyAllocator* a = NULL;
if (a == NULL) {
a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
}
return a;
}
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static BuddyAllocator* as = NULL;
if (as == NULL) {
as = new BuddyAllocator*[platform::NumGPUs()];
for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
}
}
return as[gpu_id];
}
```
#### `BuddyAllocator`
`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm:
```cpp
BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
...
}
```
Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned to 32 bytes, which can hold a `BuddyAllocator::Block` object:
```cpp
class BuddyAllocator {
private:
struct Block {
size_t size;
Block *left, *right;
size_t index; // allocator id
};
...
};
```
Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` and freed in `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do the trace.
#### System Allocators
The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
## Justification
I got inspiration from Majel and Caffe2, though the above design looks different from both.
### Caffe2
In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory. In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel
In Majel, there are basically two allocator types:
1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces.
In Majel there are hidden global variables like:
1. `cpu::SystemAllocator g_cpu_allocator`, and
1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
...@@ -9,6 +9,7 @@ function(op_library TARGET) ...@@ -9,6 +9,7 @@ function(op_library TARGET)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(cu_cc_srcs)
set(op_common_deps operator op_registry math_function) set(op_common_deps operator op_registry math_function)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
...@@ -22,6 +23,9 @@ function(op_library TARGET) ...@@ -22,6 +23,9 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc) list(APPEND cc_srcs ${TARGET}.cc)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
...@@ -29,6 +33,8 @@ function(op_library TARGET) ...@@ -29,6 +33,8 @@ function(op_library TARGET)
foreach(src ${op_library_SRCS}) foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$") if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src}) list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$") elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src}) list(APPEND cc_srcs ${src})
else() else()
...@@ -43,7 +49,7 @@ function(op_library TARGET) ...@@ -43,7 +49,7 @@ function(op_library TARGET)
endif() endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
else() else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
...@@ -140,7 +146,9 @@ function(op_library TARGET) ...@@ -140,7 +146,9 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP # pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -160,11 +168,12 @@ set(DEPS_OPS ...@@ -160,11 +168,12 @@ set(DEPS_OPS
recurrent_op recurrent_op
dynamic_recurrent_op dynamic_recurrent_op
softmax_with_cross_entropy_op softmax_with_cross_entropy_op
softmax_op
sequence_softmax_op
sum_op sum_op
pool_op pool_op
pool_with_index_op pool_with_index_op
conv_op conv_op
lstm_op
conv_transpose_op conv_transpose_op
nccl_op nccl_op
sequence_conv_op sequence_conv_op
...@@ -174,13 +183,20 @@ set(DEPS_OPS ...@@ -174,13 +183,20 @@ set(DEPS_OPS
array_to_lod_tensor_op array_to_lod_tensor_op
lstm_op lstm_op
tensor_array_read_write_op tensor_array_read_write_op
gru_op) gru_op
adagrad_op
sgd_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col) op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
...@@ -220,6 +236,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc ...@@ -220,6 +236,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
rnn/recurrent_op_utils.cc rnn/recurrent_op_utils.cc
DEPS dynamic_recurrent_op) DEPS dynamic_recurrent_op)
if(WITH_GPU) if(WITH_GPU)
nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include "paddle/operators/accuracy_op.h" #include "paddle/operators/accuracy_op.h"
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
#include "paddle/platform/gpu_info.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -73,26 +74,28 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> { ...@@ -73,26 +74,28 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
int num_samples = static_cast<int>(inference->dims()[0]); int num_samples = static_cast<int>(inference->dims()[0]);
size_t infer_width = inference->dims()[1]; size_t infer_width = inference->dims()[1];
PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float))); auto stream = ctx.cuda_device_context().stream();
// cudaMemset((void**)&correct_data, 0, sizeof(float)); platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
if (num_samples == 0) { if (num_samples == 0) {
return; return;
} }
cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice); platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
cudaMemcpyHostToDevice, stream);
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<< AccuracyCudaKernel<
1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>( PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_samples, infer_width, indices_data, label_data, correct_data, num_samples, infer_width, indices_data, label_data, correct_data,
accuracy_data); accuracy_data);
int d_num_samples, d_num_correct; int d_num_samples, d_num_correct;
float d_accuracy; float d_accuracy;
cudaMemcpy(&d_num_correct, correct_data, sizeof(int), platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost, stream);
cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost); platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaMemcpyDeviceToHost); platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
cudaMemcpyDeviceToHost, stream);
} }
}; };
......
...@@ -14,6 +14,11 @@ limitations under the License. */ ...@@ -14,6 +14,11 @@ limitations under the License. */
#include "paddle/operators/adagrad_op.h" #include "paddle/operators/adagrad_op.h"
#include <cmath>
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel { ...@@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"), PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdagradOp should not be null."); "Input(Param) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"), PADDLE_ENFORCE(ctx->HasInput("Grad"),
...@@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel { ...@@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel {
class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
AdagradOpMaker(framework::OpProto *proto, AdagradOpMaker(framework::OpProto* proto,
framework::OpAttrChecker *op_checker) framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter"); AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient"); AddInput("Grad", "(Tensor) Input gradient");
...@@ -87,10 +92,85 @@ for numerical stability to avoid the division by zero error. ...@@ -87,10 +92,85 @@ for numerical stability to avoid the division by zero error.
)DOC"); )DOC");
} }
}; };
namespace {

// Linear scan for `value` in `rows`.
// Returns the index of the first matching element, or rows.size() when
// `value` does not occur in `rows`.
size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
  size_t pos = 0;
  while (pos < rows.size() && rows[pos] != value) {
    ++pos;
  }
  return pos;
}

}  // namespace
// CPU specialization of the sparse Adagrad update.
//
// Given a row-sparse gradient `grad` (SelectedRows), this functor:
//   1. merges gradient rows that share the same row id by summing them,
//   2. adds the element-wise square of the merged gradient into `moment`,
//   3. updates only the touched rows of `param`:
//        param[r] -= lr * g_merged / (sqrt(moment[r]) + epsilon)
//
// Parameters:
//   context        device context used for allocation and the helper functors
//   grad           sparse gradient; its value tensor is indexed as 2-D here
//   learning_rate  tensor whose first element is used as the learning rate
//   epsilon        added to sqrt(moment) in the denominator
//   moment         accumulated squared gradient, updated in place
//   param          parameter tensor, updated in place
//
// NOTE(review): param and moment are indexed with grad's row width, so they
// are assumed to have the same number of columns as grad.value() - confirm.
template <typename T>
struct SparseAdagradFunctor<platform::CPUPlace, T> {
void operator()(const platform::DeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
// 1. g_m.rows = set(g.rows)
// Collect the distinct row ids; iterating the std::set yields them in
// ascending order, so merge_rows is sorted and free of duplicates.
auto grad_rows = grad.rows();
std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
// Number of columns in each gradient row.
auto grad_width = grad.value().dims()[1];
// Allocate the merged gradient: one row per distinct row id.
std::unique_ptr<framework::SelectedRows> grad_merge{
new framework::SelectedRows()};
grad_merge->set_rows(merge_rows);
grad_merge->set_height(grad.height());
grad_merge->mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
// Zero-fill so that the accumulation below starts from 0.
math::SetConstant<platform::CPUPlace, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
auto* grad_data = grad.value().data<T>();
// Sum every input row into the merged row that carries the same row id.
for (size_t i = 0; i < grad_rows.size(); i++) {
// Position of this row id inside merge_rows (looked up via FindPos).
size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
for (int64_t j = 0; j < grad_width; j++) {
grad_merge_data[grad_merge_i * grad_width + j] +=
grad_data[i * grad_width + j];
}
}
// 2. m += g_m * g_m
// grad_square has the same rows/height/shape as grad_merge and holds the
// element-wise square of the merged gradient.
std::unique_ptr<framework::SelectedRows> grad_square{
new framework::SelectedRows()};
grad_square->set_rows(grad_merge->rows());
grad_square->set_height(grad_merge->height());
grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
context.GetPlace());
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
// Presumably adds the squared rows into the matching rows of `moment`
// (per the step comment above) - verify against SelectedRowsAddToTensor.
math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
functor(context, *grad_square, moment);
// 3. update parameter
// Only rows listed in merge_rows are touched; merge_rows[i] selects the
// row in param/moment, while i selects the row in the merged gradient.
auto* lr = learning_rate.data<T>();
auto* param_data = param->data<T>();
auto* moment_data = moment->data<T>();
for (size_t i = 0; i < merge_rows.size(); i++) {
for (int64_t j = 0; j < grad_width; j++) {
param_data[merge_rows[i] * grad_width + j] -=
lr[0] * grad_merge_data[i * grad_width + j] /
(std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
}
}
}
};
// Explicit instantiations of the CPU specialization defined above, for the
// float and double element types.
template struct SparseAdagradFunctor<platform::CPUPlace, float>;
template struct SparseAdagradFunctor<platform::CPUPlace, double>;
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(adagrad, REGISTER_OP_CPU_KERNEL(
ops::AdagradOpKernel<paddle::platform::CPUPlace, float>); adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
...@@ -14,7 +14,138 @@ ...@@ -14,7 +14,138 @@
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/operators/adagrad_op.h" #include "paddle/operators/adagrad_op.h"
#include "paddle/operators/math/selected_rows_functor.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace {
// Accumulates one row of the un-merged gradient into the merged gradient.
//
// Each block handles the source row given by blockIdx.y. Thread 0 looks up
// which merged row carries the same row index and publishes it through shared
// memory; afterwards the threads of the block walk the row's columns in steps
// of `block_size` and add them atomically, because several source rows
// (blocks) may map to the same merged row.
// NOTE(review): the column loop strides by `block_size`, so this presumably
// expects to be launched with blockDim.x == block_size - confirm at call site.
template <typename T, int block_size>
__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
                                T* grad_merge, const int64_t* grad_merge_rows,
                                size_t grad_merge_rows_size,
                                int64_t row_numel) {
  const int src_row = blockIdx.y;
  const int lane = threadIdx.x;

  __shared__ size_t dst_row;
  if (lane == 0) {
    const int64_t wanted = grad_rows[src_row];
    for (size_t k = 0; k < grad_merge_rows_size; k++) {
      if (wanted == grad_merge_rows[k]) {
        dst_row = k;
      }
    }
  }
  // Every thread needs dst_row before it can compute its destination.
  __syncthreads();

  const T* src = grad + src_row * row_numel;
  T* dst = grad_merge + dst_row * row_numel;
  for (int col = lane; col < row_numel; col += block_size) {
    paddle::platform::CudaAtomicAdd(dst + col, src[col]);
  }
}
// Applies the Adagrad parameter update for one sparse gradient row.
//
// The block with index blockIdx.y processes gradient row blockIdx.y and
// updates the dense row rows[blockIdx.y] of `param`, reading the matching row
// of `moment`. Threads walk the columns in steps of `block_size`.
template <typename T, int block_size>
__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
                                           const T* learning_rate, T* param,
                                           T* moment, int64_t row_numel,
                                           T epsilon) {
  const int row = blockIdx.y;
  const int lane = threadIdx.x;

  const T* grad_row = grad + row * row_numel;
  T* param_row = param + rows[row] * row_numel;
  T* moment_row = moment + rows[row] * row_numel;

  for (int col = lane; col < row_numel; col += block_size) {
    // The update is applied atomically so that it stays correct even if the
    // same row index shows up more than once in `rows`.
    paddle::platform::CudaAtomicAdd(param_row + col,
                                    -1.0 * learning_rate[0] * grad_row[col] /
                                        (sqrt(moment_row[col]) + epsilon));
  }
}
} // namespace
template <typename T>
struct SparseAdagradFunctor<platform::GPUPlace, T> {
  // Applies one Adagrad step on GPU for a gradient stored as SelectedRows.
  //
  // Mirrors the three phases of the update: merge duplicated gradient rows,
  // add the squared merged gradient to `moment`, then update the touched rows
  // of `param`. Both kernels are launched on the stream of `context`.
  // NOTE(review): the number of rows is used as gridDim.y, which CUDA limits
  // to 65535 - confirm that larger sparse gradients cannot reach this code.
  void operator()(const platform::DeviceContext& context,
                  const framework::SelectedRows& grad,
                  const framework::Tensor& learning_rate, T epsilon,
                  framework::Tensor* moment, framework::Tensor* param) {
    const int block_size = 256;
    dim3 threads(block_size, 1);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();

    // Step 1: merged.rows = sorted set of grad.rows, merged.value = row sums.
    const auto& src_rows = grad.rows();
    std::set<int64_t> unique_rows(src_rows.begin(), src_rows.end());
    std::vector<int64_t> merge_rows(unique_rows.begin(), unique_rows.end());
    const auto row_width = grad.value().dims()[1];

    std::unique_ptr<framework::SelectedRows> merged{
        new framework::SelectedRows()};
    merged->set_rows(merge_rows);
    merged->set_height(grad.height());
    merged->mutable_value()->mutable_data<T>(
        framework::make_ddim(
            {static_cast<int64_t>(merge_rows.size()), row_width}),
        context.GetPlace());

    // The merged buffer is an accumulator, so it has to start from zero.
    math::SetConstant<platform::GPUPlace, T> fill_zero;
    fill_zero(context, merged->mutable_value(), 0.0);

    auto* merged_data = merged->mutable_value()->data<T>();
    auto* src_data = grad.value().data<T>();

    // One block per un-merged gradient row.
    dim3 merge_grid(1, src_rows.size());
    MergeGradKernel<T, block_size><<<merge_grid, threads, 0, stream>>>(
        src_data, grad.rows().data(), merged_data, merged->rows().data(),
        merged->rows().size(), row_width);

    // Step 2: moment += merged * merged (element-wise), on the merged rows.
    std::unique_ptr<framework::SelectedRows> squared{
        new framework::SelectedRows()};
    squared->set_rows(merged->rows());
    squared->set_height(merged->height());
    squared->mutable_value()->mutable_data<T>(merged->value().dims(),
                                              context.GetPlace());
    auto squared_flat =
        framework::EigenVector<T>::Flatten(*(squared->mutable_value()));
    auto merged_flat = framework::EigenVector<T>::Flatten(merged->value());
    squared_flat.device(*context.GetEigenDevice<platform::GPUPlace>()) =
        merged_flat * merged_flat;

    math::SelectedRowsAddToTensor<platform::GPUPlace, T> add_to_moment;
    add_to_moment(context, *squared, moment);

    // Step 3: update the parameter rows touched by the gradient, one block
    // per merged row.
    auto* lr = learning_rate.data<T>();
    auto* param_data = param->data<T>();
    auto* moment_data = moment->data<T>();
    dim3 update_grid(1, merge_rows.size());
    SparseAdagradFunctorKernel<T, block_size><<<update_grid, threads, 0,
                                                stream>>>(
        merged_data, merged->rows().data(), lr, param_data, moment_data,
        row_width, epsilon);
  }
};
// Explicit instantiations of the GPU specialization defined above, for the
// float and double element types.
template struct SparseAdagradFunctor<platform::GPUPlace, float>;
template struct SparseAdagradFunctor<platform::GPUPlace, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adagrad, REGISTER_OP_GPU_KERNEL(
ops::AdagradOpKernel<paddle::platform::GPUPlace, float>); adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
...@@ -19,18 +19,28 @@ limitations under the License. */ ...@@ -19,18 +19,28 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Adagrad update for a gradient stored as framework::SelectedRows.
//
// Only the call operator is declared here; no body is provided in this
// declaration.
//   context        device context the update runs with
//   grad           sparse gradient
//   learning_rate  tensor holding the learning rate
//   epsilon        value added to the denominator of the update
//   moment, param  tensors passed by pointer to the update
template <typename Place, typename T>
struct SparseAdagradFunctor {
void operator()(const platform::DeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param);
};
template <typename Place, typename T> template <typename Place, typename T>
class AdagradOpKernel : public framework::OpKernel<T> { class AdagradOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut"); auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut"); auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace()); param_out_tensor->mutable_data<T>(ctx.GetPlace());
moment_out_tensor->mutable_data<T>(ctx.GetPlace()); moment_out_tensor->mutable_data<T>(ctx.GetPlace());
float epsilon = ctx.Attr<float>("epsilon"); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<framework::LoDTensor>()) {
auto param = framework::EigenVector<T>::Flatten( auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param")); *ctx.Input<framework::Tensor>("Param"));
auto grad = framework::EigenVector<T>::Flatten( auto grad = framework::EigenVector<T>::Flatten(
...@@ -48,6 +58,20 @@ class AdagradOpKernel : public framework::OpKernel<T> { ...@@ -48,6 +58,20 @@ class AdagradOpKernel : public framework::OpKernel<T> {
Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel()); Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
param_out.device(place) = param_out.device(place) =
param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* param_tensor = ctx.Input<framework::Tensor>("Param");
PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
SparseAdagradFunctor<Place, T> functor;
functor(ctx.device_context(), *ctx.Input<framework::SelectedRows>("Grad"),
*ctx.Input<framework::Tensor>("LearningRate"), epsilon,
moment_out_tensor, param_out_tensor);
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
}
} }
}; };
......
...@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase { ...@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase {
} else { } else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>()); offset = static_cast<size_t>(*i_tensor.data<int64_t>());
} }
VLOG(10) << " Offset = " << offset;
return offset; return offset;
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/beam_search_op.h"
#include <map>
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Runs one beam-search step.
//
// Selects the top beam_size candidates of every source sentence, groups them
// by the prefix (offset) they extend, drops the candidates of prefixes that
// already ended, and writes the surviving ids/scores into `selected_ids` and
// `selected_scores`. Both outputs are resized to [num_instances, 1] and get
// the same two-level LoD:
//   level 0: the source-level offsets taken from the input `ids`
//   level 1: the offsets of each prefix's candidates in the output rows
//
// pre_ids          ids selected in the previous step, one per prefix
// selected_ids     output tensor receiving the selected candidate ids
// selected_scores  output tensor receiving the matching scores
void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
                            framework::LoDTensor *selected_ids,
                            framework::LoDTensor *selected_scores) {
  auto items = SelectTopBeamSizeItems();
  auto selected_items = ToMap(items);
  PruneEndidCandidates(pre_ids, &selected_items);

  // The output height has to be counted AFTER pruning; counting the un-pruned
  // items would leave uninitialised trailing rows in the output tensors.
  size_t num_instances = 0;
  for (const auto &prefix_items : selected_items) {
    num_instances += prefix_items.size();
  }

  // the output tensor shape should be [num_instances, 1]
  auto dims = framework::make_ddim(
      std::vector<int64_t>({static_cast<int64_t>(num_instances), 1}));
  selected_ids->Resize(dims);
  selected_scores->Resize(dims);

  auto *ids_data = selected_ids->mutable_data<int>(platform::CPUPlace());
  auto *scores_data =
      selected_scores->mutable_data<float>(platform::CPUPlace());

  // fill in data
  std::vector<size_t> low_level;
  low_level.reserve(selected_items.size() + 1);
  size_t low_offset = 0;
  for (const auto &prefix_items : selected_items) {
    low_level.push_back(low_offset);
    for (const auto &item : prefix_items) {
      ids_data[low_offset] = item.id;
      scores_data[low_offset] = item.score;
      low_offset++;
    }
  }
  // An offset-based LoD level needs the closing offset as well, otherwise the
  // extent of the last prefix is not described.
  low_level.push_back(low_offset);

  // fill lod
  auto abs_lod = framework::ToAbsOffset(ids_->lod());
  auto &high_level = abs_lod[lod_level_];
  framework::LoD lod(2);
  lod[0].assign(high_level.begin(), high_level.end());
  lod[1].assign(low_level.begin(), low_level.end());
  selected_ids->set_lod(lod);
  selected_scores->set_lod(lod);
}
// Removes all candidates of every prefix whose previously selected id equals
// end_id_. `items` is indexed by prefix offset and `pre_ids` is read at the
// same offsets.
void BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
                                      std::vector<std::vector<Item>> *items) {
  const auto *prev_ids = pre_ids.data<int>();
  size_t prefix = 0;
  for (auto &candidates : *items) {
    if (prev_ids[prefix] == end_id_) {
      candidates.clear();
    }
    ++prefix;
  }
}
// Regroups the candidates by their `offset` field: the returned vector is
// indexed by offset and grows on demand, so offsets that never occur (below
// the largest one seen) map to an empty list.
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
    const std::vector<std::vector<Item>> &items) {
  std::vector<std::vector<Item>> grouped;
  for (const auto &candidates : items) {
    for (const Item &candidate : candidates) {
      const size_t bucket = candidate.offset;
      if (grouped.size() <= bucket) {
        grouped.resize(bucket + 1);
      }
      grouped[bucket].push_back(candidate);
    }
  }
  return grouped;
}
// For each source sentence, keeps the beam_size_ candidates with the highest
// score across all of its candidate sets.
//
// Returns one entry per source sentence, in the order produced by
// NextItemSet(). Sources with at most beam_size_ candidates are returned
// unchanged.
std::vector<std::vector<BeamSearch::Item>>
BeamSearch::SelectTopBeamSizeItems() {
  std::vector<std::vector<Item>> result;
  std::vector<Item> items;
  // for each source sentence, select the top beam_size items across all
  // candidate sets.
  while (NextItemSet(&items)) {
    // Only partition when there is something to drop: with fewer than
    // beam_size_ items, `begin + beam_size_` would lie past the end of the
    // vector, which is undefined behaviour for std::nth_element.
    if (items.size() > beam_size_) {
      std::nth_element(std::begin(items), std::begin(items) + beam_size_,
                       std::end(items), [](const Item &a, const Item &b) {
                         // TODO(superjom) make score's comparation customizable.
                         // partial sort in descending order
                         return a.score > b.score;
                       });
      // prune the top beam_size items.
      items.resize(beam_size_);
    }
    result.emplace_back(items);
  }
  return result;
}
// Collects the candidates of the next source sentence.
//
// `items` is cleared and refilled with one Item per candidate of the source
// at position sent_offset_ (on LoD level lod_level_); each Item records the
// row offset of its prefix together with the candidate id and score.
// sent_offset_ is advanced by one on success.
//
// Returns false, leaving `items` untouched, once every source has been
// visited.
bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
  if (sent_offset_ >= ids_->NumElements(lod_level_)) {
    return false;
  }
  // find the current candidates
  // Bind by reference: copying the tensors here would also copy their LoD on
  // every call.
  const framework::LoDTensor &ids = *ids_;
  const framework::LoDTensor &scores = *scores_;

  auto source_abs_two_level_lod = framework::SliceInLevel(
      ids.lod(), lod_level_, sent_offset_, sent_offset_ + 1);
  source_abs_two_level_lod = framework::ToAbsOffset(source_abs_two_level_lod);
  auto abs_lod = framework::ToAbsOffset(ids.lod());
  PADDLE_ENFORCE_GE(source_abs_two_level_lod.size(), 2UL);

  auto *ids_data = ids.data<int>();
  auto *scores_data = scores.data<float>();

  // Number of candidates stored per row: product of all but the first dim.
  size_t instance_dim = 1;
  for (int i = 1; i < ids.dims().size(); i++) {
    instance_dim *= ids.dims()[i];
  }

  items->clear();
  items->reserve(framework::product(ids.dims()));

  const size_t row_begin = abs_lod[lod_level_][sent_offset_];
  const size_t row_end = abs_lod[lod_level_][sent_offset_ + 1];
  for (size_t offset = row_begin; offset < row_end; offset++) {
    for (size_t d = 0; d < instance_dim; d++) {
      const size_t dim_offset = offset * instance_dim + d;
      items->emplace_back(offset, ids_data[dim_offset],
                          scores_data[dim_offset]);
    }
  }

  sent_offset_++;
  return true;
}
// Declares the inputs, outputs, attributes and comment of the operator this
// maker is registered for: three inputs (pre_ids, ids, scores), two outputs
// (selected_ids, selected_scores) and three integer attributes (level,
// beam_size, end_id).
class BeamSearchProtoAndCheckerMaker
: public framework::OpProtoAndCheckerMaker {
public:
BeamSearchProtoAndCheckerMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
// inputs and outputs stored in proto
AddInput("pre_ids", "ids in previous step");
AddInput("ids", "a LoDTensor of shape of [None,k]");
AddInput("scores",
"a LoDTensor that has the same shape and LoD with `ids`");
AddOutput("selected_ids",
"a LoDTensor that stores the IDs selected by beam search");
AddOutput(
"selected_scores",
"a LoDTensor that has the same shape and LoD with `selected_ids`");
// Attributes stored in AttributeMap
// None of the attributes below declares a default value here.
AddAttr<int>("level", "the level of LoDTensor");
AddAttr<int>("beam_size", "beam size for beam search");
AddAttr<int>("end_id",
"the token id which indicates the end of a sequence");
AddComment(
"This is a beam search operator that help to generate sequences.");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp,
paddle::operators::BeamSearchProtoAndCheckerMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_TESTING
#include "gtest/gtest.h"
#endif
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
/*
* This is an implementation of beam search.
*
* To explain the details, let's take a machine translation task as an example.
* In this task one source sentence is translated into multiple target
* sentences. While decoding, each source sentence has several translation
* prefixes (target sentences that have not ended yet), and at each time step
* every prefix has some candidates. Given the candidate ids and their
* corresponding scores (probabilities), beam search sorts them and selects the
* top beam_size candidates for each source sentence, then stores the selected
* candidates' ids and scores in LoDTensors.
*
* A detailed example:
*
* Input
*
* ids:
* LoD (should have 2 levels)
* first level: [0, 1, 4]
* second level: [0, 1, 2, 3, 4]
*
* tensor's data
* [
* [4, 2, 5]
* [2, 1, 3]
* [3, 5, 2]
* [8, 2, 1]
* ]
*
* scores:
* LoD same as `ids`
* tensor's data
* [
* [0.5, 0.3, 0.2]
* [0.6, 0.3, 0.1]
* [0.9, 0.5, 0.1]
* [0.7, 0.5, 0.1]
* ]
*
* The inputs mean that there are 2 source sentences to translate: according to
* the first LoD level [0, 1, 4], the first source has 1 prefix and the second
* source has 3 prefixes.
*
* lets assume beam size is 2, and the beam search's output should be
* LoD
* first level:
* [0, 1, 2]
* second level:
* [0, 2, 4]
*
* tensor's data
* [[
* 0.5,
* 0.3,
* 0.9,
* 0.7
* ]]
*
* TODO all the prune operations should be in the beam search, so it is better
* to split the beam search algorithm into a sequence of smaller operators, and
* the prune operators can be inserted in this sequence.
*/
// One step of beam search over a batch of source sentences.
//
// The object keeps pointers to the `ids` and `scores` tensors it was
// constructed with (it does not copy them) together with the current read
// position `sent_offset_`, which only moves forward as sources are consumed.
class BeamSearch {
public:
// TODO(superjom) make type customizable
using id_t = size_t;
using score_t = float;
/*
* Stores the arguments needed by this class.
*
* @ids: candidate ids; only its address is kept.
* @scores: candidate scores; only its address is kept.
* @level: the LoD level used to split the candidates by source sentence.
* @beam_size: the number of candidates kept for each source sentence.
* @end_id: the id that marks the end of a sequence.
*/
BeamSearch(const framework::LoDTensor& ids,
const framework::LoDTensor& scores, size_t level, size_t beam_size,
int end_id)
: beam_size_(beam_size),
ids_(&ids),
scores_(&scores),
lod_level_(level),
end_id_(end_id) {}
/*
* The main function of beam search.
*
* @pre_ids: the ids selected in the previous step.
*
* @selected_ids: a [None, 1]-shaped tensor with LoD.
* In a machine translation model, it might be the candidate term id sets,
* each set stored as a variable-length sequence.
* The format might be described with a two-level LoD
* - [[0 1]
* - [0 1 2]]
* - [[]
* - [0 1]]
* the first level of LoD tells that there are two source sentences. The
* second level describes the details of the candidate id set's offsets in
* the source sentences.
*
* @selected_scores: a LoD tensor with the same shape and LoD with
* selected_ids.
* It stores the corresponding scores of candidate ids in selected_ids.
*
* This function returns nothing; the results are written to the two output
* tensors.
*/
void operator()(const framework::LoDTensor& pre_ids,
framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores);
protected:
/*
* A single candidate; the unit that gets sorted and selected.
*/
struct Item {
// NOTE: the default constructor leaves all three fields uninitialized.
Item() {}
Item(size_t offset, size_t id, float score)
: offset(offset), id(id), score(score) {}
// offset in the lod_level_+1
size_t offset;
// the candidate id
id_t id;
// the corresponding score
score_t score;
};
/*
* Prune the candidates that belong to prefixes listed in `pre_ids`; `items`
* is modified in place.
*/
void PruneEndidCandidates(const framework::LoDTensor& pre_ids,
std::vector<std::vector<Item>>* items);
/*
* Transform the items into a map whose key is offset, value is the items.
* NOTE low performance
*/
std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs);
/*
* For each source, select top beam_size records.
*/
std::vector<std::vector<Item>> SelectTopBeamSizeItems();
/*
* Get the items of next source sequence, return false if no remaining items.
*/
bool NextItemSet(std::vector<Item>* items);
private:
// the number of candidates kept per source sentence
size_t beam_size_;
// the inputs given to the constructor; stored as pointers, not copies
const framework::LoDTensor* ids_;
const framework::LoDTensor* scores_;
// the LoD level given to the constructor
size_t lod_level_{0};
// read position; starts at 0
size_t sent_offset_{0};
// the end id given to the constructor
int end_id_{0};
};
// Operator wrapper around BeamSearch.
//
// Run() looks up the "ids", "scores" and "pre_ids" inputs and the
// "selected_ids" / "selected_scores" outputs in the scope, reads the "level",
// "beam_size" and "end_id" attributes, and executes one beam-search step.
class BeamSearchOp : public framework::OperatorBase {
 public:
  BeamSearchOp(const std::string& type,
               const framework::VariableNameMap& inputs,
               const framework::VariableNameMap& outputs,
               const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Copy construction is declared but not supported: it always throws.
  BeamSearchOp(const BeamSearchOp& o)
      : framework::OperatorBase(
            static_cast<const framework::OperatorBase&>(o)) {
    PADDLE_THROW("Not Implemented");
  }

  // Executes one beam-search step with the variables found in `scope`.
  // Fails an enforce check if any input or output variable is missing.
  // `dev_ctx` is not used by this operator.
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    // Progress traces are verbose-only: this runs once per op execution, so
    // unconditional INFO logging would flood the log.
    VLOG(3) << "run beam search op";
    auto ids_var = scope.FindVar(Input("ids"));
    auto scores_var = scope.FindVar(Input("scores"));
    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
    PADDLE_ENFORCE_NOT_NULL(ids_var);
    PADDLE_ENFORCE_NOT_NULL(scores_var);
    PADDLE_ENFORCE_NOT_NULL(pre_ids_var);

    auto& ids = ids_var->Get<framework::LoDTensor>();
    auto& scores = scores_var->Get<framework::LoDTensor>();
    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
    size_t level = Attr<int>("level");
    size_t beam_size = Attr<int>("beam_size");
    int end_id = Attr<int>("end_id");
    VLOG(3) << "init beam search";
    BeamSearch alg(ids, scores, level, beam_size, end_id);
    VLOG(3) << "after beam search";

    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
    PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
    auto& selected_ids_tensor =
        *selected_ids_var->GetMutable<framework::LoDTensor>();
    auto& selected_scores_tensor =
        *selected_scores_var->GetMutable<framework::LoDTensor>();
    VLOG(3) << "run beam search";
    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
    VLOG(3) << "finish beam search";
  }
};
} // namespace operators
} // namespace paddle
...@@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> { ...@@ -174,7 +174,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
// Caculate the gradient of Input(Bias). // Caculate the gradient of Input(Bias).
if (d_bias) { if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace()); d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = EigenMatrix<T>::From(*d_bias); auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0)); d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
} }
} }
......
...@@ -200,9 +200,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -200,9 +200,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f; T alpha = 1.0f, beta = 0.0f;
if (input_grad) { if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad); math::set_constant(ctx.device_context(), input_grad, 0);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_output_desc, output_grad_data, handle, &alpha, cudnn_output_desc, output_grad_data,
...@@ -214,9 +212,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -214,9 +212,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter --------------------- // ------------------- cudnn conv backward filter ---------------------
if (filter_grad) { if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace()); T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad); math::set_constant(ctx.device_context(), filter_grad, 0);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Gradient with respect to the filter // Gradient with respect to the filter
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc, handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
......
...@@ -22,8 +22,6 @@ class CudnnConvOpMaker : public Conv2DOpMaker { ...@@ -22,8 +22,6 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
CudnnConvOpMaker(framework::OpProto* proto, CudnnConvOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker) framework::OpAttrChecker* op_checker)
: Conv2DOpMaker(proto, op_checker) { : Conv2DOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault(std::vector<int>{1, 1});
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, " "workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be " "workspace is a section of GPU memory which will be "
......
...@@ -30,6 +30,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,6 +30,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups"); int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int input_channels = in_dims[1]; int input_channels = in_dims[1];
int output_channels = filter_dims[0]; int output_channels = filter_dims[0];
...@@ -52,9 +53,15 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -52,9 +53,15 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
"The number of output channels should be divided by groups."); "The number of output channels should be divided by groups.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < paddings.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
(dilations[i] * (filter_dims[i + 2] - 1) + 1) >
0,
"Due to the settings of paddings, filter_dims and "
"dilations, the output size is less than 0, please check "
"again.");
output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2], output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
paddings[i], strides[i])); dilations[i], paddings[i], strides[i]));
} }
ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
} }
...@@ -78,9 +85,15 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -78,9 +85,15 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution operator. " "(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", "strides of convolution operator.") AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of "
"convolution operator.")
.SetDefault({1, 1}); .SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.") AddAttr<std::vector<int>>("paddings",
"(vector<int> default:{0, 0}), the "
"paddings(h_pad, w_pad) of "
"convolution operator.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
...@@ -90,15 +103,20 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, ...@@ -90,15 +103,20 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.") "is only connected to the second half of the input channels.")
.SetDefault(1); .SetDefault(1);
AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1}), the "
"dilations(h_dilation, w_dilation) of "
"convolution operator.")
.SetDefault({1, 1});
AddComment(R"DOC( AddComment(R"DOC(
Convolution Operator. Convolution Operator.
The convolution operation calculates the output based on the input, filter The convolution operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and strides, paddings, groups, dilations parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
size, C is the number of channels, H is the height of the feature, and W is size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(ksize, strides, paddings) are two elements. the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements.
These two elements represent height and width, respectively. These two elements represent height and width, respectively.
The input(X) size and output(Out) size may be different. The input(X) size and output(Out) size may be different.
...@@ -109,8 +127,8 @@ Example: ...@@ -109,8 +127,8 @@ Example:
Output: Output:
Output shape: (N, C_out, H_out, W_out) Output shape: (N, C_out, H_out, W_out)
where where
H_out = (H_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1;
W_out = (W_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1;
)DOC"); )DOC");
} }
...@@ -135,13 +153,15 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -135,13 +153,15 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution operator." "(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW."); "The format of output tensor is also NCDHW.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("strides",
"strides", "(vector<int>, default:{1, 1, 1}), the "
"(vector, default:{0, 0, 0}), the strides of convolution operator.") "strides(d_stride, h_stride, w_stride) of "
"convolution operator.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("paddings",
"paddings", "(vector<int>, default:{0, 0, 0}), the "
"(vector, default:{0, 0, 0}), the paddings of convolution operator.") "paddings(d_pad, h_pad, w_pad) of convolution "
"operator.")
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddAttr<int>( AddAttr<int>(
"groups", "groups",
...@@ -151,6 +171,12 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, ...@@ -151,6 +171,12 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
"first half of the input channels, while the second half of the filters " "first half of the input channels, while the second half of the filters "
"is only connected to the second half of the input channels.") "is only connected to the second half of the input channels.")
.SetDefault(1); .SetDefault(1);
AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1, 1}), the "
"dilations(d_dilation, h_dilation, w_dilation) of "
"convolution operator. Currently, conv3d doesn't "
"support dilation.")
.SetDefault({1, 1, 1});
AddComment(R"DOC( AddComment(R"DOC(
Convolution3D Operator. Convolution3D Operator.
......
...@@ -27,11 +27,24 @@ using Tensor = framework::Tensor; ...@@ -27,11 +27,24 @@ using Tensor = framework::Tensor;
// Base convolution operator definations for other conv // Base convolution operator definations for other conv
// like operators to reuse the implementation. // like operators to reuse the implementation.
inline int OutputSize(int input_size, int filter_size, int padding, inline int OutputSize(int input_size, int filter_size, int dilation,
int stride) { int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1; const int dkernel = dilation * (filter_size - 1) + 1;
const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size; return output_size;
} }
// Decides whether a convolution needs its input expanded into a column
// buffer (im2col / vol2col) before the matrix multiplication.
//
// Returns false only when, for every index j in [0, strides.size()),
// filter_dim[j] == 1, strides[j] == 1, paddings[j] == 0 and
// dilations[j] == 1. Returns true as soon as any of those conditions fails.
// An empty `strides` vector therefore yields false.
//
// Preconditions: filter_dim, paddings and dilations are indexed with the same
// j as strides, so each must hold at least strides.size() elements; no bounds
// checking is performed here.
//
// The vectors are only read, so they are taken by const reference, which
// also lets callers pass const vectors or temporaries. The filter extent is
// compared as int64_t (no narrowing to int), so a very large extent whose
// low 32 bits happen to equal 1 is not mistaken for a 1-wide kernel.
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
                     const std::vector<int>& strides,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations) {
  for (size_t j = 0; j < strides.size(); ++j) {
    // Any non-unit kernel extent, non-unit stride, non-zero padding or
    // non-unit dilation means expansion is required; stop at the first one.
    if (filter_dim[j] != 1 || strides[j] != 1 || paddings[j] != 0 ||
        dilations[j] != 1) {
      return true;
    }
  }
  return false;
}
// Define Op classes in .h file so that other conv // Define Op classes in .h file so that other conv
// operator implementations can reuse the code. // operator implementations can reuse the code.
...@@ -50,14 +63,12 @@ class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -50,14 +63,12 @@ class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
class ConvOp : public framework::OperatorWithKernel { class ConvOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override; void InferShape(framework::InferShapeContext* ctx) const override;
}; };
class ConvOpGrad : public framework::OperatorWithKernel { class ConvOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override; void InferShape(framework::InferShapeContext* ctx) const override;
}; };
...@@ -73,9 +84,10 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -73,9 +84,10 @@ class GemmConvKernel : public framework::OpKernel<T> {
Tensor* output = context.Output<Tensor>("Output"); Tensor* output = context.Output<Tensor>("Output");
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
int groups = context.Attr<int>("groups");
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups"); std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -106,14 +118,17 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -106,14 +118,17 @@ class GemmConvKernel : public framework::OpKernel<T> {
framework::DDim col_matrix_shape = framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1); framework::flatten_to_2d(col_shape, filter_shape_vec.size() + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col; Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col, // col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape // but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface. // to call the matrix multiplication interface.
Tensor col_matrix; Tensor col_matrix;
if (is_expand) {
col.mutable_data<T>(col_shape, context.GetPlace());
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim( framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size())); input->dims(), 1, static_cast<int>(input->dims().size()));
...@@ -130,24 +145,30 @@ class GemmConvKernel : public framework::OpKernel<T> { ...@@ -130,24 +145,30 @@ class GemmConvKernel : public framework::OpKernel<T> {
int in_step = static_cast<int>(input->dims()[1]) / groups; int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups; int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<Place, T> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) { for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (filter_shape_vec.size() == 2) { if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (filter_shape_vec.size() == 2) {
// im2col // im2col
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col; im2col(context.device_context(), in_slice, dilations, strides,
im2col(context.device_context(), in_slice, col, strides[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]},
paddings[1]); &col);
} else if (filter_shape_vec.size() == 3) { } else if (filter_shape_vec.size() == 3) {
// vol2col // vol2col
math::Vol2ColFunctor<Place, T> vol2col; vol2col(context.device_context(), in_slice, dilations, strides,
vol2col(context.device_context(), in_slice, col, strides[0], paddings, &col);
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
} }
// gemm // gemm
...@@ -178,9 +199,10 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -178,9 +199,10 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
if (!input_grad && !filter_grad) return; if (!input_grad && !filter_grad) return;
int groups = context.Attr<int>("groups");
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
int groups = context.Attr<int>("groups"); std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -230,14 +252,17 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -230,14 +252,17 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
int in_step = static_cast<int>(input->dims()[1]) / groups; int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output_grad->dims()[1]) / groups; int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col; Tensor col;
// col_matrix shares the same piece of data with col, // col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape // but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface. // to call the matrix multiplication interface.
Tensor col_matrix; Tensor col_matrix;
if (is_expand) {
col.mutable_data<T>(col_shape, context.GetPlace()); col.mutable_data<T>(col_shape, context.GetPlace());
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
}
math::SetConstant<Place, T> set_zero; math::SetConstant<Place, T> set_zero;
...@@ -245,6 +270,9 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -245,6 +270,9 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), input_grad, static_cast<T>(0)); set_zero(context.device_context(), input_grad, static_cast<T>(0));
math::Col2VolFunctor<Place, T> col2vol;
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch = Tensor out_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_matrix_shape); output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
...@@ -254,24 +282,26 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -254,24 +282,26 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
Tensor out_grad_slice = Tensor out_grad_slice =
out_grad_batch.Slice(g * out_step, (g + 1) * out_step); out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
// col2im
Tensor in_grad_slice = Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step); in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
if (filter_shape_vec.size() == 2) { if (!is_expand) {
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im; col_matrix.ShareDataWith(in_grad_slice);
col2im(context.device_context(), in_grad_slice, col, strides[0], col_matrix.Resize(col_matrix_shape);
strides[1], paddings[0], paddings[0], paddings[1], }
paddings[1]); math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
} else if (filter_shape_vec.size() == 3) { if (is_expand && filter_shape_vec.size() == 2) {
math::Col2VolFunctor<Place, T> col2vol; col2im(context.device_context(), col, dilations, strides,
col2vol(context.device_context(), in_grad_slice, col, strides[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
strides[1], strides[2], paddings[0], paddings[1], paddings[1]},
paddings[2]); &in_grad_slice);
} else if (is_expand && filter_shape_vec.size() == 3) {
col2vol(context.device_context(), col, dilations, strides, paddings,
&in_grad_slice);
} }
} }
} }
...@@ -282,7 +312,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -282,7 +312,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
Tensor filter_grad_ = *filter_grad; Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape); filter_grad_.Resize(filter_matrix_shape);
set_zero(context.device_context(), filter_grad, static_cast<T>(0)); set_zero(context.device_context(), filter_grad, static_cast<T>(0));
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
math::Vol2ColFunctor<Place, T> vol2col;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch = Tensor out_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_matrix_shape); output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
...@@ -293,16 +324,18 @@ class GemmConvGradKernel : public framework::OpKernel<T> { ...@@ -293,16 +324,18 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
out_grad_batch.Slice(g * out_step, (g + 1) * out_step); out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (filter_shape_vec.size() == 2) { if (!is_expand) {
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col; col.ShareDataWith(in_slice);
im2col(context.device_context(), in_slice, col, strides[0], col_matrix.ShareDataWith(col);
strides[1], paddings[0], paddings[0], paddings[1], col_matrix.Resize(col_matrix_shape);
paddings[1]); } else if (filter_shape_vec.size() == 2) {
im2col(context.device_context(), in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (filter_shape_vec.size() == 3) { } else if (filter_shape_vec.size() == 3) {
math::Vol2ColFunctor<Place, T> vol2col; vol2col(context.device_context(), in_slice, dilations, strides,
vol2col(context.device_context(), in_slice, col, strides[0], paddings, &col);
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
} }
// gemm // gemm
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
limitations under the License. */ limitations under the License. */
#include "paddle/operators/conv_shift_op.h" #include "paddle/operators/conv_shift_op.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
namespace paddle { namespace paddle {
...@@ -22,7 +23,7 @@ using framework::Tensor; ...@@ -22,7 +23,7 @@ using framework::Tensor;
namespace { namespace {
inline int div_up(int x, int y) { return (x + y - 1) / y; } inline int DivUp(int x, int y) { return (x + y - 1) / y; }
// Some notes on the design: // Some notes on the design:
// //
...@@ -33,9 +34,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; } ...@@ -33,9 +34,9 @@ inline int div_up(int x, int y) { return (x + y - 1) / y; }
// y is fairly small. For large y, it would probably be more efficient // y is fairly small. For large y, it would probably be more efficient
// to also tile across y. // to also tile across y.
template <typename T> template <typename T>
__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width, __global__ void ConvShiftForward(const T *x, const T *y, int x_width,
int y_width, int y_half_width, int y_width, int y_half_width, int batch_size,
int batch_size) { T *out) {
extern __shared__ T mem[]; extern __shared__ T mem[];
int tx = threadIdx.x; int tx = threadIdx.x;
...@@ -62,11 +63,10 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width, ...@@ -62,11 +63,10 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
if (tx < num_x) { if (tx < num_x) {
int load_i = (i - y_half_width + x_width) % x_width; int load_i = (i - y_half_width + x_width) % x_width;
sx[tx] = x[k * x_width + load_i]; sx[tx] = x[k * x_width + load_i];
} else {
return;
} }
__syncthreads(); __syncthreads();
if (tx < num_x) {
// Compute dot product of sx[tx:tx + y_width] and sy. // Compute dot product of sx[tx:tx + y_width] and sy.
T sum = 0; T sum = 0;
for (int j = 0; j < y_width; ++j) { for (int j = 0; j < y_width; ++j) {
...@@ -75,12 +75,14 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width, ...@@ -75,12 +75,14 @@ __global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
// Save to out[k, i]. // Save to out[k, i].
out[k * x_width + i] = sum; out[k * x_width + i] = sum;
}
} }
// Compute x gradient - initial naive implementation with atomic add. // Compute x gradient - initial naive implementation with atomic add.
template <typename T> template <typename T>
__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width, __global__ void ConvShiftGradX(const T *dout, const T *y, int x_width,
int y_width, int y_half_width, int batch_size) { int y_width, int y_half_width, int batch_size,
T *dx) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // x index int i = blockIdx.x * blockDim.x + threadIdx.x; // x index
int j = blockIdx.y; // y index int j = blockIdx.y; // y index
int k = blockIdx.z; // batch index int k = blockIdx.z; // batch index
...@@ -94,8 +96,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width, ...@@ -94,8 +96,8 @@ __global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
// Compute y gradient - initial naive implementation with atomic add. // Compute y gradient - initial naive implementation with atomic add.
template <typename T> template <typename T>
__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width, __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width,
int y_width, int y_half_width, int batch_size) { int y_half_width, int batch_size, T *dy) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // x index int i = blockIdx.x * blockDim.x + threadIdx.x; // x index
int j = blockIdx.y; // y index int j = blockIdx.y; // y index
int k = blockIdx.z; // batch index int k = blockIdx.z; // batch index
...@@ -125,15 +127,15 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> { ...@@ -125,15 +127,15 @@ class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
int y_half_width = (y_width - 1) / 2; int y_half_width = (y_width - 1) / 2;
const int x_per_block = 256; const int x_per_block = 256;
int num_x_blocks = div_up(x_width, x_per_block); int num_x_blocks = DivUp(x_width, x_per_block);
int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
dim3 grid_dim(num_x_blocks, batch_size); dim3 grid_dim(num_x_blocks, batch_size);
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
conv_shift_forward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>( ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size); x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
} }
}; };
...@@ -157,25 +159,26 @@ class ConvShiftGradKernel<platform::GPUPlace, T> ...@@ -157,25 +159,26 @@ class ConvShiftGradKernel<platform::GPUPlace, T>
int y_width = Y->dims()[1]; int y_width = Y->dims()[1];
int y_half_width = (y_width - 1) / 2; int y_half_width = (y_width - 1) / 2;
auto stream = context.cuda_device_context().stream(); auto &device_ctx = context.cuda_device_context();
math::SetConstant<platform::GPUPlace, T> zero;
const int x_per_block = 256; const int x_per_block = 256;
int num_x_blocks = div_up(x_width, x_per_block); int num_x_blocks = DivUp(x_width, x_per_block);
dim3 grid_dim(num_x_blocks, y_width, batch_size); dim3 grid_dim(num_x_blocks, y_width, batch_size);
if (dX) { if (dX) {
T *dx_data = dX->mutable_data<T>(context.GetPlace()); T *dx_data = dX->mutable_data<T>(context.GetPlace());
cudaMemsetAsync(dx_data, 0, dX->numel() * sizeof(T), stream); zero(device_ctx, dX, static_cast<T>(0.0));
conv_shift_dx<T><<<grid_dim, x_per_block, 0, stream>>>( ConvShiftGradX<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
dout_data, y_data, dx_data, x_width, y_width, y_half_width, dout_data, y_data, x_width, y_width, y_half_width, batch_size,
batch_size); dx_data);
} }
if (dY) { if (dY) {
T *dy_data = dY->mutable_data<T>(context.GetPlace()); T *dy_data = dY->mutable_data<T>(context.GetPlace());
cudaMemsetAsync(dy_data, 0, dY->numel() * sizeof(T), stream); zero(device_ctx, dY, static_cast<T>(0.0));
conv_shift_dy<T><<<grid_dim, x_per_block, 0, stream>>>( ConvShiftDy<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
x_data, dout_data, dy_data, x_width, y_width, y_half_width, x_data, dout_data, x_width, y_width, y_half_width, batch_size,
batch_size); dy_data);
} }
} }
}; };
......
...@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,11 +30,6 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (size_t i = 0; i < paddings.size(); ++i) {
PADDLE_ENFORCE_EQ(paddings[i], 0,
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor."); "ConvTransposeOp intput should be 4-D or 5-D tensor.");
PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
...@@ -51,8 +46,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -51,8 +46,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
"as the number of filters."); "as the number of filters.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]}); std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
for (size_t i = 0; i < paddings.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
filter_dims[i + 2]); filter_dims[i + 2]);
} }
ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
...@@ -79,11 +74,13 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( ...@@ -79,11 +74,13 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
"(vector defalut:{1, 1}), strides of convolution transpose operator.") "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of "
"convolution transpose operator.")
.SetDefault({1, 1}); .SetDefault({1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"(vector defalut:{0, 0}), paddings of convolution transpose operator.") "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
"transpose operator.")
.SetDefault({0, 0}); .SetDefault({0, 0});
AddComment(R"DOC( AddComment(R"DOC(
Convolution2D Transpose Operator. Convolution2D Transpose Operator.
...@@ -132,13 +129,14 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( ...@@ -132,13 +129,14 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
"Where N is batch size, C is " "Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("strides",
"strides", "(vector<int> defalut:{1, 1, 1}), the "
"(vector defalut:{1, 1, 1}), strides of convolution transpose operator.") "strides{d_stride, h_stride, w_stride} of "
"convolution transpose operator.")
.SetDefault({1, 1, 1}); .SetDefault({1, 1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("paddings",
"paddings", "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, "
"(vector defalut:{0, 0, 0}), paddings of convolution transpose operator.") "h_pad, w_pad) of convolution transpose operator.")
.SetDefault({0, 0, 0}); .SetDefault({0, 0, 0});
AddComment(R"DOC( AddComment(R"DOC(
Convolution3D Transpose Operator. Convolution3D Transpose Operator.
......
...@@ -43,16 +43,12 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -43,16 +43,12 @@ class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
class ConvTransposeOp : public framework::OperatorWithKernel { class ConvTransposeOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override; void InferShape(framework::InferShapeContext* ctx) const override;
}; };
class ConvTransposeOpGrad : public framework::OperatorWithKernel { class ConvTransposeOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override; void InferShape(framework::InferShapeContext* ctx) const override;
}; };
...@@ -66,6 +62,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -66,6 +62,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
Tensor* output = context.Output<Tensor>("Output"); Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
// TODO(Zhuoyuan): Paddings can be added in future. // TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2dtranspose. // groups will alway be disabled in conv2dtranspose.
...@@ -120,6 +117,10 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -120,6 +117,10 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
math::SetConstant<Place, T> set_zero; math::SetConstant<Place, T> set_zero;
set_zero(context.device_context(), output, static_cast<T>(0)); set_zero(context.device_context(), output, static_cast<T>(0));
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im;
math::Col2VolFunctor<Place, T> col2vol;
std::vector<int> dilations({1, 1, 1});
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on input) // on input)
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
...@@ -138,16 +139,16 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -138,16 +139,16 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
if (filter_shape_vec.size() == 2) { if (filter_shape_vec.size() == 2) {
// col2im: col_matrix -> dy // col2im: col_matrix -> dy
// from (c * k_h * k_w, h * w) to (c, o_h, o_w) // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
math::Col2ImFunctor<math::ColFormat::kCFO, Place, T> col2im; col2im(context.device_context(), col,
std::vector<int>{dilations[0], dilations[1]}, strides,
col2im(context.device_context(), output_batch, col, strides[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
strides[1], 0, 0, 0, 0); paddings[1]},
&output_batch);
} else if (filter_shape_vec.size() == 3) { } else if (filter_shape_vec.size() == 3) {
// col2vol: col_matrix -> dy // col2vol: col_matrix -> dy
// from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
math::Col2VolFunctor<Place, T> col2vol; col2vol(context.device_context(), col, dilations, strides, paddings,
col2vol(context.device_context(), output_batch, col, strides[0], &output_batch);
strides[1], strides[2], 0, 0, 0);
} }
} }
} }
...@@ -171,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -171,7 +172,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
if ((!input_grad) && (!filter_grad)) return; if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -228,6 +228,10 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -228,6 +228,10 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
Tensor filter_grad_; Tensor filter_grad_;
math::SetConstant<Place, T> set_zero; math::SetConstant<Place, T> set_zero;
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col;
math::Vol2ColFunctor<Place, T> vol2col;
std::vector<int> dilations({1, 1, 1});
if (input_grad) { if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
set_zero(context.device_context(), input_grad, static_cast<T>(0)); set_zero(context.device_context(), input_grad, static_cast<T>(0));
...@@ -247,17 +251,16 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -247,17 +251,16 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
if (filter_shape_vec.size() == 2) { if (filter_shape_vec.size() == 2) {
// im2col: dy -> col matrix // im2col: dy -> col matrix
// from (c, o_h, o_w) to (c * k_h * k_w, h * w) // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
math::Im2ColFunctor<math::ColFormat::kCFO, Place, T> im2col; im2col(context.device_context(), output_grad_batch,
im2col(context.device_context(), output_grad_batch, col, strides[0], std::vector<int>{dilations[0], dilations[1]}, strides,
strides[1], paddings[0], paddings[0], paddings[1], std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]); paddings[1]},
&col);
} else if (filter_shape_vec.size() == 3) { } else if (filter_shape_vec.size() == 3) {
// vol2col: dy -> col_matrix // vol2col: dy -> col_matrix
// from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
math::Vol2ColFunctor<Place, T> vol2col; vol2col(context.device_context(), output_grad_batch, dilations,
vol2col(context.device_context(), output_grad_batch, col, strides[0], strides, paddings, &col);
strides[1], strides[2], paddings[0], paddings[1],
paddings[2]);
} }
if (input_grad) { if (input_grad) {
......
...@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -132,7 +132,7 @@ class CosSimGradKernel : public framework::OpKernel<T> {
// compute dy // compute dy
if (out_grad_y) { if (out_grad_y) {
out_grad_y->mutable_data<T>(context.GetPlace()); out_grad_y->mutable_data<T>(context.GetPlace());
auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1); auto dy = EigenVector<T>::Flatten(*out_grad_y);
auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}})); dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}}));
} }
......
...@@ -23,8 +23,6 @@ template <typename T> ...@@ -23,8 +23,6 @@ template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X, __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const int64_t* label, const int N, const int64_t* label, const int N,
const int D) { const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) { i += blockDim.x * gridDim.x) {
int idx = i * D + label[i]; int idx = i * D + label[i];
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace operators {
namespace detail {
/**
* Turn a pointer into a reference, checking the pointer first.
*
* `ptr` is the pointer to dereference. `args` are passed on, unchanged and as
* lvalues (they are not std::forward-ed), to PADDLE_ENFORCE as its message
* arguments; the message is expected to be in printf format.
*
* Returns `*ptr`. The check is `PADDLE_ENFORCE(ptr != nullptr, ...)`.
* NOTE(review): PADDLE_ENFORCE is defined outside this header and no include
* for it appears here — presumably it reports the failure and does not fall
* through to the dereference; confirm against the enforce header, and confirm
* that every includer brings that header in first.
*/
template <typename T, typename... ARGS>
inline T &Ref(T *ptr, ARGS &&... args) {
// Validate before dereferencing; the caller-supplied message goes with it.
PADDLE_ENFORCE(ptr != nullptr, args...);
return *ptr;
}
} // namespace detail
} // namespace operators
} // namespace paddle
...@@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ...@@ -101,4 +101,7 @@ REGISTER_OPERATOR(fill_constant_batch_size_like,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>); ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace, int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUPlace,
int64_t>);
...@@ -12,11 +12,14 @@ ...@@ -12,11 +12,14 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/fill_constant_batch_size_like_op.h" #include "paddle/operators/fill_constant_batch_size_like_op.h"
#include "paddle/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
fill_constant_batch_size_like, fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>, ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, float>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>); ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, double>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace, int>,
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::GPUPlace,
int64_t>);
...@@ -54,5 +54,8 @@ namespace ops = paddle::operators; ...@@ -54,5 +54,8 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
ops::FillZerosLikeOpMaker); ops::FillZerosLikeOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
fill_zeros_like, fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>); ops::FillZerosLikeKernel<paddle::platform::CPUPlace, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, double>,
ops::FillZerosLikeKernel<paddle::platform::CPUPlace, bool>);
...@@ -12,10 +12,13 @@ ...@@ -12,10 +12,13 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/fill_zeros_like_op.h" #include "paddle/operators/fill_zeros_like_op.h"
#include "paddle/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
fill_zeros_like, fill_zeros_like, ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>); ops::FillZerosLikeKernel<paddle::platform::GPUPlace, int64_t>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, double>,
ops::FillZerosLikeKernel<paddle::platform::GPUPlace, bool>);
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/gru_op.h" #include "paddle/operators/gru_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -27,10 +27,6 @@ namespace operators { ...@@ -27,10 +27,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class GRUKernel : public framework::OpKernel<T> { class GRUKernel : public framework::OpKernel<T> {
public: public:
...@@ -57,19 +53,15 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -57,19 +53,15 @@ class GRUKernel : public framework::OpKernel<T> {
bool is_reverse = context.Attr<bool>("is_reverse"); bool is_reverse = context.Attr<bool>("is_reverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch; math::LoDTensor2BatchFunctor<Place, T> to_batch;
to_batch(context.device_context(), *input, *batch_gate, true, is_reverse); auto& dev_ctx = context.device_context();
to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
int frame_size = hidden_dims[1];
int batch_size = hidden_dims[0];
auto g = EigenMatrix<T>::From(*batch_gate);
auto place = context.GetEigenDevice<Place>();
if (bias) { if (bias) {
auto b = EigenMatrix<T>::From(*bias); math::RowwiseAdd<Place, T> add_bias;
g.device(place) = g + add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
.broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
} }
int frame_size = hidden_dims[1];
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gateWeight = const_cast<T*>(weight_data);
gru_value.stateWeight = gru_value.stateWeight =
...@@ -89,7 +81,7 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -89,7 +81,7 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.gateValue = gate_t.data<T>(); gru_value.gateValue = gate_t.data<T>();
gru_value.resetOutputValue = reset_hidden_prev_t.data<T>(); gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<Place, T>::compute( math::GRUUnitFunctor<Place, T>::compute(
context.device_context(), gru_value, frame_size, cur_batch_size, dev_ctx, gru_value, frame_size, cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")), math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation"))); math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prevOutValue = gru_value.outputValue; gru_value.prevOutValue = gru_value.outputValue;
...@@ -97,7 +89,7 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -97,7 +89,7 @@ class GRUKernel : public framework::OpKernel<T> {
math::Batch2LoDTensorFunctor<Place, T> to_seq; math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_hidden->set_lod(batch_gate->lod()); batch_hidden->set_lod(batch_gate->lod());
to_seq(context.device_context(), *batch_hidden, *hidden); to_seq(dev_ctx, *batch_hidden, *hidden);
} }
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
...@@ -138,15 +130,14 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -138,15 +130,14 @@ class GRUGradKernel : public framework::OpKernel<T> {
batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims, batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
context.GetPlace()); context.GetPlace());
math::SetConstant<Place, T> zero; math::SetConstant<Place, T> zero;
zero(context.device_context(), &batch_hidden_grad, static_cast<T>(0.0)); auto& dev_ctx = context.device_context();
zero(context.device_context(), &batch_gate_grad, static_cast<T>(0.0)); zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0.0));
zero(context.device_context(), &batch_reset_hidden_prev_grad, zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
static_cast<T>(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
bool is_reverse = context.Attr<bool>("is_reverse"); bool is_reverse = context.Attr<bool>("is_reverse");
batch_hidden_grad.set_lod(batch_hidden->lod()); batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(context.device_context(), *hidden_grad, batch_hidden_grad, false, to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
is_reverse);
math::hl_gru_value<T> gru_value; math::hl_gru_value<T> gru_value;
gru_value.gateWeight = const_cast<T*>(weight_data); gru_value.gateWeight = const_cast<T*>(weight_data);
...@@ -157,7 +148,7 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -157,7 +148,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
if (weight_grad) { if (weight_grad) {
gru_grad.gateWeightGrad = gru_grad.gateWeightGrad =
weight_grad->mutable_data<T>(context.GetPlace()); weight_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), weight_grad, static_cast<T>(0.0)); zero(dev_ctx, weight_grad, static_cast<T>(0.0));
gru_grad.stateWeightGrad = gru_grad.stateWeightGrad =
weight_grad->data<T>() + 2 * frame_size * frame_size; weight_grad->data<T>() + 2 * frame_size * frame_size;
} else { } else {
...@@ -188,7 +179,7 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -188,7 +179,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
gru_value.prevOutValue = const_cast<T*>(h0_data); gru_value.prevOutValue = const_cast<T*>(h0_data);
if (h0_grad) { if (h0_grad) {
T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace()); T* h0_grad_data = h0_grad->mutable_data<T>(context.GetPlace());
zero(context.device_context(), h0_grad, static_cast<T>(0.0)); zero(dev_ctx, h0_grad, static_cast<T>(0.0));
gru_grad.prevOutGrad = h0_grad_data; gru_grad.prevOutGrad = h0_grad_data;
} else { } else {
gru_grad.prevOutGrad = nullptr; gru_grad.prevOutGrad = nullptr;
...@@ -202,8 +193,7 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -202,8 +193,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
} }
math::GRUUnitGradFunctor<Place, T>::compute( math::GRUUnitGradFunctor<Place, T>::compute(
context.device_context(), gru_value, gru_grad, frame_size, dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size,
cur_batch_size,
math::ActiveType(context.Attr<std::string>("activation")), math::ActiveType(context.Attr<std::string>("activation")),
math::ActiveType(context.Attr<std::string>("gate_activation"))); math::ActiveType(context.Attr<std::string>("gate_activation")));
} }
...@@ -211,14 +201,12 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -211,14 +201,12 @@ class GRUGradKernel : public framework::OpKernel<T> {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
math::Batch2LoDTensorFunctor<Place, T> to_seq; math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_gate_grad.set_lod(batch_gate->lod()); batch_gate_grad.set_lod(batch_gate->lod());
to_seq(context.device_context(), batch_gate_grad, *input_grad); to_seq(dev_ctx, batch_gate_grad, *input_grad);
} }
if (bias_grad) { if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace()); bias_grad->mutable_data<T>(context.GetPlace());
auto d_b = EigenMatrix<T>::From(*bias_grad); math::ColwiseSum<Place, T> col_sum;
auto d_g = EigenMatrix<T>::From(batch_gate_grad); col_sum(dev_ctx, batch_gate_grad, bias_grad);
auto place = context.GetEigenDevice<Place>();
d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
} }
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
constexpr char kInput[] = "X";
constexpr char kOutput[] = "Out";
// Operator that writes a single boolean telling whether the input tensor has
// zero elements. It derives from OperatorBase and implements Run() directly.
class IsEmptyOp : public framework::OperatorBase {
 public:
  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
            const framework::VariableNameMap &outputs,
            const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Looks up the input and output variables in `scope`, resizes the output to
  // shape {1}, and stores `product(input dims) == 0` in its only element.
  // `dev_ctx` is not used; the output buffer is requested on CPUPlace.
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
    // Input side: the variable must exist and hold a LoDTensor.
    auto *var = scope.FindVar(Input(kInput));
    PADDLE_ENFORCE_NOT_NULL(var);
    const framework::LoDTensor &in_tensor = var->Get<framework::LoDTensor>();

    // Output side: the variable must exist; the tensor is created on demand.
    auto *out = scope.FindVar(Output(kOutput));
    PADDLE_ENFORCE_NOT_NULL(out);
    framework::LoDTensor *result = out->GetMutable<framework::LoDTensor>();

    // The output is resized before the input dims are read, as before.
    result->Resize({1});
    bool *flag = result->mutable_data<bool>(platform::CPUPlace());
    const auto element_count = framework::product(in_tensor.dims());
    flag[0] = (element_count == 0);
  }
};
// Declares the proto of the is_empty operator: one input (kInput), one output
// (kOutput), and the user-facing operator comment.
class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  IsEmptyOpProtoMaker(framework::OpProto *proto,
                      framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
    // The comment must describe what IsEmptyOp::Run computes: the output is
    // true when the product of the input dims is zero. The previous text
    // stated the opposite predicate ("> 0") and misspelled dims() as ddims().
    AddComment(R"DOC(
IsEmpty Operator which checks whether a tensor is empty.
It will just return product(tensor.dims()) == 0;
)DOC");
  }
};
} // namespace operators
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
paddle::operators::IsEmptyOpProtoMaker);
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/lstm_op.h" #include "paddle/operators/lstm_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -24,10 +24,6 @@ namespace operators { ...@@ -24,10 +24,6 @@ namespace operators {
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
inline void ReorderInitState(const platform::DeviceContext& ctx, inline void ReorderInitState(const platform::DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src, const size_t* index,
...@@ -65,16 +61,11 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -65,16 +61,11 @@ class LSTMKernel : public framework::OpKernel<T> {
framework::DDim dims({in_dims[0], frame_size}); framework::DDim dims({in_dims[0], frame_size});
if (bias) { if (bias) {
Eigen::array<int, 2> extents({{1, 4 * frame_size}}); Tensor b = *bias;
Eigen::array<int, 2> offsets({{0, 0}}); b.Resize({bias->numel(), 1});
auto b = EigenMatrix<T>::From(*bias); Tensor gate_bias = b.Slice(0, 4 * frame_size);
auto gate = EigenMatrix<T>::From(*batch_gate); math::RowwiseAdd<Place, T> add_bias;
gate.device(ctx.GetEigenDevice<Place>()) = add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
gate +
b.slice(offsets, extents)
.reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
.broadcast(
Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
} }
math::LstmMetaValue<T> lstm_value; math::LstmMetaValue<T> lstm_value;
...@@ -350,16 +341,11 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -350,16 +341,11 @@ class LSTMGradKernel : public framework::OpKernel<T> {
} }
if (bias && bias_g) { if (bias && bias_g) {
/* backward bias */ /* backward bias */
int m = static_cast<int>(batch_gate_g.dims()[0]); Tensor b_g = *bias_g;
int n = static_cast<int>(batch_gate_g.dims()[1]); b_g.Resize({bias_g->numel(), 1});
Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
Tensor ones; math::ColwiseSum<Place, T> col_sum;
ones.mutable_data<T>({m}, ctx.GetPlace()); col_sum(device_ctx, batch_gate_g, &gate_bias_g);
math::SetConstant<Place, T> set;
set(device_ctx, &ones, static_cast<T>(1.0));
math::gemv<Place, T>(device_ctx, true, m, n, 1., batch_gate_g.data<T>(),
ones.data<T>(), 0., bias_g->data<T>());
} }
if (h0 && h0_g) { if (h0 && h0_g) {
......
add_subdirectory(detail) add_subdirectory(detail)
if(WITH_GPU) if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(softmax SRCS softmax.cc DEPS device_context)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context)
cc_library(context_project SRCS context_project.cc DEPS device_context) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
......
...@@ -14,9 +14,9 @@ limitations under the License. */ ...@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/operators/math/im2col.h" #include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -24,9 +24,6 @@ namespace math { ...@@ -24,9 +24,6 @@ namespace math {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
/* /*
* \brief Context projection concatenates features in adjacent time-steps in * \brief Context projection concatenates features in adjacent time-steps in
...@@ -88,13 +85,18 @@ template <typename Place, typename T> ...@@ -88,13 +85,18 @@ template <typename Place, typename T>
class ContextProjectFunctor { class ContextProjectFunctor {
public: public:
void operator()(const platform::DeviceContext& context, const LoDTensor& in, void operator()(const platform::DeviceContext& context, const LoDTensor& in,
const Tensor& padding_data, Tensor& col, const Tensor& padding_data, bool padding_trainable,
bool padding_trainable, int context_start, int context_length, const int context_start, const int context_length,
int context_stride, int up_pad, int down_pad) { const int context_stride, const int up_pad,
const int down_pad, Tensor* col) {
auto lod_level_0 = in.lod()[0]; auto lod_level_0 = in.lod()[0];
math::Im2ColFunctor<math::ColFormat::kOCF, Place, float> im2col_ocf; math::Im2ColFunctor<math::ColFormat::kOCF, Place, float> im2col_ocf;
std::vector<int> dilation({1, 1});
std::vector<int> padding({up_pad, 0, down_pad, 0});
std::vector<int> stride({context_stride, 1});
int input_row_begin, input_row_end; int input_row_begin, input_row_end;
int sequence_height, sequence_width; int sequence_height, sequence_width;
sequence_width = in.dims()[1]; sequence_width = in.dims()[1];
...@@ -105,7 +107,7 @@ class ContextProjectFunctor { ...@@ -105,7 +107,7 @@ class ContextProjectFunctor {
: static_cast<int>(lod_level_0[i]); : static_cast<int>(lod_level_0[i]);
input_row_end = static_cast<int>(lod_level_0[i + 1]); input_row_end = static_cast<int>(lod_level_0[i + 1]);
Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]), Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
static_cast<int>(lod_level_0[i + 1])); static_cast<int>(lod_level_0[i + 1]));
sequence_height = static_cast<int>(out_t.dims()[0]); sequence_height = static_cast<int>(out_t.dims()[0]);
...@@ -123,16 +125,13 @@ class ContextProjectFunctor { ...@@ -123,16 +125,13 @@ class ContextProjectFunctor {
{1, input_row_end - input_row_begin, {1, input_row_end - input_row_begin,
sequence_width}); // input_channels, input_height, input_width sequence_width}); // input_channels, input_height, input_width
in_t.Resize(framework::make_ddim(input_shape)); in_t.Resize(framework::make_ddim(input_shape));
im2col_ocf(context, in_t, dilation, stride, padding, &out_t);
im2col_ocf(context, in_t, out_t,
/*stride_height*/ context_stride, /*stride_width*/ 1, up_pad,
down_pad, 0, 0);
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
} }
} }
if (padding_trainable) { if (padding_trainable) {
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]), Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
static_cast<int>(lod_level_0[i + 1])); static_cast<int>(lod_level_0[i + 1]));
sequence_height = static_cast<int>(out_t.dims()[0]); sequence_height = static_cast<int>(out_t.dims()[0]);
...@@ -150,9 +149,7 @@ class ContextProjectFunctor { ...@@ -150,9 +149,7 @@ class ContextProjectFunctor {
Tensor out_t_sub = out_t.Slice(k * context_length, Tensor out_t_sub = out_t.Slice(k * context_length,
k * context_length + padding_size); k * context_length + padding_size);
Tensor w_sub = padding_data.Slice(k, k + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size);
auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub); out_t_sub.CopyFrom(w_sub, context.GetPlace(), context);
auto w_sub_e = EigenMatrix<T>::From(w_sub);
out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
} }
} }
if (down_pad > 0) { // add down pad if (down_pad > 0) { // add down pad
...@@ -182,9 +179,7 @@ class ContextProjectFunctor { ...@@ -182,9 +179,7 @@ class ContextProjectFunctor {
(down_pad_begin_row + t) * context_length); (down_pad_begin_row + t) * context_length);
Tensor w_sub = padding_data.Slice( Tensor w_sub = padding_data.Slice(
up_pad + padding_idx, up_pad + padding_idx + padding_size); up_pad + padding_idx, up_pad + padding_idx + padding_size);
auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub); out_t_sub.CopyFrom(w_sub, context.GetPlace(), context);
auto w_sub_e = EigenMatrix<T>::From(w_sub);
out_t_sub_e.device(*context.GetEigenDevice<Place>()) = w_sub_e;
} }
} }
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
...@@ -196,14 +191,19 @@ class ContextProjectFunctor { ...@@ -196,14 +191,19 @@ class ContextProjectFunctor {
template <typename Place, typename T> template <typename Place, typename T>
class ContextProjectGradFunctor { class ContextProjectGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, LoDTensor& in, void operator()(const platform::DeviceContext& context, const LoDTensor& in,
Tensor& padding_data, Tensor& col, bool padding_trainable, bool padding_trainable, const int context_start,
int context_start, int context_length, int context_stride, const int context_length, const int context_stride,
int up_pad, int down_pad, bool input_grad, bool pad_grad) { const int up_pad, const int down_pad, bool pad_grad,
bool input_grad, Tensor* padding_data, Tensor* col) {
auto lod_level_0 = in.lod()[0]; auto lod_level_0 = in.lod()[0];
math::Col2ImFunctor<math::ColFormat::kOCF, Place, float> col2im_ocf; math::Col2ImFunctor<math::ColFormat::kOCF, Place, float> col2im_ocf;
std::vector<int> dilation({1, 1});
std::vector<int> padding({up_pad, 0, down_pad, 0});
std::vector<int> stride({context_stride, 1});
int input_row_begin, input_row_end; int input_row_begin, input_row_end;
int sequence_height, sequence_width; int sequence_height, sequence_width;
sequence_width = in.dims()[1]; sequence_width = in.dims()[1];
...@@ -215,7 +215,7 @@ class ContextProjectGradFunctor { ...@@ -215,7 +215,7 @@ class ContextProjectGradFunctor {
: static_cast<int>(lod_level_0[i]); : static_cast<int>(lod_level_0[i]);
input_row_end = static_cast<int>(lod_level_0[i + 1]); input_row_end = static_cast<int>(lod_level_0[i + 1]);
Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]), Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
static_cast<int>(lod_level_0[i + 1])); static_cast<int>(lod_level_0[i + 1]));
sequence_height = static_cast<int>(out_t.dims()[0]); sequence_height = static_cast<int>(out_t.dims()[0]);
...@@ -234,9 +234,7 @@ class ContextProjectGradFunctor { ...@@ -234,9 +234,7 @@ class ContextProjectGradFunctor {
sequence_width}); // input_channels, input_height, input_width sequence_width}); // input_channels, input_height, input_width
in_t.Resize(framework::make_ddim(input_shape)); in_t.Resize(framework::make_ddim(input_shape));
col2im_ocf(context, in_t, out_t, col2im_ocf(context, out_t, dilation, stride, padding, &in_t);
/*stride_height*/ context_stride, /*stride_width*/ 1,
up_pad, down_pad, 0, 0);
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
} }
} }
...@@ -244,7 +242,7 @@ class ContextProjectGradFunctor { ...@@ -244,7 +242,7 @@ class ContextProjectGradFunctor {
if (pad_grad) { if (pad_grad) {
if (padding_trainable) { if (padding_trainable) {
for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) { for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
Tensor out_t = col.Slice(static_cast<int>(lod_level_0[i]), Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
static_cast<int>(lod_level_0[i + 1])); static_cast<int>(lod_level_0[i + 1]));
sequence_height = static_cast<int>(out_t.dims()[0]); sequence_height = static_cast<int>(out_t.dims()[0]);
...@@ -259,11 +257,9 @@ class ContextProjectGradFunctor { ...@@ -259,11 +257,9 @@ class ContextProjectGradFunctor {
k + context_length < up_pad ? context_length : up_pad - k; k + context_length < up_pad ? context_length : up_pad - k;
Tensor out_t_sub = out_t.Slice(k * context_length, Tensor out_t_sub = out_t.Slice(k * context_length,
k * context_length + padding_size); k * context_length + padding_size);
Tensor w_sub = padding_data.Slice(k, k + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size);
auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub); axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
auto w_sub_e = EigenMatrix<T>::From(w_sub); out_t_sub.data<T>(), w_sub.data<T>());
w_sub_e.device(*context.GetEigenDevice<Place>()) =
w_sub_e + out_t_sub_e;
} }
} }
if (down_pad > 0) { if (down_pad > 0) {
...@@ -292,12 +288,10 @@ class ContextProjectGradFunctor { ...@@ -292,12 +288,10 @@ class ContextProjectGradFunctor {
Tensor out_t_sub = out_t.Slice( Tensor out_t_sub = out_t.Slice(
(down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length - padding_size,
(down_pad_begin_row + t) * context_length); (down_pad_begin_row + t) * context_length);
Tensor w_sub = padding_data.Slice( Tensor w_sub = padding_data->Slice(
up_pad + padding_idx, up_pad + padding_idx + padding_size); up_pad + padding_idx, up_pad + padding_idx + padding_size);
auto out_t_sub_e = EigenMatrix<T>::From(out_t_sub); axpy<Place, T>(context, w_sub.numel(), static_cast<T>(1),
auto w_sub_e = EigenMatrix<T>::From(w_sub); out_t_sub.data<T>(), w_sub.data<T>());
w_sub_e.device(*context.GetEigenDevice<Place>()) =
w_sub_e + out_t_sub_e;
} }
} }
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
#pragma once #pragma once
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/hostdevice.h" #include "paddle/platform/hostdevice.h"
......
...@@ -28,57 +28,55 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -28,57 +28,55 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, T> { platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col, const framework::Tensor& im, const std::vector<int>& dilation,
int stride_height, int stride_width, int padding_up, const std::vector<int>& stride,
int padding_down, int padding_left, int padding_right) { const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col->dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im.dims()[0];
int input_height = im.dims()[1]; int im_height = im.dims()[1];
int input_width = im.dims()[2]; int im_width = im.dims()[2];
int filter_height = col.dims()[1]; int filter_height = col->dims()[1];
int filter_width = col.dims()[2]; int filter_width = col->dims()[2];
int output_height = col.dims()[3]; int col_height = col->dims()[3];
int output_width = col.dims()[4]; int col_width = col->dims()[4];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(input_height + padding_up + padding_down - filter_height) / ((dilation[0] * (filter_height - 1) + 1))) /
stride_height + stride[0] +
1, 1,
output_height, col_height,
"Output_height and padding(padding_up, padding_down) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(input_width + padding_left + padding_right - filter_width) / ((dilation[1] * (filter_width - 1) + 1))) /
stride_width + stride[1] +
1, 1,
output_width, col_width,
"output_width and padding(padding_left, padding_right) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
int channels_col = input_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col.data<T>(); T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
int c_im = c / filter_width / filter_height; int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) { for (int h = 0; h < col_height; ++h) {
for (int w = 0; w < output_width; ++w) { for (int w = 0; w < col_width; ++w) {
int im_row_idx = h * stride_height + h_offset - padding_up; int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
int im_col_idx = w * stride_width + w_offset - padding_left; int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * col_height + h) * col_width + w;
int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 || col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx >= input_width) { im_col_idx < 0 || im_col_idx >= im_width)
col_data[(c * output_height + h) * output_width + w] = T(0); ? static_cast<T>(0)
} else { : im_data[im_idx];
im_row_idx += c_im * input_height;
col_data[(c * output_height + h) * output_width + w] =
im_data[im_row_idx * input_width + im_col_idx];
}
} }
} }
} }
...@@ -94,54 +92,55 @@ template <class T> ...@@ -94,54 +92,55 @@ template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, T> { platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im, void operator()(const platform::DeviceContext& context,
const framework::Tensor& col, int stride_height, const framework::Tensor& col,
int stride_width, int padding_up, int padding_down, const std::vector<int>& dilation,
int padding_left, int padding_right) { const std::vector<int>& stride,
PADDLE_ENFORCE(im.dims().size() == 3); const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im->dims()[0];
int input_height = im.dims()[1]; int im_height = im->dims()[1];
int input_width = im.dims()[2]; int im_width = im->dims()[2];
int filter_height = col.dims()[1]; int filter_height = col.dims()[1];
int filter_width = col.dims()[2]; int filter_width = col.dims()[2];
int output_height = col.dims()[3]; int col_height = col.dims()[3];
int output_width = col.dims()[4]; int col_width = col.dims()[4];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
(input_height + padding_up + padding_down - filter_height) / ((dilation[0] * (filter_height - 1) + 1))) /
stride_height + stride[0] +
1, 1,
output_height, col_height,
"Output_height and padding(padding_up, padding_down) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(input_width + padding_left + padding_right - filter_width) / ((dilation[1] * (filter_width - 1) + 1))) /
stride_width + stride[1] +
1, 1,
output_width, col_width,
"output_width and padding(padding_left, padding_right) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
int channels_col = input_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
T* im_data = im.data<T>(); T* im_data = im->data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height; int h_offset = (c / filter_width) % filter_height;
int c_im = c / filter_width / filter_height; int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) { for (int h = 0; h < col_height; ++h) {
for (int w = 0; w < output_width; ++w) { for (int w = 0; w < col_width; ++w) {
int im_row_idx = h * stride_height + h_offset - padding_up; int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
int im_col_idx = w * stride_width + w_offset - padding_left; int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
if ((im_row_idx) >= 0 && (im_row_idx) < input_height && if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
(im_col_idx) >= 0 && (im_col_idx) < input_width) { (im_col_idx) >= 0 && (im_col_idx) < im_width) {
im_row_idx += c_im * input_height; im_row_idx += c_im * im_height;
im_data[im_row_idx * input_width + im_col_idx] += im_data[im_row_idx * im_width + im_col_idx] +=
col_data[(c * output_height + h) * output_width + w]; col_data[(c * col_height + h) * col_width + w];
} }
} }
} }
...@@ -168,64 +167,59 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF, ...@@ -168,64 +167,59 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, T> { platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col, const framework::Tensor& im, const std::vector<int>& dilation,
int stride_height, int stride_width, int padding_up, const std::vector<int>& stride,
int padding_down, int padding_left, int padding_right) { const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col->dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im.dims()[0];
int input_height = im.dims()[1]; int im_height = im.dims()[1];
int input_width = im.dims()[2]; int im_width = im.dims()[2];
int filter_height = col.dims()[3]; int filter_height = col->dims()[3];
int filter_width = col.dims()[4]; int filter_width = col->dims()[4];
int output_height = col.dims()[0]; int col_height = col->dims()[0];
int output_width = col.dims()[1]; int col_width = col->dims()[1];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) / (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
stride_height + col_height,
1,
output_height,
"Output_height and padding(padding_up, padding_down) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) / (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
stride_width + col_width,
1, "col_width and padding(padding_left, padding_right) are "
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent."); "inconsistent.");
const T* im_data = im.data<T>(); const T* im_data = im.data<T>();
T* col_data = col.data<T>(); T* col_data = col->data<T>();
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) { for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) { for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height; for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) { ++filter_row_idx) {
for (int filter_col_idx = 0; filter_col_idx < filter_width; for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) { ++filter_col_idx) {
int im_row_offset = int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_up; col_row_idx * stride[0] + filter_row_idx - padding[0];
int im_col_offset = int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_left; col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset = ((((col_row_idx)*output_width + col_col_idx) * int col_offset =
input_channels + ((((col_row_idx)*col_width + col_col_idx) * im_channels +
channel) * channel) *
filter_height + filter_height +
filter_row_idx) * filter_row_idx) *
filter_width + filter_width +
filter_col_idx; filter_col_idx;
if (im_row_offset < 0 || im_row_offset >= input_height ||
im_col_offset < 0 || im_col_offset >= input_width) { int im_offset = (channel * im_height + im_row_offset) * im_width +
col_data[col_offset] = T(0);
} else {
int im_offset =
(channel * input_height + im_row_offset) * input_width +
im_col_offset; im_col_offset;
col_data[col_offset] = im_data[im_offset]; col_data[col_offset] =
} (im_row_offset < 0 || im_row_offset >= im_height ||
im_col_offset < 0 || im_col_offset >= im_width)
? static_cast<T>(0)
: im_data[im_offset];
} }
} }
} }
...@@ -243,60 +237,57 @@ template <class T> ...@@ -243,60 +237,57 @@ template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, T> { platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im, void operator()(const platform::DeviceContext& context,
const framework::Tensor& col, int stride_height, const framework::Tensor& col,
int stride_width, int padding_up, int padding_down, const std::vector<int>& dilation,
int padding_left, int padding_right) { const std::vector<int>& stride,
PADDLE_ENFORCE(im.dims().size() == 3); const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im->dims()[0];
int input_height = im.dims()[1]; int im_height = im->dims()[1];
int input_width = im.dims()[2]; int im_width = im->dims()[2];
int filter_height = col.dims()[3]; int filter_height = col.dims()[3];
int filter_width = col.dims()[4]; int filter_width = col.dims()[4];
int output_height = col.dims()[0]; int col_height = col.dims()[0];
int output_width = col.dims()[1]; int col_width = col.dims()[1];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) / (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
stride_height + col_height,
1,
output_height,
"Output_height and padding(padding_up, padding_down) are " "Output_height and padding(padding_up, padding_down) are "
"inconsistent."); "inconsistent.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) / (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
stride_width + col_width,
1, "col_width and padding(padding_left, padding_right) are "
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent."); "inconsistent.");
T* im_data = im.data<T>(); T* im_data = im->data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) { for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) { for (int channel = 0; channel < im_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height; for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) { ++filter_row_idx) {
for (int filter_col_idx = 0; filter_col_idx < filter_width; for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) { ++filter_col_idx) {
int im_row_offset = int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_up; col_row_idx * stride[0] + filter_row_idx - padding[0];
int im_col_offset = int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_left; col_col_idx * stride[1] + filter_col_idx - padding[1];
int col_offset = (((col_row_idx * output_width + col_col_idx) * int col_offset =
input_channels + (((col_row_idx * col_width + col_col_idx) * im_channels +
channel) * channel) *
filter_height + filter_height +
filter_row_idx) * filter_row_idx) *
filter_width + filter_width +
filter_col_idx; filter_col_idx;
if (im_row_offset >= 0 && im_row_offset < input_height && if (im_row_offset >= 0 && im_row_offset < im_height &&
im_col_offset >= 0 && im_col_offset < input_width) { im_col_offset >= 0 && im_col_offset < im_width) {
int im_offset = int im_offset =
(channel * input_height + im_row_offset) * input_width + (channel * im_height + im_row_offset) * im_width +
im_col_offset; im_col_offset;
im_data[im_offset] += col_data[col_offset]; im_data[im_offset] += col_data[col_offset];
} }
......
...@@ -20,36 +20,32 @@ namespace operators { ...@@ -20,36 +20,32 @@ namespace operators {
namespace math { namespace math {
template <class T> template <class T>
__global__ void im2col(const T* data_im, int num_outs, int height, int width, __global__ void im2col(const T* data_im, int num_outs, int im_height,
int im_width, int dilation_h, int dilation_w,
int filter_height, int filter_width, int stride_height, int filter_height, int filter_width, int stride_height,
int stride_width, int padding_height, int padding_width, int stride_width, int padding_height, int padding_width,
int output_height, int output_width, T* data_col) { int col_height, int col_width, T* data_col) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; const int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < num_outs) { if (index < num_outs) {
int w_out = index % output_width; int w_out = index % col_width;
index /= output_width; int h_out = (index / col_width) % col_height;
int h_out = index % output_height; int channel_in = index / col_width / col_height;
int channel_in = index / output_height;
int channel_out = channel_in * filter_height * filter_width; int channel_out = channel_in * filter_height * filter_width;
int h_in = h_out * stride_height; int h_in = h_out * stride_height - padding_height;
int w_in = w_out * stride_width; int w_in = w_out * stride_width - padding_width;
data_col += (channel_out * output_height + h_out) * output_width + w_out; data_col += (channel_out * col_height + h_out) * col_width + w_out;
data_im += (channel_in * im_height + h_in) * im_width + w_in;
for (int i = 0; i < filter_height; ++i) { for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) { for (int j = 0; j < filter_width; ++j) {
int rIdx = int(h_in + i); int rIdx = h_in + i * dilation_h;
int cIdx = int(w_in + j); int cIdx = w_in + j * dilation_w;
if ((rIdx - (int)padding_height) >= (int)height || *data_col =
(rIdx - (int)padding_height) < 0 || (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
(cIdx - (int)padding_width) >= (int)width || ? 0
(cIdx - (int)padding_width) < 0) { : data_im[i * dilation_h * im_width + j * dilation_w];
*data_col = 0; data_col += col_height * col_width;
} else {
rIdx = rIdx + channel_in * height - padding_height;
cIdx = cIdx - padding_width;
*data_col = data_im[rIdx * width + cIdx];
}
data_col += output_height * output_width;
} }
} }
} }
...@@ -65,30 +61,36 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -65,30 +61,36 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, T> { platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col, const framework::Tensor& im, const std::vector<int>& dilation,
int stride_height, int stride_width, int padding_up, const std::vector<int>& stride,
int padding_down, int padding_left, int padding_right) { const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col->dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im.dims()[0];
int input_height = im.dims()[1]; int im_height = im.dims()[1];
int input_width = im.dims()[2]; int im_width = im.dims()[2];
int filter_height = col.dims()[1]; int filter_height = col->dims()[1];
int filter_width = col.dims()[2]; int filter_width = col->dims()[2];
int output_height = col.dims()[3]; int col_height = col->dims()[3];
int output_width = col.dims()[4]; int col_width = col->dims()[4];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) / PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
stride_height + (dilation[0] * (filter_height - 1) + 1)) /
1 == stride[0] +
output_height); 1,
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) / col_height,
stride_width + "Output_height and padding(padding_up, padding_down) are "
1 == "inconsistent.");
output_width); PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
int num_outputs = input_channels * output_height * output_width; stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int num_outputs = im_channels * col_height * col_width;
int blocks = (num_outputs + 1024 - 1) / 1024; int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512; int block_x = 512;
int block_y = (blocks + 512 - 1) / 512; int block_y = (blocks + 512 - 1) / 512;
...@@ -97,56 +99,57 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -97,56 +99,57 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
im2col<T><<<grid, threads, 0, im2col<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
im.data<T>(), num_outputs, input_height, input_width, filter_height, im.data<T>(), num_outputs, im_height, im_width, dilation[0],
filter_width, stride_height, stride_width, padding_up, padding_left, dilation[1], filter_height, filter_width, stride[0], stride[1],
output_height, output_width, col.data<T>()); padding[0], padding[1], col_height, col_width, col->data<T>());
} }
}; };
template <class T> template <class T>
__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width, __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
size_t channels, size_t filter_height, int dilation_h, int dilation_w, int filter_height,
size_t filter_width, size_t stride_height, int filter_width, int stride_height, int stride_width,
size_t stride_width, size_t padding_height, int padding_height, int padding_width, int col_height,
size_t padding_width, size_t output_height, int col_width, T* data_im) {
size_t output_width, T* data_im) { const int index =
size_t index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
const int d_filter_height = dilation_h * (filter_height - 1) + 1;
const int d_filter_width = dilation_w * (filter_width - 1) + 1;
if (index < n) { if (index < n) {
T val = 0; T val = 0;
int w = int(index % width); int w = index % im_width + padding_width;
int h = int((index / width) % height); int h = (index / im_width) % im_height + padding_height;
int c = int(index / (width * height)); int c = index / (im_width * im_height);
if ((w - (int)padding_width) >= 0 &&
(w - (int)padding_width) < (width - 2 * padding_width) &&
(h - (int)padding_height) >= 0 &&
(h - padding_height) < (height - 2 * padding_height)) {
// compute the start and end of the output // compute the start and end of the output
int w_col_start = (w < (int)filter_width) int w_col_start =
? 0 (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
: (w - int(filter_width)) / (int)stride_width + 1; int w_col_end = min(w / stride_width + 1, col_width);
int w_col_end = int h_col_start =
min((int)(w / (int)stride_width + 1), (int)(output_width)); (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
int h_col_start = (h < (int)filter_height) int h_col_end = min(h / stride_height + 1, col_height);
? 0
: (h - (int)filter_height) / (int)stride_height + 1;
int h_col_end = min(int(h / stride_height + 1), int(output_height));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out] int h_off = (h - h_col * stride_height);
int c_col = int(c * filter_height * filter_width) + int w_off = (w - w_col * stride_width);
(h - h_col * (int)stride_height) * (int)filter_width + if (h_off % dilation_h == 0 && w_off % dilation_w == 0) {
(w - w_col * (int)stride_width); h_off /= dilation_h;
val += w_off /= dilation_w;
data_col[(c_col * output_height + h_col) * output_width + w_col]; int data_col_index =
(((c * filter_height + h_off) * filter_width + w_off) *
col_height +
h_col) *
col_width +
w_col;
val += data_col[data_col_index];
} }
} }
h -= padding_height;
w -= padding_width;
data_im[c * ((width - 2 * padding_width) *
(height - 2 * padding_height)) +
h * (width - 2 * padding_width) + w] += val;
} }
data_im[index] = val;
} }
} }
...@@ -159,33 +162,38 @@ template <class T> ...@@ -159,33 +162,38 @@ template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, T> { platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im, void operator()(const platform::DeviceContext& context,
const framework::Tensor& col, int stride_height, const framework::Tensor& col,
int stride_width, int padding_up, int padding_down, const std::vector<int>& dilation,
int padding_left, int padding_right) { const std::vector<int>& stride,
PADDLE_ENFORCE(im.dims().size() == 3); const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im->dims()[0];
int input_height = im.dims()[1]; int im_height = im->dims()[1];
int input_width = im.dims()[2]; int im_width = im->dims()[2];
int filter_height = col.dims()[1]; int filter_height = col.dims()[1];
int filter_width = col.dims()[2]; int filter_width = col.dims()[2];
int output_height = col.dims()[3]; int col_height = col.dims()[3];
int output_width = col.dims()[4]; int col_width = col.dims()[4];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) / PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
stride_height + (dilation[0] * (filter_height - 1) + 1)) /
1 == stride[0] +
output_height); 1,
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) / col_height,
stride_width + "Output_height and padding(padding_up, padding_down) are "
1 == "inconsistent.");
output_width); PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
size_t num_kernels = input_channels * stride[1] +
(input_height + padding_up + padding_down) * 1,
(input_width + padding_left + padding_right); col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
size_t num_kernels = im_channels * im_height * im_width;
size_t blocks = (num_kernels + 1024 - 1) / 1024; size_t blocks = (num_kernels + 1024 - 1) / 1024;
size_t block_x = 512; size_t block_x = 512;
...@@ -198,10 +206,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -198,10 +206,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
col2im<T><<<grid, threads, 0, col2im<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
num_kernels, col.data<T>(), input_height + padding_up + padding_down, num_kernels, col.data<T>(), im_height, im_width, dilation[0],
input_width + padding_left + padding_left, input_channels, dilation[1], filter_height, filter_width, stride[0], stride[1],
filter_height, filter_width, stride_height, stride_width, padding_up, padding[0], padding[2], col_height, col_width, im->data<T>());
padding_left, output_height, output_width, im.data<T>());
} }
}; };
...@@ -215,33 +222,32 @@ template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, ...@@ -215,33 +222,32 @@ template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, double>; platform::GPUPlace, double>;
template <class T> template <class T>
__global__ void im2colOCF(const T* im_data, T* col_data, int input_channels, __global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
int input_height, int input_width, int filter_height, int im_width, int filter_height, int filter_width,
int filter_width, int stride_height, int stride_width, int stride_height, int stride_width,
int padding_height, int padding_width, int padding_height, int padding_width, int col_height,
int output_height, int output_width) { int col_width, T* col_data) {
int swid = blockIdx.x; int swid = blockIdx.x;
int shid = blockIdx.y; int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels; for (int channelid = threadIdx.z; channelid < im_channels;
channelid += blockDim.z) { channelid += blockDim.z) {
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width; int width_offset = idx + swid * stride_width - padding_width;
int height_offset = idy + shid * stride_height - padding_height; int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width + int im_offset = width_offset + height_offset * im_width +
channelid * input_height * input_width; channelid * im_height * im_width;
int col_offset = idx + idy * filter_width + int col_offset = idx + idy * filter_width +
channelid * filter_height * filter_width + channelid * filter_height * filter_width +
(shid * output_width + swid) * (shid * col_width + swid) *
(input_channels * filter_height * filter_width); (im_channels * filter_height * filter_width);
if (height_offset >= input_height || height_offset < 0 || col_data[col_offset] =
width_offset >= input_width || width_offset < 0) { (height_offset >= im_height || height_offset < 0 ||
col_data[col_offset] = T(0); width_offset >= im_width || width_offset < 0)
} else { ? T(0)
col_data[col_offset] = im_data[im_offset]; : im_data[im_offset];
}
} }
} }
} }
...@@ -257,27 +263,33 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF, ...@@ -257,27 +263,33 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, T> { platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col, const framework::Tensor& im, const std::vector<int>& dilation,
int stride_height, int stride_width, int padding_up, const std::vector<int>& stride,
int padding_down, int padding_left, int padding_right) { const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3); PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col->dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im.dims()[0];
int input_height = im.dims()[1]; int im_height = im.dims()[1];
int input_width = im.dims()[2]; int im_width = im.dims()[2];
int filter_height = col.dims()[3]; int filter_height = col->dims()[3];
int filter_width = col.dims()[4]; int filter_width = col->dims()[4];
int output_height = col.dims()[0]; int col_height = col->dims()[0];
int output_width = col.dims()[1]; int col_width = col->dims()[1];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) / PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
stride_height + (dilation[0] * (filter_height - 1) + 1)) /
1 == stride[0] +
output_height); 1,
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) / col_height,
stride_width + "Output_height and padding(padding_up, padding_down) are "
1 == "inconsistent.");
output_width); PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int block_dim_x = 0; int block_dim_x = 0;
int block_dim_y = 0; int block_dim_y = 0;
...@@ -296,42 +308,41 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF, ...@@ -296,42 +308,41 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
} }
int block_dim_z = 1024 / block_dim_x / block_dim_y; int block_dim_z = 1024 / block_dim_x / block_dim_y;
dim3 threads(block_dim_x, block_dim_y, dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
std::min(block_dim_z, input_channels)); dim3 grid(col_width, col_height);
dim3 grid(output_width, output_height);
im2colOCF<T><<<grid, threads, 0, im2colOCF<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width, im.data<T>(), im_channels, im_height, im_width, filter_height,
filter_height, filter_width, stride_height, stride_width, padding_up, filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
padding_left, output_height, output_width); col_width, col->data<T>());
} }
}; };
template <class T> template <class T>
__global__ void col2imOCF(T* im_data, const T* col_data, int input_channels, __global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
int input_height, int input_width, int filter_height, int im_width, int filter_height, int filter_width,
int filter_width, int stride_height, int stride_width, int stride_height, int stride_width,
int padding_height, int padding_width, int padding_height, int padding_width, int col_height,
int output_height, int output_width) { int col_width, T* im_data) {
int swid = blockIdx.x; int swid = blockIdx.x;
int shid = blockIdx.y; int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels; for (int channelid = threadIdx.z; channelid < im_channels;
channelid += blockDim.z) { channelid += blockDim.z) {
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width; int width_offset = idx + swid * stride_width - padding_width;
int height_offset = idy + shid * stride_height - padding_height; int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width + int im_offset = width_offset + height_offset * im_width +
channelid * input_height * input_width; channelid * im_height * im_width;
int col_offset = idx + idy * filter_width + int col_offset = idx + idy * filter_width +
channelid * filter_height * filter_width + channelid * filter_height * filter_width +
(shid * output_width + swid) * (shid * col_width + swid) *
(input_channels * filter_height * filter_width); (im_channels * filter_height * filter_width);
if (height_offset >= 0 && height_offset < input_height && if (height_offset >= 0 && height_offset < im_height &&
width_offset >= 0 && width_offset < input_width) { width_offset >= 0 && width_offset < im_width) {
paddle::platform::CudaAtomicAdd(im_data + im_offset, paddle::platform::CudaAtomicAdd(im_data + im_offset,
col_data[col_offset]); col_data[col_offset]);
} }
...@@ -349,28 +360,35 @@ template <class T> ...@@ -349,28 +360,35 @@ template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, T> { platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im, void operator()(const platform::DeviceContext& context,
const framework::Tensor& col, int stride_height, const framework::Tensor& col,
int stride_width, int padding_up, int padding_down, const std::vector<int>& dilation,
int padding_left, int padding_right) { const std::vector<int>& stride,
PADDLE_ENFORCE(im.dims().size() == 3); const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5); PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0]; int im_channels = im->dims()[0];
int input_height = im.dims()[1]; int im_height = im->dims()[1];
int input_width = im.dims()[2]; int im_width = im->dims()[2];
int filter_height = col.dims()[3]; int filter_height = col.dims()[3];
int filter_width = col.dims()[4]; int filter_width = col.dims()[4];
int output_height = col.dims()[0]; int col_height = col.dims()[0];
int output_width = col.dims()[1]; int col_width = col.dims()[1];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) / PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
stride_height + (dilation[0] * (filter_height - 1) + 1)) /
1 == stride[0] +
output_height); 1,
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) / col_height,
stride_width + "Output_height and padding(padding_up, padding_down) are "
1 == "inconsistent.");
output_width); PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
(dilation[1] * (filter_width - 1) + 1)) /
stride[1] +
1,
col_width,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
int block_dim_x = 0; int block_dim_x = 0;
int block_dim_y = 0; int block_dim_y = 0;
...@@ -389,15 +407,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, ...@@ -389,15 +407,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
} }
int block_dim_z = 1024 / block_dim_x / block_dim_y; int block_dim_z = 1024 / block_dim_x / block_dim_y;
dim3 threads(block_dim_x, block_dim_y, dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
std::min(block_dim_z, input_channels)); dim3 grid(col_width, col_height);
dim3 grid(output_width, output_height);
col2imOCF<T><<<grid, threads, 0, col2imOCF<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width, col.data<T>(), im_channels, im_height, im_width, filter_height,
filter_height, filter_width, stride_height, stride_width, padding_up, filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
padding_left, output_height, output_width); col_width, im->data<T>());
} }
}; };
......
...@@ -35,6 +35,15 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; ...@@ -35,6 +35,15 @@ enum class ColFormat { kCFO = 0, kOCF = 1 };
* \param colData Column data. * \param colData Column data.
* \param colShape The shape of colData. * \param colShape The shape of colData.
* *
* \param dilations dilation data,
*        2-dimension [dilation_height, dilation_width].
*
* \param strides stride data,
*        2-dimension [stride_height, stride_width].
*
* \param paddings padding data,
*        4-dimension [up_pad, left_pad, down_pad, right_pad].
*
* If the template argument Format is kCFO, the shape of colData is: * If the template argument Format is kCFO, the shape of colData is:
* [input_channels, filter_height, filter_width, output_height, output_width] * [input_channels, filter_height, filter_width, output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution * So, it is easy to reshape into a convolution matrix for convolution
...@@ -73,18 +82,19 @@ template <ColFormat Format, typename Place, typename T> ...@@ -73,18 +82,19 @@ template <ColFormat Format, typename Place, typename T>
class Im2ColFunctor { class Im2ColFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col, const framework::Tensor& im, const std::vector<int>& dilation,
int stride_height, int stride_width, int padding_up, const std::vector<int>& stride,
int padding_down, int padding_left, int padding_right); const std::vector<int>& padding, framework::Tensor* col);
}; };
template <ColFormat Format, typename Place, typename T> template <ColFormat Format, typename Place, typename T>
class Col2ImFunctor { class Col2ImFunctor {
public: public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im, void operator()(const platform::DeviceContext& context,
const framework::Tensor& col, int stride_height, const framework::Tensor& col,
int stride_width, int padding_up, int padding_down, const std::vector<int>& dilation,
int padding_left, int padding_right); const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im);
}; };
} // namespace math } // namespace math
......
...@@ -45,10 +45,14 @@ void testIm2col() { ...@@ -45,10 +45,14 @@ void testIm2col() {
int input_height = 2; int input_height = 2;
int input_width = 3; int input_width = 3;
int filter_size = 2; int filter_size = 2;
int stride = 1; std::vector<int> stride({1, 1}); // stride_y, stride_x
int padding = 0; std::vector<int> padding(
int output_height = (input_height - filter_size + 2 * padding) / stride + 1; {0, 0, 0, 0}); // up_pad, left_pad, down_pad, right_pad
int output_width = (input_width - filter_size + 2 * padding) / stride + 1; std::vector<int> dilation({1, 1}); // dilation_y, dilation_x
// `padding` is laid out as {up, left, down, right}, so the vertical extent
// grows by up + down (indices 0 and 2) and the horizontal extent by
// left + right (indices 1 and 3). This mirrors the consistency checks the
// im2col functors apply to the column tensor's height and width.
int output_height =
    (input_height - filter_size + padding[0] + padding[2]) / stride[0] + 1;
int output_width =
    (input_width - filter_size + padding[1] + padding[3]) / stride[1] + 1;
float* input_ptr = input_tmp.mutable_data<float>( float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace()); {1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
...@@ -85,10 +89,8 @@ void testIm2col() { ...@@ -85,10 +89,8 @@ void testIm2col() {
paddle::operators::math::ColFormat::kOCF, Place, float> paddle::operators::math::ColFormat::kOCF, Place, float>
im2col_ocf; im2col_ocf;
im2col(*context, input, output_cfo, stride, stride, padding, padding, padding, im2col(*context, input, dilation, stride, padding, &output_cfo);
padding); im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding,
padding, padding);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5}; float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5}; float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
...@@ -131,8 +133,7 @@ void testIm2col() { ...@@ -131,8 +133,7 @@ void testIm2col() {
input.CopyFrom(input_tmp, *place, *context); input.CopyFrom(input_tmp, *place, *context);
} }
col2im(*context, input, output_cfo, stride, stride, padding, padding, padding, col2im(*context, output_cfo, dilation, stride, padding, &input);
padding);
float* in_ptr; float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
...@@ -153,8 +154,7 @@ void testIm2col() { ...@@ -153,8 +154,7 @@ void testIm2col() {
input.CopyFrom(input_tmp, *place, *context); input.CopyFrom(input_tmp, *place, *context);
} }
col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding, col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
padding, padding);
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/operators/math/math_function_impl.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -232,7 +233,36 @@ void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context, ...@@ -232,7 +233,36 @@ void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context,
cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
} }
template <>
void axpy<platform::CPUPlace, float>(const platform::DeviceContext& context,
                                     const int n, const float alpha,
                                     const float* x, float* y) {
  // Single-precision host AXPY: forwards to CBLAS with both vectors walked
  // at unit stride. `context` is not used on this path.
  const int x_stride = 1;
  const int y_stride = 1;
  cblas_saxpy(n, alpha, x, x_stride, y, y_stride);
}
template <>
void axpy<platform::CPUPlace, double>(const platform::DeviceContext& context,
                                      const int n, const double alpha,
                                      const double* x, double* y) {
  // Double-precision host AXPY: forwards to CBLAS with both vectors walked
  // at unit stride. `context` is not used on this path.
  const int x_stride = 1;
  const int y_stride = 1;
  cblas_daxpy(n, alpha, x, x_stride, y, y_stride);
}
template struct SetConstant<platform::CPUPlace, float>; template struct SetConstant<platform::CPUPlace, float>;
template struct SetConstant<platform::CPUPlace, double>;
template struct SetConstant<platform::CPUPlace, int>;
template struct SetConstant<platform::CPUPlace, int64_t>;
template struct SetConstant<platform::CPUPlace, bool>;
// Explicitly instantiates the CPUPlace Transpose functor for float and
// double at the given tensor rank. It is expanded below once per rank,
// for ranks 1 through 6.
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUPlace, float, RANK>; \
template struct Transpose<platform::CPUPlace, double, RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
struct TensorSetConstantCPU { struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value) TensorSetConstantCPU(framework::Tensor* tensor, float value)
...@@ -280,6 +310,11 @@ void set_constant(const platform::DeviceContext& context, ...@@ -280,6 +310,11 @@ void set_constant(const platform::DeviceContext& context,
#endif #endif
} }
template struct RowwiseAdd<platform::CPUPlace, float>;
template struct RowwiseAdd<platform::CPUPlace, double>;
template struct ColwiseSum<platform::CPUPlace, float>;
template struct ColwiseSum<platform::CPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/math_function_impl.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -231,7 +233,42 @@ void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context, ...@@ -231,7 +233,42 @@ void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context,
cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1));
} }
template <>
void axpy<platform::GPUPlace, float>(const platform::DeviceContext& context,
                                     const int n, const float alpha,
                                     const float* x, float* y) {
  // Single-precision device AXPY via cuBLAS, unit stride on both vectors.
  // The generic context is reinterpreted as the CUDA context so that its
  // cuBLAS handle can be passed to the call.
  const auto& cuda_context =
      reinterpret_cast<const platform::CUDADeviceContext&>(context);
  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(cuda_context.cublas_handle(),
                                                n, &alpha, x, 1, y, 1));
}
template <>
void axpy<platform::GPUPlace, double>(const platform::DeviceContext& context,
                                      const int n, const double alpha,
                                      const double* x, double* y) {
  // Double-precision device AXPY via cuBLAS, unit stride on both vectors.
  // The generic context is reinterpreted as the CUDA context so that its
  // cuBLAS handle can be passed to the call.
  const auto& cuda_context =
      reinterpret_cast<const platform::CUDADeviceContext&>(context);
  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(cuda_context.cublas_handle(),
                                                n, &alpha, x, 1, y, 1));
}
template struct SetConstant<platform::GPUPlace, float>; template struct SetConstant<platform::GPUPlace, float>;
template struct SetConstant<platform::GPUPlace, double>;
template struct SetConstant<platform::GPUPlace, int>;
template struct SetConstant<platform::GPUPlace, int64_t>;
template struct SetConstant<platform::GPUPlace, bool>;
// Explicitly instantiates the GPUPlace Transpose functor for float and
// double at the given tensor rank. It is expanded below once per rank,
// for ranks 1 through 6.
#define DEFINE_GPU_TRANS(RANK) \
template struct Transpose<platform::GPUPlace, float, RANK>; \
template struct Transpose<platform::GPUPlace, double, RANK>;
DEFINE_GPU_TRANS(1);
DEFINE_GPU_TRANS(2);
DEFINE_GPU_TRANS(3);
DEFINE_GPU_TRANS(4);
DEFINE_GPU_TRANS(5);
DEFINE_GPU_TRANS(6);
struct TensorSetConstantGPU { struct TensorSetConstantGPU {
TensorSetConstantGPU(const platform::DeviceContext& context, TensorSetConstantGPU(const platform::DeviceContext& context,
...@@ -257,6 +294,11 @@ void set_constant_with_place<platform::GPUPlace>( ...@@ -257,6 +294,11 @@ void set_constant_with_place<platform::GPUPlace>(
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
template struct RowwiseAdd<platform::GPUPlace, float>;
template struct RowwiseAdd<platform::GPUPlace, double>;
template struct ColwiseSum<platform::GPUPlace, float>;
template struct ColwiseSum<platform::GPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -93,14 +93,21 @@ void gemv(const platform::DeviceContext& context, const bool trans_a, ...@@ -93,14 +93,21 @@ void gemv(const platform::DeviceContext& context, const bool trans_a,
const int M, const int N, const T alpha, const T* A, const T* B, const int M, const int N, const T alpha, const T* A, const T* B,
const T beta, T* C); const T beta, T* C);
// BLAS-style AXPY over n elements: x is the read-only input vector, y is
// updated in place, and alpha is the scalar applied to x. Only the
// declaration lives here; the per-Place, per-T behavior comes from explicit
// specializations.
template <typename Place, typename T>
void axpy(const platform::DeviceContext& context, const int n, const T alpha,
const T* x, T* y);
// Functor that permutes the dimensions of a rank-`Rank` tensor.
//   context: device context whose Eigen device runs the operation.
//   in:      source tensor.
//   out:     destination tensor, written with the permuted data.
//   axis:    permutation; entry i selects the source dimension that becomes
//            dimension i of `out`. The first `Rank` entries are used.
template <typename Place, typename T, int Rank>
struct Transpose {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& in, framework::Tensor* out,
const std::vector<int>& axis);
};
template <typename Place, typename T> template <typename Place, typename T>
struct SetConstant { struct SetConstant {
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor* tensor, T num) { framework::Tensor* tensor, T num);
auto t = framework::EigenVector<T>::Flatten(*tensor);
t.device(*context.GetEigenDevice<Place>()) =
t.constant(static_cast<T>(num));
}
}; };
template <typename Place> template <typename Place>
...@@ -110,6 +117,19 @@ void set_constant_with_place(const platform::DeviceContext& context, ...@@ -110,6 +117,19 @@ void set_constant_with_place(const platform::DeviceContext& context,
void set_constant(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor, float value); framework::Tensor* tensor, float value);
// Functor that adds `vec` to every row of `input` and stores the result in
// `output`. The out-of-line definition requires vec.numel() to equal
// input.numel() / input.dims()[0] and output->dims() to equal input.dims().
template <typename Place, typename T>
struct RowwiseAdd {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& vec,
framework::Tensor* output);
};
// Functor that sums `input` along its first dimension and stores the
// per-column totals in `vec`. The out-of-line definition requires
// vec->numel() to equal input.numel() / input.dims()[0].
template <typename Place, typename T>
struct ColwiseSum {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/data_type.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
namespace math {
template <typename Place, typename T>
void SetConstant<Place, T>::operator()(const platform::DeviceContext& context,
                                       framework::Tensor* tensor, T num) {
  // Treat the tensor as one flat vector and overwrite every element with
  // `num`, evaluated on the Eigen device that belongs to `Place`.
  auto flat = framework::EigenVector<T>::Flatten(*tensor);
  auto* dev = context.GetEigenDevice<Place>();
  flat.device(*dev) = flat.constant(static_cast<T>(num));
}
// Writes into `out` the tensor `in` with its dimensions permuted.
//   context: device context whose Eigen device runs the shuffle.
//   in:      source tensor, viewed as a rank-`Rank` Eigen tensor.
//   out:     destination tensor, viewed as a rank-`Rank` Eigen tensor.
//   axis:    permutation; axis[i] is the source dimension that becomes
//            dimension i of `out`. Must hold at least `Rank` entries; only
//            the first `Rank` are read.
template <typename Place, typename T, int Rank>
void Transpose<Place, T, Rank>::operator()(
    const platform::DeviceContext& context, const framework::Tensor& in,
    framework::Tensor* out, const std::vector<int>& axis) {
  // Guard the copy loop below: a permutation shorter than Rank would make
  // axis[i] read past the end of the vector.
  PADDLE_ENFORCE(axis.size() >= static_cast<size_t>(Rank));

  Eigen::array<int, Rank> permute;
  for (int i = 0; i < Rank; i++) {
    permute[i] = axis[i];
  }

  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
  auto* dev = context.GetEigenDevice<Place>();
  eigen_out.device(*dev) = eigen_in.shuffle(permute);
}
template <typename Place, typename T>
void RowwiseAdd<Place, T>::operator()(const platform::DeviceContext& context,
                                      const framework::Tensor& input,
                                      const framework::Tensor& vector,
                                      framework::Tensor* output) {
  // Elements per row = total element count divided by the leading extent.
  auto in_dims = input.dims();
  auto size = input.numel() / in_dims[0];

  // Shape contract: `vector` supplies one value per column and `output`
  // has the same dims as `input`. The enforce arguments keep their original
  // spelling since the macro presumably echoes them on failure.
  PADDLE_ENFORCE_EQ(vector.numel(), size);
  PADDLE_ENFORCE_EQ(output->dims(), in_dims);

  auto src = framework::EigenMatrix<T>::From(input);
  auto addend = framework::EigenMatrix<T>::From(vector);
  auto dst = framework::EigenMatrix<T>::From(*output);

  // View the addend as a single row, then repeat that row once per row of
  // the input so the element-wise sum lines up.
  const int row_width = static_cast<int>(size);
  const int row_count = static_cast<int>(in_dims[0]);
  Eigen::array<int, 2> single_row({{1, row_width}});
  Eigen::array<int, 2> repeat_rows({{row_count, 1}});

  auto* dev = context.GetEigenDevice<Place>();
  dst.device(*dev) = src + addend.reshape(single_row).broadcast(repeat_rows);
}
template <typename Place, typename T>
void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
                                      const framework::Tensor& input,
                                      framework::Tensor* vector) {
  // Elements per row = total element count divided by the leading extent.
  auto in_dims = input.dims();
  auto size = input.numel() / in_dims[0];

  // `vector` must hold exactly one slot per column. The enforce arguments
  // keep their original spelling since the macro presumably echoes them on
  // failure.
  PADDLE_ENFORCE_EQ(vector->numel(), size);

  auto totals = framework::EigenMatrix<T>::From(*vector);
  auto src = framework::EigenMatrix<T>::From(input);

  // Reduce over dimension 0 (the rows) and store the result as one row.
  Eigen::array<int, 2> single_row({{1, static_cast<int>(size)}});
  Eigen::array<int, 1> over_rows({{0}});

  auto* dev = context.GetEigenDevice<Place>();
  totals.reshape(single_row).device(*dev) =
      src.sum(over_rows).reshape(single_row);
}
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/math/sequence2batch.h" #include "paddle/operators/math/sequence2batch.h"
namespace paddle { namespace paddle {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
...@@ -21,6 +22,10 @@ namespace paddle { ...@@ -21,6 +22,10 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
// Shorthand for framework::EigenMatrix in this namespace; defaults to
// row-major layout and Eigen::DenseIndex as the index type.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class CopyMatrixRowsFunctor { class CopyMatrixRowsFunctor {
public: public:
......
...@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/math/softmax.h" #include "paddle/operators/math/softmax.h"
#include "paddle/operators/math/softmax_impl.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template class SoftmaxFunctor<platform::CPUPlace, float>; template class SoftmaxFunctor<platform::CPUPlace, float>;
template class SoftmaxFunctor<platform::CPUPlace, double>;
template class SoftmaxGradFunctor<platform::CPUPlace, float>; template class SoftmaxGradFunctor<platform::CPUPlace, float>;
template class SoftmaxGradFunctor<platform::CPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -15,13 +15,16 @@ limitations under the License. */ ...@@ -15,13 +15,16 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/operators/math/softmax.h" #include "paddle/operators/math/softmax.h"
#include "paddle/operators/math/softmax_impl.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template class SoftmaxFunctor<platform::GPUPlace, float>; template class SoftmaxFunctor<platform::GPUPlace, float>;
template class SoftmaxFunctor<platform::GPUPlace, double>;
template class SoftmaxGradFunctor<platform::GPUPlace, float>; template class SoftmaxGradFunctor<platform::GPUPlace, float>;
template class SoftmaxGradFunctor<platform::GPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -13,60 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,60 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct ValueClip {
HOSTDEVICE T operator()(const T& x) const {
const T kThreshold = -64.;
return x < kThreshold ? kThreshold : x;
}
};
template <typename Place, typename T> template <typename Place, typename T>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor* X, framework::Tensor* Y) { const framework::Tensor* X, framework::Tensor* Y);
auto logits = EigenMatrix<T>::From(*X);
auto softmax = EigenMatrix<T>::From(*Y);
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
auto shifted_logits = (logits -
logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class))
.unaryExpr(ValueClip<T>());
softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
softmax.device(*context.GetEigenDevice<Place>()) =
(softmax *
softmax.sum(along_class)
.inverse()
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
}
}; };
template <typename Place, typename T> template <typename Place, typename T>
...@@ -74,29 +31,7 @@ class SoftmaxGradFunctor { ...@@ -74,29 +31,7 @@ class SoftmaxGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor* y, const framework::Tensor* y_grad, const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) { framework::Tensor* x_grad);
auto softmax = EigenMatrix<T>::From(*y);
auto softmax_grad = EigenMatrix<T>::From(*y_grad);
auto logits_grad = EigenMatrix<T>::From(*x_grad);
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = softmax.dimension(kBatchDim);
const int num_classes = softmax.dimension(kClassDim);
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
auto dot = (softmax * softmax_grad)
.sum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class);
logits_grad.device(*context.GetEigenDevice<Place>()) =
(softmax_grad - dot) * softmax;
}
}; };
} // namespace math } // namespace math
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
// Shorthand for framework::EigenMatrix in this namespace; defaults to
// row-major layout and Eigen::DenseIndex as the index type. The softmax
// functors below use it to view their tensors as 2-D matrices.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
// Unary functor that clamps its argument from below at -64: any value
// smaller than that is replaced by -64, everything else passes through.
template <typename T>
struct ValueClip {
  HOSTDEVICE T operator()(const T& x) const {
    const T kLowerBound = -64.;
    if (x < kLowerBound) {
      return kLowerBound;
    }
    return x;
  }
};
// Row-wise softmax: reads the 2-D matrix view of X and writes the
// normalized exponentials into the 2-D matrix view of Y.
template <typename Place, typename T>
void SoftmaxFunctor<Place, T>::operator()(
    const platform::DeviceContext& context, const framework::Tensor* X,
    framework::Tensor* Y) {
  auto in_mat = EigenMatrix<T>::From(*X);
  auto out_mat = EigenMatrix<T>::From(*Y);

  const int kRowAxis = 0;
  const int kColAxis = 1;
  const int rows = in_mat.dimension(kRowAxis);
  const int cols = in_mat.dimension(kColAxis);

  Eigen::DSizes<int, 1> reduce_cols(kColAxis);
  Eigen::DSizes<int, 2> rows_by_one(rows, 1);
  Eigen::DSizes<int, 2> one_by_cols(1, cols);

  auto* dev = context.GetEigenDevice<Place>();

  // Step 1: subtract each row's maximum, clamp the shifted values from
  // below with ValueClip, and exponentiate into the output.
  auto row_max = in_mat.maximum(reduce_cols)
                     .eval()
                     .reshape(rows_by_one)
                     .broadcast(one_by_cols);
  auto clipped = (in_mat - row_max).unaryExpr(ValueClip<T>());
  out_mat.device(*dev) = clipped.exp();

  // Step 2: scale every row by the reciprocal of its sum. This reads the
  // exponentials written in step 1, so the two assignments must stay in
  // this order.
  auto inv_row_sum = out_mat.sum(reduce_cols)
                         .inverse()
                         .eval()
                         .reshape(rows_by_one)
                         .broadcast(one_by_cols);
  out_mat.device(*dev) = (out_mat * inv_row_sum);
}
// Softmax backward pass: given the forward output `y` and the gradient
// arriving at it (`y_grad`), writes (y_grad - rowsum(y * y_grad)) * y into
// `x_grad`, all viewed as 2-D matrices.
template <typename Place, typename T>
void SoftmaxGradFunctor<Place, T>::operator()(
    const platform::DeviceContext& context, const framework::Tensor* y,
    const framework::Tensor* y_grad, framework::Tensor* x_grad) {
  auto y_mat = EigenMatrix<T>::From(*y);
  auto dy_mat = EigenMatrix<T>::From(*y_grad);
  auto dx_mat = EigenMatrix<T>::From(*x_grad);

  const int kRowAxis = 0;
  const int kColAxis = 1;
  const int rows = y_mat.dimension(kRowAxis);
  const int cols = y_mat.dimension(kColAxis);

  Eigen::DSizes<int, 1> reduce_cols(kColAxis);
  Eigen::DSizes<int, 2> rows_by_one(rows, 1);
  Eigen::DSizes<int, 2> one_by_cols(1, cols);

  // Per-row dot product of y and y_grad, repeated across each row so it can
  // be subtracted element-wise.
  auto row_dot = (y_mat * dy_mat)
                     .sum(reduce_cols)
                     .eval()
                     .reshape(rows_by_one)
                     .broadcast(one_by_cols);

  auto* dev = context.GetEigenDevice<Place>();
  dx_mat.device(*dev) = (dy_mat - row_dot) * y_mat;
}
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -28,28 +28,51 @@ template <class T> ...@@ -28,28 +28,51 @@ template <class T>
class Vol2ColFunctor<platform::CPUPlace, T> { class Vol2ColFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& vol, framework::Tensor& col, const framework::Tensor& vol,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const { const std::vector<int>& paddings,
framework::Tensor* col) const {
PADDLE_ENFORCE(vol.dims().size() == 4); PADDLE_ENFORCE(vol.dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7); PADDLE_ENFORCE(col->dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol.dims()[1];
int input_height = vol.dims()[2]; int input_height = vol.dims()[2];
int input_width = vol.dims()[3]; int input_width = vol.dims()[3];
int filter_depth = col.dims()[1]; int filter_depth = col->dims()[1];
int filter_height = col.dims()[2]; int filter_height = col->dims()[2];
int filter_width = col.dims()[3]; int filter_width = col->dims()[3];
int output_depth = col.dims()[4]; int output_depth = col->dims()[4];
int output_height = col.dims()[5]; int output_height = col->dims()[5];
int output_width = col.dims()[6]; int output_width = col->dims()[6];
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col.data<T>(); T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width; int w_offset = c % filter_width;
...@@ -57,24 +80,23 @@ class Vol2ColFunctor<platform::CPUPlace, T> { ...@@ -57,24 +80,23 @@ class Vol2ColFunctor<platform::CPUPlace, T> {
int d_offset = (c / filter_width / filter_height) % filter_depth; int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth; int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) { for (int d = 0; d < output_depth; ++d) {
int d_pad = d * stride_depth - padding_depth + d_offset; int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) { for (int h = 0; h < output_height; ++h) {
int h_pad = h * stride_height - padding_height + h_offset; int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) { for (int w = 0; w < output_width; ++w) {
int w_pad = w * stride_width - padding_width + w_offset; int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
int col_idx = int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w; ((c * output_depth + d) * output_height + h) * output_width + w;
if (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) {
col_data[col_idx] = static_cast<T>(0);
} else {
int vol_idx = int vol_idx =
((c_in * input_depth + d_pad) * input_height + h_pad) * ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width + input_width +
w_pad; w_pad;
col_data[col_idx] = vol_data[vol_idx]; col_data[col_idx] =
} (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
} }
} }
} }
...@@ -92,17 +114,18 @@ template <class T> ...@@ -92,17 +114,18 @@ template <class T>
class Col2VolFunctor<platform::CPUPlace, T> { class Col2VolFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& vol, const framework::Tensor& col, const framework::Tensor& col,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const { const std::vector<int>& paddings,
PADDLE_ENFORCE(vol.dims().size() == 4); framework::Tensor* vol) const {
PADDLE_ENFORCE(vol->dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7); PADDLE_ENFORCE(col.dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol->dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol->dims()[1];
int input_height = vol.dims()[2]; int input_height = vol->dims()[2];
int input_width = vol.dims()[3]; int input_width = vol->dims()[3];
int filter_depth = col.dims()[1]; int filter_depth = col.dims()[1];
int filter_height = col.dims()[2]; int filter_height = col.dims()[2];
int filter_width = col.dims()[3]; int filter_width = col.dims()[3];
...@@ -112,7 +135,28 @@ class Col2VolFunctor<platform::CPUPlace, T> { ...@@ -112,7 +135,28 @@ class Col2VolFunctor<platform::CPUPlace, T> {
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
T* vol_data = vol.data<T>(); PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"mismatching.");
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) { for (int c = 0; c < channels_col; ++c) {
...@@ -121,11 +165,11 @@ class Col2VolFunctor<platform::CPUPlace, T> { ...@@ -121,11 +165,11 @@ class Col2VolFunctor<platform::CPUPlace, T> {
int d_offset = (c / filter_width / filter_height) % filter_depth; int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth; int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) { for (int d = 0; d < output_depth; ++d) {
int d_pad = d * stride_depth - padding_depth + d_offset; int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) { for (int h = 0; h < output_height; ++h) {
int h_pad = h * stride_height - padding_height + h_offset; int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) { for (int w = 0; w < output_width; ++w) {
int w_pad = w * stride_width - padding_width + w_offset; int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
...@@ -133,6 +177,7 @@ class Col2VolFunctor<platform::CPUPlace, T> { ...@@ -133,6 +177,7 @@ class Col2VolFunctor<platform::CPUPlace, T> {
((cIm * input_depth + d_pad) * input_height + h_pad) * ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width + input_width +
w_pad; w_pad;
int col_idx = int col_idx =
((c * output_depth + d) * output_height + h) * output_width + ((c * output_depth + d) * output_height + h) * output_width +
w; w;
......
...@@ -21,11 +21,12 @@ namespace math { ...@@ -21,11 +21,12 @@ namespace math {
template <class T> template <class T>
__global__ void vol2col(int num_kernels, const T* data_vol, int depth, __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
int height, int width, int filter_depth, int height, int width, int dilation_d, int dilation_h,
int filter_height, int filter_width, int stride_depth, int dilation_w, int filter_depth, int filter_height,
int stride_height, int stride_width, int padding_depth, int filter_width, int stride_depth, int stride_height,
int padding_height, int padding_width, int output_detph, int stride_width, int padding_depth, int padding_height,
int output_height, int output_width, T* data_col) { int padding_width, int output_detph, int output_height,
int output_width, T* data_col) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
index += blockDim.x * gridDim.x) { index += blockDim.x * gridDim.x) {
int w_out = index % output_width; int w_out = index % output_width;
...@@ -44,12 +45,14 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth, ...@@ -44,12 +45,14 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
for (int k = 0; k < filter_depth; ++k) { for (int k = 0; k < filter_depth; ++k) {
for (int i = 0; i < filter_height; ++i) { for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) { for (int j = 0; j < filter_width; ++j) {
int d = d_in + k; int d = d_in + k * dilation_d;
int h = h_in + i; int h = h_in + i * dilation_h;
int w = w_in + j; int w = w_in + j * dilation_w;
int col_idx = (k * dilation_d * height + i * dilation_h) * width +
j * dilation_w;
*data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 && *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
w < width) w < width)
? data_vol[(k * height + i) * width + j] ? data_vol[col_idx]
: 0; : 0;
data_col += output_detph * output_height * output_width; data_col += output_detph * output_height * output_width;
} }
...@@ -68,23 +71,46 @@ template <class T> ...@@ -68,23 +71,46 @@ template <class T>
class Vol2ColFunctor<platform::GPUPlace, T> { class Vol2ColFunctor<platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& vol, framework::Tensor& col, const framework::Tensor& vol,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const { const std::vector<int>& paddings,
framework::Tensor* col) const {
PADDLE_ENFORCE(vol.dims().size() == 4); PADDLE_ENFORCE(vol.dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7); PADDLE_ENFORCE(col->dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol.dims()[1];
int input_height = vol.dims()[2]; int input_height = vol.dims()[2];
int input_width = vol.dims()[3]; int input_width = vol.dims()[3];
int filter_depth = col.dims()[1]; int filter_depth = col->dims()[1];
int filter_height = col.dims()[2]; int filter_height = col->dims()[2];
int filter_width = col.dims()[3]; int filter_width = col->dims()[3];
int output_depth = col.dims()[4]; int output_depth = col->dims()[4];
int output_height = col.dims()[5]; int output_height = col->dims()[5];
int output_width = col.dims()[6]; int output_width = col->dims()[6];
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"Mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"Mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"Mismatching.");
int num_outputs = int num_outputs =
input_channels * output_depth * output_height * output_width; input_channels * output_depth * output_height * output_width;
...@@ -95,19 +121,25 @@ class Vol2ColFunctor<platform::GPUPlace, T> { ...@@ -95,19 +121,25 @@ class Vol2ColFunctor<platform::GPUPlace, T> {
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
num_outputs, vol.data<T>(), input_depth, input_height, input_width, num_outputs, vol.data<T>(), input_depth, input_height, input_width,
filter_depth, filter_height, filter_width, stride_depth, stride_height, dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
stride_width, padding_depth, padding_height, padding_width, filter_width, strides[0], strides[1], strides[2], paddings[0],
output_depth, output_height, output_width, col.data<T>()); paddings[1], paddings[2], output_depth, output_height, output_width,
col->data<T>());
} }
}; };
template <class T> template <class T>
__global__ void col2vol(int num_kernels, const T* data_col, int depth, __global__ void col2vol(int num_kernels, const T* data_col, int depth,
int height, int width, int filter_depth, int height, int width, int dilation_d, int dilation_h,
int filter_height, int filter_width, int stride_depth, int dilation_w, int filter_depth, int filter_height,
int stride_height, int stride_width, int padding_depth, int filter_width, int stride_depth, int stride_height,
int padding_height, int padding_width, int output_detph, int stride_width, int padding_depth, int padding_height,
int output_height, int output_width, T* data_vol) { int padding_width, int output_detph, int output_height,
int output_width, T* data_vol) {
const int d_filter_depth = dilation_d * (filter_depth - 1) + 1;
const int d_filter_height = dilation_h * (filter_height - 1) + 1;
const int d_filter_width = dilation_w * (filter_width - 1) + 1;
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels; for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
index += blockDim.x * gridDim.x) { index += blockDim.x * gridDim.x) {
T src_val = 0; T src_val = 0;
...@@ -115,35 +147,41 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth, ...@@ -115,35 +147,41 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth,
int h = (index / width) % height + padding_height; int h = (index / width) % height + padding_height;
int d = (index / width / height) % depth + padding_depth; int d = (index / width / height) % depth + padding_depth;
int c = index / width / height / depth; int c = index / width / height / depth;
// compute the start and end of the output // compute the start and end of the output
int w_col_start = int w_col_start =
(w < filter_width) ? 0 : (w - filter_width) / stride_width + 1; (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
int w_col_end = min(w / stride_width + 1, output_width); int w_col_end = min(w / stride_width + 1, output_width);
int h_col_start = int h_col_start =
(h < filter_height) ? 0 : (h - filter_height) / stride_height + 1; (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
int h_col_end = min(h / stride_height + 1, output_height); int h_col_end = min(h / stride_height + 1, output_height);
int d_col_start = int d_col_start =
(d < filter_depth) ? 0 : (d - filter_depth) / stride_depth + 1; (d < d_filter_depth) ? 0 : (d - d_filter_depth) / stride_depth + 1;
int d_col_end = min(d / stride_depth + 1, output_detph); int d_col_end = min(d / stride_depth + 1, output_detph);
int offset = (c * filter_depth * filter_height * filter_width +
d * filter_width * filter_height + h * filter_width + w) *
output_detph * output_height * output_width;
int coeff_d_col =
(1 - stride_depth * filter_width * filter_height * output_detph) *
output_height * output_width;
int coeff_h_col =
(1 - stride_height * filter_width * output_detph * output_height) *
output_width;
int coeff_w_col =
(1 - stride_width * output_detph * output_height * output_width);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) { for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
src_val += data_col[offset + d_col * coeff_d_col + int d_off = (d - d_col * stride_depth);
h_col * coeff_h_col + w_col * coeff_w_col]; int h_off = (h - h_col * stride_height);
int w_off = (w - w_col * stride_width);
if (d_off % dilation_d == 0 && h_off % dilation_h == 0 &&
w_off % dilation_w == 0) {
d_off /= dilation_d;
h_off /= dilation_h;
w_off /= dilation_w;
int data_col_index =
(((((c * filter_depth + d_off) * filter_height + h_off) *
filter_width +
w_off)));
data_col_index =
((data_col_index * output_detph + d_col) * output_height +
h_col) *
output_width +
w_col;
src_val += data_col[data_col_index];
}
} }
} }
} }
...@@ -161,17 +199,18 @@ template <class T> ...@@ -161,17 +199,18 @@ template <class T>
class Col2VolFunctor<platform::GPUPlace, T> { class Col2VolFunctor<platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& vol, const framework::Tensor& col, const framework::Tensor& col,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const { const std::vector<int>& paddings,
PADDLE_ENFORCE(vol.dims().size() == 4); framework::Tensor* vol) const {
PADDLE_ENFORCE(vol->dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7); PADDLE_ENFORCE(col.dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol->dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol->dims()[1];
int input_height = vol.dims()[2]; int input_height = vol->dims()[2];
int input_width = vol.dims()[3]; int input_width = vol->dims()[3];
int filter_depth = col.dims()[1]; int filter_depth = col.dims()[1];
int filter_height = col.dims()[2]; int filter_height = col.dims()[2];
int filter_width = col.dims()[3]; int filter_width = col.dims()[3];
...@@ -179,6 +218,28 @@ class Col2VolFunctor<platform::GPUPlace, T> { ...@@ -179,6 +218,28 @@ class Col2VolFunctor<platform::GPUPlace, T> {
int output_height = col.dims()[5]; int output_height = col.dims()[5];
int output_width = col.dims()[6]; int output_width = col.dims()[6];
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1,
output_depth,
"input_depth and output_depth are "
"Mismatching.");
PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1,
output_height,
"input_height and output_height are "
"Mismatching.");
PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1,
output_width,
"input_width and output_width are "
"Mismatching.");
int num_kernels = input_channels * input_depth * input_height * input_width; int num_kernels = input_channels * input_depth * input_height * input_width;
const int threads = 1024; const int threads = 1024;
...@@ -188,9 +249,10 @@ class Col2VolFunctor<platform::GPUPlace, T> { ...@@ -188,9 +249,10 @@ class Col2VolFunctor<platform::GPUPlace, T> {
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>( .stream()>>>(
num_kernels, col.data<T>(), input_depth, input_height, input_width, num_kernels, col.data<T>(), input_depth, input_height, input_width,
filter_depth, filter_height, filter_width, stride_depth, stride_height, dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
stride_width, padding_depth, padding_height, padding_width, filter_width, strides[0], strides[1], strides[2], paddings[0],
output_depth, output_height, output_width, vol.data<T>()); paddings[1], paddings[2], output_depth, output_height, output_width,
vol->data<T>());
} }
}; };
......
...@@ -31,6 +31,15 @@ namespace math { ...@@ -31,6 +31,15 @@ namespace math {
* \param colData Column data. * \param colData Column data.
* \param colShape The shape of colData. * \param colShape The shape of colData.
* *
* \param dilations Dilation data, 3-dimension
*                  [dilation_depth, dilation_height, dilation_width].
*
* \param strides   Stride data, 3-dimension
*                  [stride_depth, stride_height, stride_width].
*
* \param paddings  Padding data, 3-dimension
*                  [d_pad, h_pad, w_pad].
*
* The shape of colData is: * The shape of colData is:
* [input_channels, filter_depth, filter_height, filter_width, output_depth, * [input_channels, filter_depth, filter_height, filter_width, output_depth,
* output_height, output_width] * output_height, output_width]
...@@ -57,20 +66,22 @@ template <typename Place, typename T> ...@@ -57,20 +66,22 @@ template <typename Place, typename T>
class Vol2ColFunctor { class Vol2ColFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& vol, framework::Tensor& col, const framework::Tensor& vol,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const; const std::vector<int>& paddings,
framework::Tensor* col) const;
}; };
template <typename Place, typename T> template <typename Place, typename T>
class Col2VolFunctor { class Col2VolFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& vol, const framework::Tensor& col, const framework::Tensor& col,
int stride_depth, int stride_height, int stride_width, const std::vector<int>& dilations,
int padding_depth, int padding_height, const std::vector<int>& strides,
int padding_width) const; const std::vector<int>& paddings,
framework::Tensor* vol) const;
}; };
} // namespace math } // namespace math
......
...@@ -62,11 +62,15 @@ void testVol2col() { ...@@ -62,11 +62,15 @@ void testVol2col() {
int input_height = 2; int input_height = 2;
int input_width = 3; int input_width = 3;
int filter_size = 2; int filter_size = 2;
int stride = 1; std::vector<int> strides({1, 1, 1});
int padding = 0; std::vector<int> paddings({0, 0, 0});
int output_depth = (input_depth - filter_size + 2 * padding) / stride + 1; std::vector<int> dilations({1, 1, 1});
int output_height = (input_height - filter_size + 2 * padding) / stride + 1; int output_depth =
int output_width = (input_width - filter_size + 2 * padding) / stride + 1; (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
int output_height =
(input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
int output_width =
(input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
// Vol2Col test // Vol2Col test
float* input_ptr = float* input_ptr =
...@@ -85,8 +89,7 @@ void testVol2col() { ...@@ -85,8 +89,7 @@ void testVol2col() {
*place); *place);
paddle::operators::math::Vol2ColFunctor<Place, float> vol2col; paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
vol2col(*context, input, output, stride, stride, stride, padding, padding, vol2col(*context, input, dilations, strides, paddings, &output);
padding);
float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
float* out_cfo_ptr; float* out_cfo_ptr;
...@@ -111,8 +114,7 @@ void testVol2col() { ...@@ -111,8 +114,7 @@ void testVol2col() {
} }
paddle::operators::math::Col2VolFunctor<Place, float> col2vol; paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
col2vol(*context, input, output, stride, stride, stride, padding, padding, col2vol(*context, output, dilations, strides, paddings, &input);
padding);
float* in_ptr; float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
#pragma once #pragma once
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/matmul.h" #include "paddle/operators/math/matmul.h"
#include "paddle/operators/transpose_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -76,7 +76,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, ...@@ -76,7 +76,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
if (in_dims.size() == 3) { if (in_dims.size() == 3) {
output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace()); output.mutable_data<T>(context.GetPlace());
EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2}); std::vector<int> axis = {1, 0, 2};
math::Transpose<Place, T, 3> trans;
trans(context.device_context(), input, &output, axis);
std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
} else { } else {
output.ShareDataWith(input); output.ShareDataWith(input);
......
...@@ -81,22 +81,21 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> { ...@@ -81,22 +81,21 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
if (in_x_grad) { if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace()); in_x_grad->mutable_data<T>(context.GetPlace());
auto temp = framework::EigenVector<T>::Flatten(*in_x_grad); auto& device_ctx = context.device_context();
temp.device(context.GetEigenDevice<Place>()) = math::set_constant(device_ctx, in_x_grad, 0);
temp.constant(static_cast<T>(0));
switch (ksize.size()) { switch (ksize.size()) {
case 2: { case 2: {
paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T> paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
pool2d_backward; pool2d_backward;
pool2d_backward(context.device_context(), *out_grad, *mask, ksize, pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
strides, paddings, in_x_grad); paddings, in_x_grad);
} break; } break;
case 3: { case 3: {
paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T> paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
pool3d_backward; pool3d_backward;
pool3d_backward(context.device_context(), *out_grad, *mask, ksize, pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
strides, paddings, in_x_grad); paddings, in_x_grad);
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
} }
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/sequence_conv_op.h" #include "paddle/operators/sequence_conv_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/context_project.h" #include "paddle/operators/math/context_project.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
...@@ -62,9 +61,9 @@ class SequenceConvKernel : public framework::OpKernel<T> { ...@@ -62,9 +61,9 @@ class SequenceConvKernel : public framework::OpKernel<T> {
math::ContextProjectFunctor<Place, T> seq_project_functor; math::ContextProjectFunctor<Place, T> seq_project_functor;
seq_project_functor(context.device_context(), *in, *padding_data, col, seq_project_functor(context.device_context(), *in, *padding_data,
padding_trainable, context_start, context_length, padding_trainable, context_start, context_length,
context_stride, up_pad, down_pad); context_stride, up_pad, down_pad, &col);
math::matmul<Place, T>(context.device_context(), col, false, filter, false, math::matmul<Place, T>(context.device_context(), col, false, filter, false,
static_cast<T>(1.0), out, static_cast<T>(0.0)); static_cast<T>(1.0), out, static_cast<T>(0.0));
...@@ -117,10 +116,10 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -117,10 +116,10 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
in_g->set_lod(in->lod()); in_g->set_lod(in->lod());
set_zero(context.device_context(), in_g, static_cast<T>(0)); set_zero(context.device_context(), in_g, static_cast<T>(0));
seq_project_grad_functor(context.device_context(), *in_g, *padding_data_g, seq_project_grad_functor(context.device_context(), *in_g,
col, padding_trainable, context_start, padding_trainable, context_start, context_length,
context_length, context_stride, up_pad, down_pad, context_stride, up_pad, down_pad, false, true,
true, false); padding_data_g, &col);
} }
if (padding_trainable && padding_data_g) { if (padding_trainable && padding_data_g) {
...@@ -129,9 +128,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -129,9 +128,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
LoDTensor* input = const_cast<LoDTensor*>(in); LoDTensor* input = const_cast<LoDTensor*>(in);
seq_project_grad_functor(context.device_context(), *input, seq_project_grad_functor(context.device_context(), *input,
*padding_data_g, col, padding_trainable, padding_trainable, context_start, context_length,
context_start, context_length, context_stride, context_stride, up_pad, down_pad, true, false,
up_pad, down_pad, false, true); padding_data_g, &col);
} }
if (filter_g) { if (filter_g) {
...@@ -146,9 +145,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> { ...@@ -146,9 +145,9 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
padding_data = context.Input<Tensor>("PaddingData"); padding_data = context.Input<Tensor>("PaddingData");
} }
seq_project_functor(context.device_context(), *in, *padding_data, col, seq_project_functor(context.device_context(), *in, *padding_data,
padding_trainable, context_start, context_length, padding_trainable, context_start, context_length,
context_stride, up_pad, down_pad); context_stride, up_pad, down_pad, &col);
math::matmul<Place, T>(context.device_context(), col, true, out_grad, math::matmul<Place, T>(context.device_context(), col, true, out_grad,
false, T(1.0), &filter_grad, T(1.0)); false, T(1.0), &filter_grad, T(1.0));
......
...@@ -20,11 +20,11 @@ namespace paddle { ...@@ -20,11 +20,11 @@ namespace paddle {
namespace operators { namespace operators {
namespace { namespace {
template <typename T> template <typename T, int block_size>
__global__ void SparseSGDFunctorKernel(const T* selected_rows, __global__ void SparseSGDFunctorKernel(const T* selected_rows,
const int64_t* rows, const int64_t* rows,
const T* learning_rate, T* tensor_out, const T* learning_rate, T* tensor_out,
int64_t row_numel, int block_size) { int64_t row_numel) {
const int ty = blockIdx.y; const int ty = blockIdx.y;
int tid = threadIdx.x; int tid = threadIdx.x;
...@@ -59,14 +59,15 @@ struct SparseSGDFunctor<platform::GPUPlace, T> { ...@@ -59,14 +59,15 @@ struct SparseSGDFunctor<platform::GPUPlace, T> {
auto* in_data = in_value.data<T>(); auto* in_data = in_value.data<T>();
auto* out_data = output->data<T>(); auto* out_data = output->data<T>();
int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid(1, in_rows.size()); dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel< SparseSGDFunctorKernel<
T><<<grid, threads, 0, T, 256><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(in_data, in_rows.data(), learning_rate.data<T>(), .stream()>>>(in_data, in_rows.data(),
out_data, in_row_numel, block_size); learning_rate.data<T>(), out_data,
in_row_numel);
} }
}; };
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/operators/softmax_with_cross_entropy_op.h" #include "paddle/operators/softmax_with_cross_entropy_op.h"
#include <paddle/function/TensorType.h> #include <paddle/function/TensorType.h>
#include <iostream>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -12,7 +12,7 @@ limitations under the License. */ ...@@ -12,7 +12,7 @@ limitations under the License. */
#include "paddle/operators/sum_op.h" #include "paddle/operators/sum_op.h"
#include <vector> #include <vector>
#include "paddle/framework/var_type_inference.h" #include "paddle/framework/var_type_inference.h"
#include "paddle/operators/net_op.h" #include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,7 +60,8 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -60,7 +60,8 @@ class SumOp : public framework::OperatorWithKernel {
x_vars[0]->Get<framework::SelectedRows>().value().type()), x_vars[0]->Get<framework::SelectedRows>().value().type()),
ctx.device_context()); ctx.device_context());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) { } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
auto& array = x_vars[0]->Get<framework::LoDTensorArray>(); for (auto& x_var : x_vars) {
auto& array = x_var->Get<framework::LoDTensorArray>();
for (auto& each : array) { for (auto& each : array) {
if (each.numel() != 0) { if (each.numel() != 0) {
return framework::OpKernelType(framework::ToDataType(each.type()), return framework::OpKernelType(framework::ToDataType(each.type()),
...@@ -68,6 +69,8 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -68,6 +69,8 @@ class SumOp : public framework::OperatorWithKernel {
} }
} }
} }
PADDLE_THROW("Cannot find the input data type by all input data");
}
PADDLE_THROW("Unexpected branch. Input type is %s", PADDLE_THROW("Unexpected branch. Input type is %s",
x_vars[0]->Type().name()); x_vars[0]->Type().name());
} }
...@@ -97,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -97,6 +100,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
auto& inputs = op_desc.Input("X"); auto& inputs = op_desc.Input("X");
auto var_type = framework::VarDesc::SELECTED_ROWS; auto var_type = framework::VarDesc::SELECTED_ROWS;
for (auto& name : op_desc.Input("X")) {
VLOG(10) << name << " "
<< block->FindRecursiveOrCreateVar(name)->GetType();
}
bool any_input_is_lod_tensor = std::any_of( bool any_input_is_lod_tensor = std::any_of(
inputs.begin(), inputs.end(), [block](const std::string& name) { inputs.begin(), inputs.end(), [block](const std::string& name) {
return block->FindRecursiveOrCreateVar(name)->GetType() == return block->FindRecursiveOrCreateVar(name)->GetType() ==
...@@ -104,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -104,7 +112,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
}); });
auto is_tensor_array = [block](const std::string& name) { auto is_tensor_array = [block](const std::string& name) {
return block->FindRecursiveOrCreateVar(name)->GetType() == return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
framework::VarDesc::LOD_TENSOR_ARRAY; framework::VarDesc::LOD_TENSOR_ARRAY;
}; };
...@@ -114,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference { ...@@ -114,14 +122,26 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
std::all_of(inputs.begin(), inputs.end(), is_tensor_array); std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
if (any_input_is_tensor_array) { if (any_input_is_tensor_array) {
PADDLE_ENFORCE(all_inputs_are_tensor_array); if (!all_inputs_are_tensor_array) {
std::ostringstream os;
for (auto& each : inputs) {
os << " " << each << " type is "
<< detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType()
<< "\n";
}
PADDLE_ENFORCE(all_inputs_are_tensor_array,
"Not all inputs are tensor array:\n%s", os.str());
}
var_type = framework::VarDesc::LOD_TENSOR_ARRAY; var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
} else if (any_input_is_lod_tensor) { } else if (any_input_is_lod_tensor) {
var_type = framework::VarDesc::LOD_TENSOR; var_type = framework::VarDesc::LOD_TENSOR;
} }
auto out_var_name = op_desc.Output("Out").front(); auto out_var_name = op_desc.Output("Out").front();
block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type); auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name));
out_var.SetType(var_type);
auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
out_var.SetDataType(in_var.GetDataType());
} }
}; };
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/array_operator.h" #include "paddle/operators/array_operator.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp { ...@@ -33,6 +33,8 @@ class WriteToArrayOp : public ArrayOp {
auto *out = auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>(); scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
if (offset >= out->size()) { if (offset >= out->size()) {
VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
<< " to " << offset + 1;
out->resize(offset + 1); out->resize(offset + 1);
} }
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
...@@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { ...@@ -85,11 +87,15 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDescBind &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDescBind *block) const override {
for (auto &out_var : op_desc.OutputArgumentNames()) { auto x_name = op_desc.Input("X")[0];
VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY"; auto out_name = op_desc.Output("Out")[0];
block->FindRecursiveOrCreateVar(out_var)->SetType( VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
framework::VarDesc::LOD_TENSOR_ARRAY); auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
} "Cannot found %s", out_name);
out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
auto &x =
detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name);
out.SetDataType(x.GetDataType());
} }
}; };
...@@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -107,11 +113,11 @@ class ReadFromArrayOp : public ArrayOp {
auto &x_array = x->Get<framework::LoDTensorArray>(); auto &x_array = x->Get<framework::LoDTensorArray>();
auto *out = scope.FindVar(Output("Out")); auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(out != nullptr, "Out must be set"); PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tesnor = out->GetMutable<framework::LoDTensor>(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, dev_ctx); size_t offset = GetOffset(scope, dev_ctx);
PADDLE_ENFORCE_LT(offset, x_array.size()); PADDLE_ENFORCE_LT(offset, x_array.size());
out_tesnor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx);
out_tesnor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} }
}; };
......
...@@ -14,61 +14,58 @@ ...@@ -14,61 +14,58 @@
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename Place, typename T, int Rank>
void EigenTranspose(const framework::ExecutionContext& context,
const framework::Tensor& in, framework::Tensor& out,
std::vector<int> axis) {
Eigen::array<int, Rank> permute;
for (int i = 0; i < Rank; i++) {
permute[i] = axis[i];
}
auto in_dim = in.dims();
auto out_dim = out.dims();
auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(out);
auto& dev = context.GetEigenDevice<Place>();
eigen_out.device(dev) = eigen_in.shuffle(permute);
}
template <typename Place, typename T> template <typename Place, typename T>
class TransposeKernel : public framework::OpKernel<T> { inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx,
public: const framework::Tensor& in, framework::Tensor* out,
void Compute(const framework::ExecutionContext& context) const override { const std::vector<int>& axis) {
auto* x = context.Input<framework::Tensor>("X"); switch (dim) {
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
int ndims = axis.size();
switch (ndims) {
case 1: case 1:
EigenTranspose<Place, T, 1>(context, *x, *out, axis); math::Transpose<Place, T, 1> trans1;
trans1(dev_ctx, in, out, axis);
break; break;
case 2: case 2:
EigenTranspose<Place, T, 2>(context, *x, *out, axis); math::Transpose<Place, T, 2> trans2;
trans2(dev_ctx, in, out, axis);
break; break;
case 3: case 3:
EigenTranspose<Place, T, 3>(context, *x, *out, axis); math::Transpose<Place, T, 3> trans3;
trans3(dev_ctx, in, out, axis);
break; break;
case 4: case 4:
EigenTranspose<Place, T, 4>(context, *x, *out, axis); math::Transpose<Place, T, 4> trans4;
trans4(dev_ctx, in, out, axis);
break; break;
case 5: case 5:
EigenTranspose<Place, T, 5>(context, *x, *out, axis); math::Transpose<Place, T, 5> trans5;
trans5(dev_ctx, in, out, axis);
break; break;
case 6: case 6:
EigenTranspose<Place, T, 6>(context, *x, *out, axis); math::Transpose<Place, T, 6> trans6;
trans6(dev_ctx, in, out, axis);
break; break;
default: default:
PADDLE_THROW("Tensors with rank at most 6 are supported"); PADDLE_THROW("Tensors with rank at most 6 are supported");
} }
}
template <typename Place, typename T>
class TransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace());
std::vector<int> axis = context.Attr<std::vector<int>>("axis");
int ndims = axis.size();
auto& dev_ctx = context.device_context();
TransCompute<Place, T>(ndims, dev_ctx, *x, out, axis);
} }
}; };
...@@ -80,9 +77,9 @@ class TransposeGradKernel : public framework::OpKernel<T> { ...@@ -80,9 +77,9 @@ class TransposeGradKernel : public framework::OpKernel<T> {
context.Input<framework::Tensor>(framework::GradVarName("Out")); context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* x_grad = auto* x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X")); context.Output<framework::Tensor>(framework::GradVarName("X"));
if (x_grad) { if (!x_grad) return;
x_grad->mutable_data<T>(context.GetPlace());
x_grad->mutable_data<T>(context.GetPlace());
std::vector<int> axis = context.Attr<std::vector<int>>("axis"); std::vector<int> axis = context.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis); std::vector<int> reversed_axis(axis);
...@@ -91,36 +88,8 @@ class TransposeGradKernel : public framework::OpKernel<T> { ...@@ -91,36 +88,8 @@ class TransposeGradKernel : public framework::OpKernel<T> {
} }
int ndims = axis.size(); int ndims = axis.size();
auto& dev_ctx = context.device_context();
switch (ndims) { TransCompute<Place, T>(ndims, dev_ctx, *out_grad, x_grad, reversed_axis);
case 1:
EigenTranspose<Place, T, 1>(context, *out_grad, *x_grad,
reversed_axis);
break;
case 2:
EigenTranspose<Place, T, 2>(context, *out_grad, *x_grad,
reversed_axis);
break;
case 3:
EigenTranspose<Place, T, 3>(context, *out_grad, *x_grad,
reversed_axis);
break;
case 4:
EigenTranspose<Place, T, 4>(context, *out_grad, *x_grad,
reversed_axis);
break;
case 5:
EigenTranspose<Place, T, 5>(context, *out_grad, *x_grad,
reversed_axis);
break;
case 6:
EigenTranspose<Place, T, 6>(context, *out_grad, *x_grad,
reversed_axis);
break;
default:
PADDLE_THROW("Tensors with rank at most 6 are supported");
}
}
} }
}; };
......
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
#include <vector> #include <vector>
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor; ...@@ -26,8 +28,9 @@ using LoDTensor = framework::LoDTensor;
constexpr char kStepBlock[] = "step_block"; constexpr char kStepBlock[] = "step_block";
constexpr char kCondition[] = "Condition"; constexpr char kCondition[] = "Condition";
constexpr char kStepScopes[] = "StepScopes"; constexpr char kStepScopes[] = "StepScopes";
constexpr char kParamGrads[] = "X@Grad";
constexpr char kParameters[] = "X"; constexpr char kParameters[] = "X";
constexpr char kParamGrads[] = "X@GRAD";
constexpr char kOutputs[] = "Out";
class WhileOp : public framework::OperatorBase { class WhileOp : public framework::OperatorBase {
public: public:
...@@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -71,9 +74,9 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
kCondition, kCondition,
"(Bool) An scalar. When it's False, the While Op will be terminated.") "(Bool) An scalar. When it's False, the While Op will be terminated.")
.AsDuplicable(); .AsDuplicable();
AddOutput("Out", AddOutput(kOutputs,
"A set of variables, which will be assigned with values " "A set of variables, which will be assigned with values "
"generated by perators inside the block of While Op.") "generated by the operators inside the block of While Op.")
.AsDuplicable(); .AsDuplicable();
AddOutput(kStepScopes, AddOutput(kStepScopes,
"(StepScopeVar) A vector of local scope, which size equals the " "(StepScopeVar) A vector of local scope, which size equals the "
...@@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -104,17 +107,64 @@ class WhileGradOp : public framework::OperatorBase {
auto *step_scopes = auto *step_scopes =
scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>(); scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
auto inside_og_names =
Attr<std::vector<std::string>>("original_output_grad");
PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
for (auto cur_scope_iter = step_scopes->rbegin(); for (auto cur_scope_iter = step_scopes->rbegin();
cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
VLOG(3) << "Start backward at time_step "
<< cur_scope_iter - step_scopes->rbegin();
framework::Scope &cur_scope = **cur_scope_iter;
// Link OG from outside to inside
for (size_t i = 0; i < outside_og_names.size(); ++i) {
auto outside_og_name = outside_og_names[i];
auto inside_og_name = inside_og_names[i];
VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
<< inside_og_name;
auto &og_outside = detail::Ref(scope.FindVar(outside_og_name));
auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name));
if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensor).hash_code()) {
auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
auto &inside_tensor =
detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
inside_tensor.set_lod(outside_tensor.lod());
inside_tensor.ShareDataWith(outside_tensor);
} else if (og_outside.Type().hash_code() ==
typeid(framework::LoDTensorArray).hash_code()) {
auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
auto &inside_array =
detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
VLOG(10) << outside_og_name << " size = " << outside_array.size();
inside_array.resize(outside_array.size());
for (size_t j = 0; j < inside_array.size(); ++j) {
VLOG(10) << j << " " << outside_array[j].numel();
if (outside_array[j].numel() != 0) {
inside_array[j].set_lod(outside_array[j].lod());
inside_array[j].ShareDataWith(outside_array[j]);
} else {
PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
}
}
}
}
executor.Run(*program, *cur_scope_iter, block->ID(), false); executor.Run(*program, *cur_scope_iter, block->ID(), false);
auto &pg_names = Outputs(kParamGrads); auto &pg_names = Outputs(kParamGrads);
auto &p_names = Inputs(kParameters); auto &p_names = Inputs(kParameters);
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) { for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
auto inside_grad_name = framework::GradVarName(p_names[prog_id]); if (pg_names[param_id] == framework::kEmptyVarName) {
continue; // iterator doesn't have gradient
}
auto inside_grad_name = framework::GradVarName(p_names[param_id]);
// // TODO(tonyyang-savil: Not sure we need the following // // TODO(tonyyang-svail): Not sure we need the following
// // If does not compute gradient of that variable inside rnn, // // If does not compute gradient of that variable inside rnn,
// just // just
// // continue // // continue
...@@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -126,7 +176,7 @@ class WhileGradOp : public framework::OperatorBase {
// zero gradient variable in step 0 // zero gradient variable in step 0
if (cur_scope_iter == step_scopes->rbegin()) { if (cur_scope_iter == step_scopes->rbegin()) {
auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
auto &inside_tensor = var->Get<framework::LoDTensor>(); auto &inside_tensor = var->Get<framework::LoDTensor>();
framework::AttributeMap attrs; framework::AttributeMap attrs;
...@@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -135,27 +185,18 @@ class WhileGradOp : public framework::OperatorBase {
attrs["value"] = 0.0f; attrs["value"] = 0.0f;
auto zero_op = framework::OpRegistry::CreateOp( auto zero_op = framework::OpRegistry::CreateOp(
"fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs); "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
zero_op->Run(scope, dev_ctx); zero_op->Run(scope, dev_ctx);
} }
} }
// sum gradient // sum gradient
auto *outside_var = scope.FindVar(pg_names[prog_id]); auto new_inside_name = cur_scope.Rename(inside_grad_name);
PADDLE_ENFORCE_NOT_NULL(outside_var);
auto &outside_tensor = *outside_var->GetMutable<framework::LoDTensor>();
std::string result_var_name;
auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name);
auto &local_result_tensor =
*local_result_var->GetMutable<framework::LoDTensor>();
local_result_tensor.ShareDataWith(outside_tensor);
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {result_var_name, inside_grad_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {result_var_name}}}, {}); {{"Out", {pg_names[param_id]}}}, {});
sum_op->Run(**cur_scope_iter, dev_ctx); sum_op->Run(cur_scope, dev_ctx);
cur_scope.Rename(new_inside_name, inside_grad_name);
} }
} }
} }
...@@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -169,29 +210,110 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
virtual std::unique_ptr<framework::OpDescBind> Apply() const { virtual std::unique_ptr<framework::OpDescBind> Apply() const {
auto *grad = new framework::OpDescBind(); auto *grad = new framework::OpDescBind();
grad->SetType("while_grad"); grad->SetType("while_grad");
for (auto &input_param : this->InputNames()) { grad->SetInput(kParameters, Input(kParameters));
grad->SetInput(input_param, this->Input(input_param)); grad->SetOutput(
grad->SetOutput(framework::GradVarName(input_param), framework::GradVarName(kParameters),
this->InputGrad(input_param)); InputGrad(kParameters, /*do not drop empty gradient*/ false));
grad->SetInput(kOutputs, Output(kOutputs));
// OG should be re-calculated by step blocks, since many outputs of while op
// do not need to calculate gradients.
std::unordered_set<std::string> block_ins;
{
for (auto &p : Input(kParameters)) {
block_ins.insert(p);
}
for (auto &o : Output(kOutputs)) {
block_ins.insert(o);
}
}
std::unordered_set<std::string> extra_inputs;
for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
for (auto &input_name : grad_block_[0]->Op(i)->InputArgumentNames()) {
if (block_ins.find(input_name) != block_ins.end()) {
continue;
}
extra_inputs.insert(input_name);
} }
for (auto &output_param : this->OutputNames()) { for (auto &output_name : grad_block_[0]->Op(i)->OutputArgumentNames()) {
grad->SetInput(output_param, this->Output(output_param)); block_ins.insert(output_name);
if (output_param != kStepScopes) {
grad->SetInput(framework::GradVarName(output_param),
this->OutputGrad(output_param));
} }
} }
std::vector<std::string> extra_inputs_list;
extra_inputs_list.resize(extra_inputs.size());
std::copy(extra_inputs.begin(), extra_inputs.end(),
extra_inputs_list.begin());
grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list);
grad->SetInput(kStepScopes, Output(kStepScopes));
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kStepBlock, *grad_block_[0]); grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
// record the original output gradient names, since the gradient name of
// while operator could be renamed.
grad->SetAttr("original_output_grad", extra_inputs_list);
return std::unique_ptr<framework::OpDescBind>(grad); return std::unique_ptr<framework::OpDescBind>(grad);
} }
}; };
// Var-type inference for while_grad: copies the variable type and data type
// of every parameter listed under kParameters onto the matching gradient
// variable listed under GradVarName(kParameters).
class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 public:
  // op_desc: description of the while_grad operator being inferred.
  // block:   block whose variables (searched recursively) are updated.
  void operator()(const framework::OpDescBind &op_desc,
                  framework::BlockDescBind *block) const override {
    auto p_names = op_desc.Input(kParameters);
    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
    // pg_names is indexed with positions taken from p_names below, so the two
    // lists must have the same length; fail loudly instead of reading out of
    // range.
    PADDLE_ENFORCE_EQ(p_names.size(), pg_names.size());
    for (size_t i = 0; i < p_names.size(); ++i) {
      // The parameter itself must exist; detail::Ref rejects a null pointer.
      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
      auto *g_var = block->FindVarRecursive(pg_names[i]);
      if (g_var != nullptr) {  // Gradient could be @EMPTY@
        VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
                << " type: " << p_var.GetType();
        g_var->SetType(p_var.GetType());
        g_var->SetDataType(p_var.GetDataType());
      }
    }
  }
};
// Shape inference for while_grad: every non-empty parameter gradient receives
// the dimensions of its corresponding parameter.
class WhileGradOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    // The results of these queries are not inspected here.
    // NOTE(review): presumably meant as presence checks -- confirm whether
    // they should be enforced.
    ctx->HasInputs(kParameters);
    ctx->HasOutputs(framework::GradVarName(kParameters));
    ctx->HasInputs(kOutputs);
    ctx->HasInputs(framework::GradVarName(kOutputs));

    auto param_names = ctx->Inputs(kParameters);
    auto grad_names = ctx->Outputs(kParamGrads);
    auto param_dims = ctx->GetInputsDim(kParameters);
    auto param_types = ctx->GetInputsVarType(kParameters);

    std::vector<std::string> target_names;
    std::vector<framework::DDim> target_dims;
    for (size_t idx = 0; idx < param_names.size(); ++idx) {
      // A gradient slot holding the empty-variable name has nothing to shape.
      if (grad_names[idx] == framework::kEmptyVarName) {
        continue;
      }
      const bool is_lod_tensor =
          param_types[idx] == framework::VarDesc::LOD_TENSOR;
      // not sure how to set the dim of LOD_TENSOR_ARRAY; it currently gets
      // the same treatment as a plain LOD_TENSOR.
      const bool is_tensor_array =
          param_types[idx] == framework::VarDesc::LOD_TENSOR_ARRAY;
      if (is_lod_tensor || is_tensor_array) {
        target_names.push_back(grad_names[idx]);
        target_dims.push_back(param_dims[idx]);
      }
    }
    ctx->SetDims(target_names, target_dims);
  }
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(while, paddle::operators::WhileOp, REGISTER_OPERATOR(while, paddle::operators::WhileOp,
paddle::operators::WhileOpMaker, paddle::operators::WhileOpMaker,
paddle::operators::WhileGradOpDescMaker); paddle::operators::WhileGradOpDescMaker);
REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
paddle::operators::WhileGradOpShapeInference,
paddle::operators::WhileGradOpVarTypeInference);
...@@ -62,6 +62,8 @@ extern void *cublas_dso_handle; ...@@ -62,6 +62,8 @@ extern void *cublas_dso_handle;
DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasSgemv_v2); \ __macro(cublasSgemv_v2); \
__macro(cublasDgemv_v2); \ __macro(cublasDgemv_v2); \
__macro(cublasSgemm_v2); \ __macro(cublasSgemm_v2); \
......
...@@ -109,5 +109,10 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, ...@@ -109,5 +109,10 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream), cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer"); "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
} }
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
"cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -60,6 +60,9 @@ void GpuMemcpySync(void *dst, const void *src, size_t count, ...@@ -60,6 +60,9 @@ void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream); size_t count, cudaStream_t stream);
//! Asynchronously set count bytes of memory starting at dst to value on the given stream.
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
......
...@@ -43,6 +43,54 @@ function ver2num() { ...@@ -43,6 +43,54 @@ function ver2num() {
set +e set +e
} }
function cpu_config() {
  # Pick defaults for KMP_AFFINITY and OMP_DYNAMIC from the Hyper-Threading
  # status reported by lscpu. Values already present in the environment are
  # never overwritten.
  # Only applies when MKLDNN or MKLML is enabled.
  # NOTE(review): the @WITH_...@ placeholders are presumably substituted when
  # this script is generated -- confirm against the build configuration.
  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF" ]; then
    return 0
  fi
  # "Thread(s) per core" value from lscpu, whitespace-trimmed by xargs.
  ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
  # Quoted so that an empty result (lscpu missing or no matching line) is a
  # valid test operand; anything other than exactly 1 is treated as HT ON.
  if [ "$ht" = "1" ]; then # HT is OFF
    if [ -z "$KMP_AFFINITY" ]; then
      export KMP_AFFINITY="granularity=fine,compact,0,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
      export OMP_DYNAMIC="FALSE"
    fi
  else # HT is ON
    if [ -z "$KMP_AFFINITY" ]; then
      export KMP_AFFINITY="granularity=fine,compact,1,0"
    fi
    if [ -z "$OMP_DYNAMIC" ]; then
      export OMP_DYNAMIC="True"
    fi
  fi
}
function threads_config() {
  # Derive defaults for OMP_NUM_THREADS and MKL_NUM_THREADS as
  # (number of processors) / (trainer_count), with a minimum of 1.
  # Values already present in the environment are never overwritten.
  # Arguments: the launcher's command line; the first "trainer_count<sep>N"
  # occurrence supplies N (defaults to 1 when absent or 0).
  # Only applies when MKLDNN or MKLML is enabled.
  # NOTE(review): the @WITH_...@ placeholders are presumably substituted when
  # this script is generated -- confirm against the build configuration.
  if [ "@WITH_MKLDNN@" == "OFF" ] && [ "@WITH_MKLML@" == "OFF" ]; then
    return 0
  fi
  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
  # Keep only the first match so that a repeated flag cannot produce a
  # multi-word value that breaks the tests and arithmetic below.
  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|head -n 1`
  # Missing flag or an explicit 0 both fall back to one trainer; the latter
  # would otherwise divide by zero.
  if [ -z "$trainers" ] || [ "$trainers" -eq 0 ]; then
    trainers=1
  fi
  threads=$((processors / trainers))
  if [ "$threads" -eq 0 ]; then
    threads=1
  fi
  if [ -z "$OMP_NUM_THREADS" ]; then
    export OMP_NUM_THREADS=$threads
  fi
  if [ -z "$MKL_NUM_THREADS" ]; then
    export MKL_NUM_THREADS=$threads
  fi
}
PADDLE_CONF_HOME="$HOME/.config/paddle" PADDLE_CONF_HOME="$HOME/.config/paddle"
mkdir -p ${PADDLE_CONF_HOME} mkdir -p ${PADDLE_CONF_HOME}
...@@ -92,9 +140,13 @@ else: ...@@ -92,9 +140,13 @@ else:
sys.exit(0) sys.exit(0)
EOF EOF
cpu_config
# echo $KMP_AFFINITY $OMP_DYNAMIC
case "$1" in case "$1" in
"train") "train")
threads_config $@
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS
${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
;; ;;
"merge_model") "merge_model")
......
...@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config, ...@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
} }
} }
if (FLAGS_use_mkldnn) {
CHECK_EQ(FLAGS_trainer_count, 1UL) << "MKLDNN only need 1 trainer";
}
if (testing) { if (testing) {
LOG(INFO) << "trainer: in testing mode"; LOG(INFO) << "trainer: in testing mode";
if (config_->getOptConfig().use_sparse_remote_updater() || if (config_->getOptConfig().use_sparse_remote_updater() ||
......
...@@ -2987,8 +2987,10 @@ def img_cmrnorm_layer(input, ...@@ -2987,8 +2987,10 @@ def img_cmrnorm_layer(input,
layer_attr=None): layer_attr=None):
""" """
Response normalization across feature maps. Response normalization across feature maps.
The details please refer to
`Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_. Reference:
ImageNet Classification with Deep Convolutional Neural Networks
http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
The example usage is: The example usage is:
...@@ -2997,7 +2999,7 @@ def img_cmrnorm_layer(input, ...@@ -2997,7 +2999,7 @@ def img_cmrnorm_layer(input,
norm = img_cmrnorm_layer(input=net, size=5) norm = img_cmrnorm_layer(input=net, size=5)
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: None | basestring :type name: basestring
:param input: The input of this layer. :param input: The input of this layer.
:type input: LayerOutput :type input: LayerOutput
:param size: Normalize in number of :math:`size` feature maps. :param size: Normalize in number of :math:`size` feature maps.
...@@ -3006,9 +3008,11 @@ def img_cmrnorm_layer(input, ...@@ -3006,9 +3008,11 @@ def img_cmrnorm_layer(input,
:type scale: float :type scale: float
:param power: The hyper-parameter. :param power: The hyper-parameter.
:type power: float :type power: float
:param num_channels: input layer's filers number or channels. If :param num_channels: The number of input channels. If the parameter is not set or
num_channels is None, it will be set automatically. set to None, its actual value will be automatically set to
:param layer_attr: Extra Layer Attribute. the channels number of the input.
:param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -3036,7 +3040,7 @@ def batch_norm_layer(input, ...@@ -3036,7 +3040,7 @@ def batch_norm_layer(input,
use_global_stats=None, use_global_stats=None,
mean_var_names=None): mean_var_names=None):
""" """
Batch Normalization Layer. The notation of this layer as follow. Batch Normalization Layer. The notation of this layer is as follows.
:math:`x` is the input features over a mini-batch. :math:`x` is the input features over a mini-batch.
...@@ -3050,8 +3054,10 @@ def batch_norm_layer(input, ...@@ -3050,8 +3054,10 @@ def batch_norm_layer(input,
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
The details of batch normalization please refer to this Reference:
`paper <http://arxiv.org/abs/1502.03167>`_. Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift
http://arxiv.org/abs/1502.03167
The example usage is: The example usage is:
...@@ -3061,48 +3067,47 @@ def batch_norm_layer(input, ...@@ -3061,48 +3067,47 @@ def batch_norm_layer(input,
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: batch normalization input. Better be linear activation. :param input: This layer's input which is to be performed batch normalization on.
Because there is an activation inside batch_normalization.
:type input: LayerOutput :type input: LayerOutput
:param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm. :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
requires cuDNN version greater or equal to v4 (>=v4). requires cuDNN version greater or equal to v4 (>=v4).
But cudnn_batch_norm is faster and needs less But cudnn_batch_norm is faster and needs less
memory than batch_norm. mkldnn_batch_norm requires memory than batch_norm. mkldnn_batch_norm requires
enable use_mkldnn. By default (None), we will use_mkldnn is enabled. By default (None), we will
automaticly select cudnn_batch_norm for GPU, automatically select cudnn_batch_norm for GPU,
mkldnn_batch_norm for MKLDNN and batch_norm for CPU. mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
Otherwise, select batch norm type based on the Users can specify the batch norm type. If you use
specified type. If you use cudnn_batch_norm, cudnn_batch_norm, we suggested you use latest version,
we suggested you use latest version, such as v5.1. such as v5.1.
:type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm" :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
or "mkldnn_batch_norm" or "mkldnn_batch_norm"
:param act: Activation Type. Better be relu. Because batch :param act: Activation type. ReluActivation is the default activation.
normalization will normalize input near zero.
:type act: BaseActivation :type act: BaseActivation
:param num_channels: num of image channels or previous layer's number of :param num_channels: The number of input channels. If the parameter is not set or
filters. None will automatically get from layer's set to None, its actual value will be automatically set to
input. the channels number of the input.
:type num_channels: int :type num_channels: int
:param bias_attr: :math:`\\beta`, better be zero when initialize. So the :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
initial_std=0, initial_mean=1 is best practice. False or an object whose type is not ParameterAttribute, no
bias is defined. If the parameter is set to True, the bias is
initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param param_attr: :math:`\\gamma`, better be one when initialize. So the :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
initial_std=0, initial_mean=1 is best practice. for details.
:type param_attr: ParameterAttribute :type param_attr: ParameterAttribute
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:param use_global_stats: whether use moving mean/variance statistics :param use_global_stats: Whether use moving mean/variance statistics during
during testing peroid. If None or True, testing peroid. If the parameter is set to None or
it will use moving mean/variance statistics during True, it will use moving mean/variance statistics
testing. If False, it will use the mean during testing. If the parameter is set to False, it
and variance of current batch of test data for will use the mean and variance of the current batch
testing. of test data.
:type use_global_stats: bool | None. :type use_global_stats: bool | None.
:param moving_average_fraction: Factor used in the moving average :param moving_average_fraction: Factor used in the moving average computation.
computation, referred to as facotr, :math:`runningMean = newMean*(1-factor) + runningMean*factor`
:math:`runningMean = newMean*(1-factor)
+ runningMean*factor`
:type moving_average_fraction: float. :type moving_average_fraction: float.
:param mean_var_names: [mean name, variance name] :param mean_var_names: [mean name, variance name]
:type mean_var_names: string list :type mean_var_names: string list
...@@ -3164,8 +3169,9 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): ...@@ -3164,8 +3169,9 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
:type input: LayerOutput :type input: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
:type layer_attr: ExtraLayerAttribute. for details.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
...@@ -3200,7 +3206,8 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): ...@@ -3200,7 +3206,8 @@ def row_l2_norm_layer(input, name=None, layer_attr=None):
:type input: LayerOutput :type input: LayerOutput
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param layer_attr: extra layer attributes. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
for details.
:type layer_attr: ExtraLayerAttribute. :type layer_attr: ExtraLayerAttribute.
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -3237,22 +3244,17 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): ...@@ -3237,22 +3244,17 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
act=ReluActivation(), act=ReluActivation(),
bias_attr=False) bias_attr=False)
This layer just simply add all input layers together, then activate the sum This layer just simply adds all input layers together, then activates the
inputs. Each input of this layer should be the same size, which is also the sum. All inputs should share the same dimension, which is also the dimension
output size of this layer. of this layer's output.
There is no weight matrix for each input, because it just a simple add There is no weight matrix for each input, because it just a simple add
operation. If you want a complicated operation before add, please use operation. If you want a complicated operation before add, please use
mixed_layer. mixed_layer.
It is a very good way to set dropout outside the layers. Since not all
PaddlePaddle layer support dropout, you can add an add_to layer, set
dropout here.
Please refer to dropout_layer for details.
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: Input layers. It could be a LayerOutput or list/tuple of :param input: The input layers. It could be a LayerOutput or list/tuple of
LayerOutput. LayerOutput.
:type input: LayerOutput | list | tuple :type input: LayerOutput | list | tuple
:param act: Activation Type. LinearActivation is the default activation. :param act: Activation Type. LinearActivation is the default activation.
...@@ -3261,7 +3263,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): ...@@ -3261,7 +3263,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero. parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any :type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -3300,8 +3303,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): ...@@ -3300,8 +3303,8 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
@layer_support(DROPOUT, ERROR_CLIPPING) @layer_support(DROPOUT, ERROR_CLIPPING)
def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
""" """
Concat all input vector into one huge vector. Concatenate all input vectors to one vector.
Inputs can be list of LayerOutput or list of projection. Inputs can be a list of LayerOutput or a list of projection.
The example usage is: The example usage is:
...@@ -3311,11 +3314,12 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): ...@@ -3311,11 +3314,12 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param input: input layers or projections :param input: The input layers or projections
:type input: list | tuple | collections.Sequence :type input: list | tuple | collections.Sequence
:param act: Activation type. IdentityActivation is the default activation. :param act: Activation type. IdentityActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
...@@ -3385,7 +3389,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): ...@@ -3385,7 +3389,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
bias_attr=None): bias_attr=None):
""" """
Concat sequence a with sequence b. Concatenate sequence a and sequence b.
Inputs: Inputs:
- a = [a1, a2, ..., am] - a = [a1, a2, ..., am]
...@@ -3404,13 +3408,14 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, ...@@ -3404,13 +3408,14 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
:param name: The name of this layer. It is optional. :param name: The name of this layer. It is optional.
:type name: basestring :type name: basestring
:param a: input sequence layer :param a: The first input sequence layer
:type a: LayerOutput :type a: LayerOutput
:param b: input sequence layer :param b: The second input sequence layer
:type b: LayerOutput :type b: LayerOutput
:param act: Activation type. IdentityActivation is the default activation. :param act: Activation type. IdentityActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param layer_attr: Extra Layer Attribute. :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
details.
:type layer_attr: ExtraLayerAttribute :type layer_attr: ExtraLayerAttribute
:param bias_attr: The bias attribute. If the parameter is set to False or an object :param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the whose type is not ParameterAttribute, no bias is defined. If the
...@@ -3447,31 +3452,25 @@ def memory(name, ...@@ -3447,31 +3452,25 @@ def memory(name,
boot_bias_active_type=None, boot_bias_active_type=None,
boot_with_const_id=None): boot_with_const_id=None):
""" """
The memory layers is a layer cross each time step. Reference this output The memory takes a layer's output at previous time step as its own output.
as previous time step layer :code:`name` 's output.
The default memory is zero in first time step, previous time step's If boot_bias, the activation of the bias is the initial value of the memory.
output in the rest time steps.
If boot_bias, the first time step value is this bias and If boot_with_const_id is set, then the memory's output at the first time step
with activation. is a IndexSlot, the Arguments.ids()[0] is this :code:`cost_id`.
If boot_with_const_id, then the first time stop is a IndexSlot, the If boot_layer is specified, the memory's output at the first time step will
Arguments.ids()[0] is this :code:`cost_id`. be the boot_layer's output.
If boot_layer is not null, the memory is just the boot_layer's output. In other case, the default memory's output at the first time step is zero.
Set :code:`is_seq` is true boot layer is sequence.
The same name layer in recurrent group will set memory on each time
step.
.. code-block:: python .. code-block:: python
mem = memory(size=256, name='state') mem = memory(size=256, name='state')
state = fc_layer(input=mem, size=256, name='state') state = fc_layer(input=mem, size=256, name='state')
If you do not want to specify the name, you can equivalently use set_input() If you do not want to specify the name, you can also use set_input()
to specify the layer needs to be remembered as the following: to specify the layer to be remembered as the following:
.. code-block:: python .. code-block:: python
...@@ -3479,26 +3478,31 @@ def memory(name, ...@@ -3479,26 +3478,31 @@ def memory(name,
state = fc_layer(input=mem, size=256) state = fc_layer(input=mem, size=256)
mem.set_input(mem) mem.set_input(mem)
:param name: the name of the layer which this memory remembers. :param name: The name of the layer which this memory remembers.
If name is None, user should call set_input() to specify the If name is None, user should call set_input() to specify the
name of the layer which this memory remembers. name of the layer which this memory remembers.
:type name: basestring :type name: basestring
:param size: size of memory. :param size: The dimensionality of memory.
:type size: int :type size: int
:param memory_name: the name of the memory. :param memory_name: The name of the memory. It is ignored when name is provided.
It is ignored when name is provided.
:type memory_name: basestring :type memory_name: basestring
:param is_seq: DEPRECATED. is sequence for boot_layer :param is_seq: DEPRECATED. is sequence for boot_layer
:type is_seq: bool :type is_seq: bool
:param boot_layer: boot layer of memory. :param boot_layer: This parameter specifies memory's output at the first time
step and the output is boot_layer's output.
:type boot_layer: LayerOutput | None :type boot_layer: LayerOutput | None
:param boot_bias: boot layer's bias :param boot_bias: The bias attribute of memory's output at the first time step.
If the parameter is set to False or an object whose type is not
ParameterAttribute, no bias is defined. If the parameter is set
to True, the bias is initialized to zero.
:type boot_bias: ParameterAttribute | None :type boot_bias: ParameterAttribute | None
:param boot_bias_active_type: boot layer's active type. :param boot_bias_active_type: Activation type for memory's bias at the first time
step. LinearActivation is the default activation.
:type boot_bias_active_type: BaseActivation :type boot_bias_active_type: BaseActivation
:param boot_with_const_id: boot layer's id. :param boot_with_const_id: This parameter specifies memory's output at the first
time step and the output is an index.
:type boot_with_const_id: int :type boot_with_const_id: int
:return: LayerOutput object which is a memory. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
if boot_bias_active_type is None: if boot_bias_active_type is None:
......
...@@ -33,7 +33,6 @@ import networks ...@@ -33,7 +33,6 @@ import networks
import minibatch import minibatch
import plot import plot
import image import image
import model
import paddle.trainer.config_parser as cp import paddle.trainer.config_parser as cp
__all__ = [ __all__ = [
...@@ -58,7 +57,6 @@ __all__ = [ ...@@ -58,7 +57,6 @@ __all__ = [
'evaluator', 'evaluator',
'image', 'image',
'master', 'master',
'model',
] ]
cp.begin_parse() cp.begin_parse()
...@@ -78,6 +76,31 @@ def init(**kwargs): ...@@ -78,6 +76,31 @@ def init(**kwargs):
for key in args_dict.keys(): for key in args_dict.keys():
args.append('--%s=%s' % (key, str(args_dict[key]))) args.append('--%s=%s' % (key, str(args_dict[key])))
# auto set cpu environment
def set_env(key, value):
'''If the key has not been set in the environment, set it with value.'''
assert isinstance(key, str)
assert isinstance(value, str)
envset = os.environ.get(key)
if envset is None:
os.environ[key] = value
ht = os.popen("lscpu |grep \"per core\"|awk -F':' '{print $2}'|xargs")
ht = int(ht.read())
if ht == 1: # ht is off
set_env("OMP_DYNAMIC", "false")
set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
else:
set_env("OMP_DYNAMIC", "true")
set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
processors = os.popen("grep \"processor\" /proc/cpuinfo|sort -u|wc -l")
processors = int(processors.read())
trainers = kwargs.get('trainer_count', 1)
threads = processors / trainers
threads = '1' if threads < 1 else str(threads)
set_env("OMP_NUM_THREADS", threads)
set_env("MKL_NUM_THREADS", threads)
if 'use_gpu' in kwargs: if 'use_gpu' in kwargs:
cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
if 'use_mkldnn' in kwargs: if 'use_mkldnn' in kwargs:
......
...@@ -12,9 +12,9 @@ def unique_name(prefix): ...@@ -12,9 +12,9 @@ def unique_name(prefix):
return "_".join([prefix, str(uid)]) return "_".join([prefix, str(uid)])
def _debug_string_(proto): def _debug_string_(proto, throw_on_error=True):
error_fields = list() error_fields = list()
if not proto.IsInitialized(error_fields): if not proto.IsInitialized(error_fields) and throw_on_error:
raise ValueError("{0} are not initialized\nThe message is {1}".format( raise ValueError("{0} are not initialized\nThe message is {1}".format(
error_fields, proto)) error_fields, proto))
return proto.__str__() return proto.__str__()
...@@ -101,9 +101,12 @@ class Variable(object): ...@@ -101,9 +101,12 @@ class Variable(object):
self.stop_gradient = stop_gradient self.stop_gradient = stop_gradient
def __str__(self): def __str__(self):
return self.to_string(True)
def to_string(self, throw_on_error):
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.VarDesc.FromString(str(protostr)) proto = framework_pb2.VarDesc.FromString(str(protostr))
return _debug_string_(proto) return _debug_string_(proto, throw_on_error)
__repr__ = __str__ __repr__ = __str__
...@@ -291,10 +294,13 @@ class Operator(object): ...@@ -291,10 +294,13 @@ class Operator(object):
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
self.desc.infer_shape(self.block.desc) self.desc.infer_shape(self.block.desc)
def __str__(self): def to_string(self, throw_on_error):
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr)) proto = framework_pb2.OpDesc.FromString(str(protostr))
return _debug_string_(proto) return _debug_string_(proto, throw_on_error)
def __str__(self):
return self.to_string(True)
__repr__ = __str__ __repr__ = __str__
...@@ -349,9 +355,12 @@ class Block(object): ...@@ -349,9 +355,12 @@ class Block(object):
self.program = program self.program = program
def __str__(self): def __str__(self):
return self.to_string(True)
def to_string(self, throw_on_error):
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.BlockDesc.FromString(str(protostr)) proto = framework_pb2.BlockDesc.FromString(str(protostr))
return _debug_string_(proto) return _debug_string_(proto, throw_on_error)
__repr__ = __str__ __repr__ = __str__
...@@ -454,9 +463,12 @@ class Program(object): ...@@ -454,9 +463,12 @@ class Program(object):
self.current_block_idx = 0 self.current_block_idx = 0
def __str__(self): def __str__(self):
return self.to_string(True)
def to_string(self, throw_on_error):
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.ProgramDesc.FromString(str(protostr)) proto = framework_pb2.ProgramDesc.FromString(str(protostr))
return _debug_string_(proto) return _debug_string_(proto, throw_on_error)
def clone(self): def clone(self):
p = Program() p = Program()
...@@ -512,7 +524,14 @@ class Program(object): ...@@ -512,7 +524,14 @@ class Program(object):
assert isinstance(target, Variable) assert isinstance(target, Variable)
if no_grad_set is None: if no_grad_set is None:
no_grad_set = set() no_grad_set = set()
param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set) try:
param_to_grad_info = self.desc.append_backward(target.desc,
no_grad_set)
except Exception as e:
raise core.EnforceNotMet(
str(e) + "\nCurrent protobuf is\n{0}".format(
self.to_string(False)))
self.sync_with_cpp() self.sync_with_cpp()
return param_to_grad_info return param_to_grad_info
......
...@@ -165,7 +165,7 @@ def save_inference_model(dirname, ...@@ -165,7 +165,7 @@ def save_inference_model(dirname,
:param target_vars: Variables from which we can get inference results. :param target_vars: Variables from which we can get inference results.
:param executor: executor that save inference model :param executor: executor that save inference model
:param main_program: original program, which will be pruned to build the inference model. :param main_program: original program, which will be pruned to build the inference model.
Default g_program. Default g_main_program.
:return: None :return: None
""" """
...@@ -234,3 +234,35 @@ def load_inference_model(dirname, executor): ...@@ -234,3 +234,35 @@ def load_inference_model(dirname, executor):
fetch_vars = [program.global_block().var(name) for name in fetch_var_names] fetch_vars = [program.global_block().var(name) for name in fetch_var_names]
return [program, feed_var_names, fetch_vars] return [program, feed_var_names, fetch_vars]
def get_parameter_value(para, executor):
    """
    Fetch the current value of a parameter by running a throwaway program.

    :param para: the parameter to read; must satisfy is_parameter()
    :param executor: executor used to run the fetch program
    :return: the first element of the executor's result list
             (the LoDTensor for the parameter)
    """
    assert is_parameter(para)

    # A fresh, otherwise empty program whose global block only holds a clone
    # of the parameter, so that it can be named in fetch_list.
    fetch_program = Program()
    cloned_var = _clone_var_in_block_(fetch_program.global_block(), para)

    fetched = executor.run(fetch_program, feed={}, fetch_list=[cloned_var])
    return fetched[0]
def get_parameter_value_by_name(name, executor, program=None):
    """
    Fetch the current value of the parameter with the given name.

    :param name: the name of the parameter
    :param executor: executor used to retrieve the value
    :param program: the program whose global block is searched for the
                    variable. Default g_main_program.
    :return: the LoDTensor for the variable
    """
    source_program = g_main_program if program is None else program
    target_var = source_program.global_block().var(name)
    return get_parameter_value(target_var, executor)
...@@ -72,7 +72,7 @@ class LayerHelper(object): ...@@ -72,7 +72,7 @@ class LayerHelper(object):
@property @property
def bias_attr(self): def bias_attr(self):
default = {'name': None, 'initializer': XavierInitializer()} default = {'name': None, 'initializer': ConstantInitializer()}
bias_attr = self.kwargs.get('bias_attr', None) bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is None: if bias_attr is None:
bias_attr = default bias_attr = default
...@@ -149,24 +149,19 @@ class LayerHelper(object): ...@@ -149,24 +149,19 @@ class LayerHelper(object):
persistable=True, persistable=True,
initializer=initializer) initializer=initializer)
def append_bias_op(self, input_var, num_flatten_dims=None): def append_bias_op(self, input_var, dim_start=1, dim_end=None):
""" """
Append bias operator and return its output. If the user does not set Append bias operator and return its output. If the user does not set
bias_attr, append_bias_op will return input_var bias_attr, append_bias_op will return input_var
:param input_var: the input variable. The len(input_var.shape) is larger :param input_var: the input variable. The len(input_var.shape) is larger
or equal than 2. or equal than 2.
:param num_flatten_dims: The input tensor will be flatten as a matrix :param dim_start:
when adding bias. :param dim_end: the shape of the bias will be
`matrix.shape = product(input_var.shape[0:num_flatten_dims]), product( input_var.shape[dim_start:dim_end]. The bias is broadcasted to other
input_var.shape[num_flatten_dims:])` dimensions and added to input_var to get the output
""" """
if num_flatten_dims is None: size = list(input_var.shape[dim_start:dim_end])
num_flatten_dims = self.kwargs.get('num_flatten_dims', None)
if num_flatten_dims is None:
num_flatten_dims = 1
size = list(input_var.shape[num_flatten_dims:])
bias_attr = self.bias_attr bias_attr = self.bias_attr
if not bias_attr: if not bias_attr:
return input_var return input_var
...@@ -178,7 +173,8 @@ class LayerHelper(object): ...@@ -178,7 +173,8 @@ class LayerHelper(object):
type='elementwise_add', type='elementwise_add',
inputs={'X': [input_var], inputs={'X': [input_var],
'Y': [b]}, 'Y': [b]},
outputs={'Out': [tmp]}) outputs={'Out': [tmp]},
attrs={'axis': dim_start})
return tmp return tmp
def append_activation(self, input_var): def append_activation(self, input_var):
......
...@@ -694,7 +694,7 @@ def conv2d(input, ...@@ -694,7 +694,7 @@ def conv2d(input,
'paddings': padding, 'paddings': padding,
'groups': groups}) 'groups': groups})
pre_act = helper.append_bias_op(pre_bias, 1) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
return helper.append_activation(pre_act) return helper.append_activation(pre_act)
......
...@@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs): ...@@ -66,10 +66,13 @@ def parse_graph(program, graph, var_dict, **kwargs):
if not var_dict.has_key(var): if not var_dict.has_key(var):
var_dict[var] = "Feed" var_dict[var] = "Feed"
temp_id = 0
proto = framework_pb2.ProgramDesc.FromString( proto = framework_pb2.ProgramDesc.FromString(
program.desc.serialize_to_string()) program.desc.serialize_to_string())
for block in proto.blocks: for block in proto.blocks:
for op in block.ops: for op in block.ops:
op.type = op.type + "_" + str(temp_id)
temp_id += 1
graph.node(**draw_node(op)) graph.node(**draw_node(op))
for o in op.outputs: for o in op.outputs:
for arg in o.arguments: for arg in o.arguments:
...@@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs): ...@@ -78,6 +81,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
for arg in e.arguments: for arg in e.arguments:
if var_dict.has_key(arg): if var_dict.has_key(arg):
graph.edge(**draw_edge(var_dict, op, e, arg)) graph.edge(**draw_edge(var_dict, op, e, arg))
break # only plot the first block
def draw_graph(startup_program, main_program, **kwargs): def draw_graph(startup_program, main_program, **kwargs):
......
import unittest import unittest
import numpy as np import numpy as np
import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator
from op_test import OpTest from op_test import OpTest
import math
class TestAdagradOp1(OpTest): class TestAdagradOp1(OpTest):
...@@ -65,5 +68,110 @@ class TestAdagradOp2(OpTest): ...@@ -65,5 +68,110 @@ class TestAdagradOp2(OpTest):
self.check_output() self.check_output()
class TestSparseAdagradOp(unittest.TestCase):
    """Checks the adagrad operator when the gradient is a SelectedRows value."""

    def check_with_place(self, place):
        """Run one adagrad step on `place` and verify Moment and Param.

        The gradient lists row 4 twice; the expected values for that row
        correspond to both contributions being summed (1.0 + 1.0) before the
        update is applied.
        """
        scope = core.Scope()
        height, row_numel = 10, 12
        rows = [0, 4, 7, 4]

        # Grad: all ones, except for two hand-picked entries.
        grad_values = np.ones((len(rows), row_numel)).astype("float32")
        grad_values[0, 0] = 2.0
        grad_values[2, 8] = 4.0
        selected_rows = scope.var('Grad').get_selected_rows()
        selected_rows.set_height(height)
        selected_rows.set_rows(rows)
        selected_rows.get_tensor().set(grad_values, place)

        # Param: every element starts at 5.0.
        param = scope.var('Param').get_tensor()
        param.set(np.full((height, row_numel), 5.0).astype("float32"), place)

        # LearningRate: a single element holding 2.0.
        lr = scope.var('LearningRate').get_tensor()
        lr.set(np.full((1), 2.0).astype("float32"), place)

        # Moment: every element starts at 2.0.
        moment = scope.var('Moment').get_tensor()
        moment.set(np.full((height, row_numel), 2.0).astype("float32"), place)

        # Build and run the adagrad operator; its outputs are bound to the
        # same variable names as its inputs (Param and Moment).
        adagrad_op = Operator(
            "adagrad",
            Param='Param',
            Grad='Grad',
            ParamOut='Param',
            Moment='Moment',
            MomentOut='Moment',
            LearningRate='LearningRate',
            epsilon=2.0)
        adagrad_op.run(scope, core.DeviceContext.create(place))

        # Moment check: (expected value, row, column).
        moment_after = np.array(moment)
        expected_moments = [
            (6.0, rows[0], 0),
            (3.0, rows[0], 2),
            (2.0, 1, 0),
            # 2.0 + (1.0 + 1.0)^2
            (6.0, rows[1], 10),
            (6.0, rows[3], 4),
            (2.0, 5, 8),
            (3.0, rows[2], 1),
            (18.0, rows[2], 8),
        ]
        for expected, r, c in expected_moments:
            self.assertAlmostEqual(expected, moment_after[r, c])

        # Param check: every case starts from param=5.0 with lr=2.0 and
        # epsilon=2.0, so only the gradient and the updated moment vary.
        param_after = np.array(param)

        def expected_param(grad, m):
            return 5.0 - 2.0 * grad / (math.sqrt(m) + 2.0)

        # (gradient, updated moment, row, column)
        expected_params = [
            (2.0, 6.0, rows[0], 0),
            (1.0, 3.0, rows[0], 2),
            (0.0, 2.0, 1, 0),
            # grad_merge = 1.0 + 1.0
            # m = 6.0
            (2.0, 6.0, rows[1], 10),
            (0.0, 2.0, 5, 8),
            (1.0, 3.0, rows[2], 1),
            (4.0, 18.0, rows[2], 8),
        ]
        for grad, m, r, c in expected_params:
            self.assertAlmostEqual(
                expected_param(grad, m), param_after[r, c], places=5)

    def test_sparse_adagrad(self):
        """Run the check on CPU, and on GPU 0 when core reports GPU support."""
        places = [core.CPUPlace()]
        if core.is_compile_gpu():
            places.append(core.GPUPlace(0))
        for target_place in places:
            self.check_with_place(target_place)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
import logging
from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
import paddle.v2.fluid.core as core
import unittest
import numpy as np
def create_tensor(scope, name, np_data):
    """Create variable `name` in `scope` and load `np_data` into its tensor.

    The data is set with a CPU place; the tensor object is returned so the
    caller can configure it further.
    """
    variable = scope.var(name)
    cpu_tensor = variable.get_tensor()
    cpu_tensor.set(np_data, core.CPUPlace())
    return cpu_tensor
class BeamSearchOpTester(unittest.TestCase):
def setUp(self):
self.scope = core.Scope()
self.ctx = core.DeviceContext.create(core.CPUPlace())
self._create_ids()
self._create_scores()
self._create_pre_ids()
self.scope.var('selected_ids')
self.scope.var('selected_scores')
def test_run(self):
op = Operator(
'beam_search',
pre_ids="pre_ids",
ids='ids',
scores='scores',
selected_ids='selected_ids',
selected_scores='selected_scores',
level=0,
beam_size=2,
end_id=0, )
op.run(self.scope, self.ctx)
selected_ids = self.scope.find_var("selected_ids").get_tensor()
print 'selected_ids', np.array(selected_ids)
print 'lod', selected_ids.lod()
def _create_pre_ids(self):
np_data = np.array([[1, 2, 3, 4]], dtype='int32')
tensor = create_tensor(self.scope, "pre_ids", np_data)
def _create_ids(self):
self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
np_data = np.array(
[[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int32')
tensor = create_tensor(self.scope, "ids", np_data)
tensor.set_lod(self.lod)
def _create_scores(self):
np_data = np.array(
[
[0.5, 0.3, 0.2],
[0.6, 0.3, 0.1],
[0.9, 0.5, 0.1],
[0.7, 0.5, 0.1],
],
dtype='float32')
tensor = create_tensor(self.scope, "scores", np_data)
tensor.set_lod(self.lod)
# Run the tests in this file when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -10,23 +10,33 @@ def conv2d_forward_naive(input, filter, group, conv_param): ...@@ -10,23 +10,33 @@ def conv2d_forward_naive(input, filter, group, conv_param):
assert np.mod(out_c, group) == 0 assert np.mod(out_c, group) == 0
sub_out_c = out_c / group sub_out_c = out_c / group
stride, pad = conv_param['stride'], conv_param['pad'] stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
out_h = 1 + (in_h + 2 * pad[0] - f_h) / stride[0] 'dilation']
out_w = 1 + (in_w + 2 * pad[1] - f_w) / stride[1] out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) / stride[0]
out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
out = np.zeros((in_n, out_c, out_h, out_w)) out = np.zeros((in_n, out_c, out_h, out_w))
d_bolck_w = (dilation[0] * (f_h - 1) + 1)
d_bolck_h = (dilation[1] * (f_w - 1) + 1)
input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )), input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
mode='constant', mode='constant',
constant_values=0) constant_values=0)
filter_dilation = np.zeros((out_c, f_c, d_bolck_h, d_bolck_w))
filter_dilation[:, :, 0:d_bolck_h:dilation[0], 0:d_bolck_w:dilation[
1]] = filter
for i in range(out_h): for i in range(out_h):
for j in range(out_w): for j in range(out_w):
for g in range(group): for g in range(group):
input_pad_masked = \ input_pad_masked = \
input_pad[:, g * f_c:(g + 1) * f_c, input_pad[:, g * f_c:(g + 1) * f_c,
i * stride[0]:i * stride[0] + f_h, i * stride[0]:i * stride[0] + d_bolck_h,
j * stride[1]:j * stride[1] + f_w] j * stride[1]:j * stride[1] + d_bolck_w]
f_sub = filter[g * sub_out_c:(g + 1) * sub_out_c, :, :, :] f_sub = filter_dilation[g * sub_out_c:(g + 1) *
sub_out_c, :, :, :]
for k in range(sub_out_c): for k in range(sub_out_c):
out[:, g * sub_out_c + k, i, j] = \ out[:, g * sub_out_c + k, i, j] = \
np.sum(input_pad_masked * f_sub[k, :, :, :], np.sum(input_pad_masked * f_sub[k, :, :, :],
...@@ -39,9 +49,14 @@ class TestConv2dOp(OpTest): ...@@ -39,9 +49,14 @@ class TestConv2dOp(OpTest):
def setUp(self): def setUp(self):
self.init_op_type() self.init_op_type()
self.init_group() self.init_group()
self.init_dilation()
self.init_test_case() self.init_test_case()
conv2d_param = {'stride': self.stride, 'pad': self.pad} conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
input = np.random.random(self.input_size).astype("float32") input = np.random.random(self.input_size).astype("float32")
filter = np.random.random(self.filter_size).astype("float32") filter = np.random.random(self.filter_size).astype("float32")
output = conv2d_forward_naive(input, filter, self.groups, output = conv2d_forward_naive(input, filter, self.groups,
...@@ -80,12 +95,14 @@ class TestConv2dOp(OpTest): ...@@ -80,12 +95,14 @@ class TestConv2dOp(OpTest):
def init_test_case(self): def init_test_case(self):
self.pad = [0, 0] self.pad = [0, 0]
self.stride = [1, 1] self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0 assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3] self.filter_size = [6, f_c, 3, 3]
def init_dilation(self):
self.dilations = [1, 1]
def init_group(self): def init_group(self):
self.groups = 1 self.groups = 1
...@@ -93,32 +110,90 @@ class TestConv2dOp(OpTest): ...@@ -93,32 +110,90 @@ class TestConv2dOp(OpTest):
self.op_type = "conv2d" self.op_type = "conv2d"
class TestWithPad(TestConv2dOp):
    """Conv2d case: padding of 1 in both spatial dimensions, unit stride."""

    def init_test_case(self):
        """Set pad, stride and the input/filter sizes for this case."""
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        in_channels = self.input_size[1]
        # The input channels must divide evenly across the groups.
        assert np.mod(in_channels, self.groups) == 0
        channels_per_group = in_channels / self.groups
        self.filter_size = [6, channels_per_group, 3, 3]
class TestWithStride(TestConv2dOp):
    """Conv2d case: stride of 2 with padding of 1 on a 6x6 input."""

    def init_test_case(self):
        """Set pad, stride and the input/filter sizes for this case."""
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.input_size = [2, 3, 6, 6]  # NCHW
        in_channels = self.input_size[1]
        # The input channels must divide evenly across the groups.
        assert np.mod(in_channels, self.groups) == 0
        channels_per_group = in_channels / self.groups
        self.filter_size = [6, channels_per_group, 3, 3]
class TestWithGroup(TestConv2dOp): class TestWithGroup(TestConv2dOp):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
def init_op_type(self):
self.op_type = "conv2d"
class TestWith1x1(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 1, 1]
#----------------Conv2dCudnn---------------- def init_group(self):
self.groups = 3
class TestCudnn(TestConv2dOp): class TestWithDilation(TestConv2dOp):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 10, 10] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3]
def init_dilation(self):
self.dilations = [2, 2]
def init_group(self): def init_group(self):
self.groups = 1 self.groups = 3
#----------------Conv2dCudnn----------------
class TestCudnn(TestConv2dOp):
def init_op_type(self):
self.op_type = "conv_cudnn"
class TestCudnnWithPad(TestWithPad):
def init_op_type(self): def init_op_type(self):
self.op_type = "conv_cudnn" self.op_type = "conv_cudnn"
class TestCudnnWithGroup(TestConv2dOp): class TestCudnnWithStride(TestWithStride):
def init_group(self): def init_op_type(self):
self.groups = 3 self.op_type = "conv_cudnn"
class TestCudnnWithGroup(TestWithGroup):
def init_op_type(self): def init_op_type(self):
self.op_type = "conv_cudnn" self.op_type = "conv_cudnn"
class TestCudnnWith1x1(TestWith1x1):
def init_op_type(self):
self.op_type = "conv_cudnn"
# cudnn v5 does not support dilation conv.
# class TestCudnnWithDilation(TestWithDilation):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -4,9 +4,7 @@ from op_test import OpTest ...@@ -4,9 +4,7 @@ from op_test import OpTest
def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
# [2, 3, 5, 5]
in_n, in_c, in_h, in_w = input_.shape in_n, in_c, in_h, in_w = input_.shape
# [3, 6, 3, 3]
f_c, out_c, f_h, f_w = filter_.shape f_c, out_c, f_h, f_w = filter_.shape
assert in_c == f_c assert in_c == f_c
...@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): ...@@ -29,6 +27,7 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
j1, j2 = j * stride[0], j * stride[0] + f_w j1, j2 = j * stride[0], j * stride[0] + f_w
out[n, k, i1:i2, j1:j2] += tmp_out out[n, k, i1:i2, j1:j2] += tmp_out
out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
return out return out
...@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest): ...@@ -36,8 +35,6 @@ class TestConv2dTransposeOp(OpTest):
def setUp(self): def setUp(self):
# init as conv transpose # init as conv transpose
self.init_op_type() self.init_op_type()
# [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
self.init_test_case() self.init_test_case()
conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
...@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest): ...@@ -55,7 +52,6 @@ class TestConv2dTransposeOp(OpTest):
self.outputs = {'Output': output} self.outputs = {'Output': output}
def test_check_output(self): def test_check_output(self):
print 'check output here for', self.op_type
self.check_output() self.check_output()
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
...@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest): ...@@ -88,6 +84,26 @@ class TestConv2dTransposeOp(OpTest):
self.op_type = "conv2d_transpose" self.op_type = "conv2d_transpose"
class TestWithPad(TestConv2dTransposeOp):
    """Conv2d-transpose case: padding of 1, unit stride and dilation."""

    def init_test_case(self):
        """Set pad, stride, dilations and the input/filter sizes."""
        self.pad = [1, 1]
        self.stride = [1, 1]
        self.dilations = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        # The filter's first dimension equals the input channel count.
        in_channels = self.input_size[1]
        self.filter_size = [in_channels, 6, 3, 3]
class TestWithStride(TestConv2dTransposeOp):
    """Conv2d-transpose case: stride of 2 with padding of 1."""

    def init_test_case(self):
        """Set pad, stride, dilations and the input/filter sizes."""
        self.pad = [1, 1]
        self.stride = [2, 2]
        self.dilations = [1, 1]
        self.input_size = [2, 3, 5, 5]  # NCHW
        # The filter's first dimension equals the input channel count.
        in_channels = self.input_size[1]
        self.filter_size = [in_channels, 6, 3, 3]
# ------------ test_cudnn ------------ # ------------ test_cudnn ------------
class TestCudnn(TestConv2dTransposeOp): class TestCudnn(TestConv2dTransposeOp):
def init_op_type(self): def init_op_type(self):
......
...@@ -10,26 +10,39 @@ def conv3d_forward_naive(input, filter, group, conv_param): ...@@ -10,26 +10,39 @@ def conv3d_forward_naive(input, filter, group, conv_param):
assert np.mod(out_c, group) == 0 assert np.mod(out_c, group) == 0
sub_out_c = out_c / group sub_out_c = out_c / group
stride, pad = conv_param['stride'], conv_param['pad'] stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
out_d = 1 + (in_d + 2 * pad[0] - f_h) / stride[0] 'dilations']
out_h = 1 + (in_h + 2 * pad[1] - f_h) / stride[1]
out_w = 1 + (in_w + 2 * pad[2] - f_w) / stride[2] out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) / stride[0]
out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) / stride[1]
out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) / stride[2]
out = np.zeros((in_n, out_c, out_d, out_h, out_w)) out = np.zeros((in_n, out_c, out_d, out_h, out_w))
d_bolck_d = (dilation[0] * (f_d - 1) + 1)
d_bolck_h = (dilation[1] * (f_h - 1) + 1)
d_bolck_w = (dilation[2] * (f_w - 1) + 1)
input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ), input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
(pad[2], )), (pad[2], )),
mode='constant', mode='constant',
constant_values=0) constant_values=0)
filter_dilation = np.zeros((out_c, f_c, d_bolck_d, d_bolck_h, d_bolck_w))
filter_dilation[:, :, 0:d_bolck_d:dilation[0], 0:d_bolck_h:dilation[1], 0:
d_bolck_w:dilation[2]] = filter
for d in range(out_d): for d in range(out_d):
for i in range(out_h): for i in range(out_h):
for j in range(out_w): for j in range(out_w):
for g in range(group): for g in range(group):
input_pad_masked = \ input_pad_masked = \
input_pad[:, g * f_c:(g + 1) * f_c, input_pad[:, g * f_c:(g + 1) * f_c,
d * stride[0]:d * stride[0] + f_d, d * stride[0]:d * stride[0] + d_bolck_d,
i * stride[1]:i * stride[1] + f_h, i * stride[1]:i * stride[1] + d_bolck_h,
j * stride[2]:j * stride[2] + f_w] j * stride[2]:j * stride[2] + d_bolck_w]
f_sub = filter[g * sub_out_c:(g + 1) *
f_sub = filter_dilation[g * sub_out_c:(g + 1) *
sub_out_c, :, :, :, :] sub_out_c, :, :, :, :]
for k in range(sub_out_c): for k in range(sub_out_c):
out[:, g * sub_out_c + k, d, i, j] = \ out[:, g * sub_out_c + k, d, i, j] = \
...@@ -43,9 +56,14 @@ class TestConv3dOp(OpTest): ...@@ -43,9 +56,14 @@ class TestConv3dOp(OpTest):
def setUp(self): def setUp(self):
self.init_group() self.init_group()
self.init_op_type() self.init_op_type()
self.init_dilation()
self.init_test_case() self.init_test_case()
conv3d_param = {'stride': self.stride, 'pad': self.pad} conv3d_param = {
'stride': self.stride,
'pad': self.pad,
'dilations': self.dilations
}
input = np.random.random(self.input_size).astype("float32") input = np.random.random(self.input_size).astype("float32")
filter = np.random.random(self.filter_size).astype("float32") filter = np.random.random(self.filter_size).astype("float32")
output = conv3d_forward_naive(input, filter, self.groups, output = conv3d_forward_naive(input, filter, self.groups,
...@@ -55,7 +73,8 @@ class TestConv3dOp(OpTest): ...@@ -55,7 +73,8 @@ class TestConv3dOp(OpTest):
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
'paddings': self.pad, 'paddings': self.pad,
'groups': self.groups 'groups': self.groups,
'dilations': self.dilations
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
...@@ -88,6 +107,9 @@ class TestConv3dOp(OpTest): ...@@ -88,6 +107,9 @@ class TestConv3dOp(OpTest):
f_c = self.input_size[1] / self.groups f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3, 3] self.filter_size = [6, f_c, 3, 3, 3]
def init_dilation(self):
self.dilations = [1, 1, 1]
def init_group(self): def init_group(self):
self.groups = 1 self.groups = 1
...@@ -104,27 +126,47 @@ class TestCase1(TestConv3dOp): ...@@ -104,27 +126,47 @@ class TestCase1(TestConv3dOp):
f_c = self.input_size[1] / self.groups f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 3, 3, 3] self.filter_size = [6, f_c, 3, 3, 3]
def init_group(self):
self.groups = 1
def init_op_type(self): class TestWithGroup1(TestConv3dOp):
self.op_type = "conv3d" def init_group(self):
self.groups = 3
class TestWithGroup1(TestConv3dOp): class TestWithGroup2(TestCase1):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
def init_op_type(self):
self.op_type = "conv3d"
class TestWith1x1(TestConv3dOp):
def init_test_case(self):
self.pad = [0, 0, 0]
self.stride = [1, 1, 1]
self.input_size = [2, 3, 4, 4, 4] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 1, 1, 1]
def init_dilation(self):
self.dilations = [1, 1, 1]
class TestWithGroup2(TestCase1):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
def init_op_type(self):
self.op_type = "conv3d" class TestWithDilation(TestConv3dOp):
def init_test_case(self):
self.pad = [0, 0, 0]
self.stride = [1, 1, 1]
self.input_size = [2, 3, 6, 6, 6] # NCDHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] / self.groups
self.filter_size = [6, f_c, 2, 2, 2]
def init_dilation(self):
self.dilations = [2, 2, 2]
def init_group(self):
self.groups = 3
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -4,9 +4,7 @@ from op_test import OpTest ...@@ -4,9 +4,7 @@ from op_test import OpTest
def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
# [2, 3, 5, 5, 5]
in_n, in_c, in_d, in_h, in_w = input_.shape in_n, in_c, in_d, in_h, in_w = input_.shape
# [3, 6, 3, 3, 3]
f_c, out_c, f_d, f_h, f_w = filter_.shape f_c, out_c, f_d, f_h, f_w = filter_.shape
assert in_c == f_c assert in_c == f_c
...@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): ...@@ -14,7 +12,6 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
out_d = (in_d - 1) * stride[0] + f_d out_d = (in_d - 1) * stride[0] + f_d
out_h = (in_h - 1) * stride[1] + f_h out_h = (in_h - 1) * stride[1] + f_h
out_w = (in_w - 1) * stride[2] + f_w out_w = (in_w - 1) * stride[2] + f_w
out = np.zeros((in_n, out_c, out_d, out_h, out_w)) out = np.zeros((in_n, out_c, out_d, out_h, out_w))
for n in range(in_n): for n in range(in_n):
...@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): ...@@ -33,6 +30,8 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
j1, j2 = j * stride[2], j * stride[2] + f_w j1, j2 = j * stride[2], j * stride[2] + f_w
out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
pad[2]]
return out return out
...@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest): ...@@ -40,8 +39,6 @@ class TestConv3dTransposeOp(OpTest):
def setUp(self): def setUp(self):
# init as conv transpose # init as conv transpose
self.init_op_type() self.init_op_type()
# [2, 3, 5, 5, 5] -> kernel [3, 6, 3, 3, 3] -> output [2, 6, 7, 7, 7]
self.init_test_case() self.init_test_case()
conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
...@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest): ...@@ -49,7 +46,6 @@ class TestConv3dTransposeOp(OpTest):
filter_ = np.random.random(self.filter_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32")
output = conv3dtranspose_forward_naive( output = conv3dtranspose_forward_naive(
input_, filter_, conv3dtranspose_param).astype("float32") input_, filter_, conv3dtranspose_param).astype("float32")
# print 'deconv output py', output, output.shape
self.inputs = {'Input': input_, 'Filter': filter_} self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = { self.attrs = {
...@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest): ...@@ -60,7 +56,6 @@ class TestConv3dTransposeOp(OpTest):
self.outputs = {'Output': output} self.outputs = {'Output': output}
def test_check_output(self): def test_check_output(self):
print 'check output here'
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
...@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest): ...@@ -85,7 +80,7 @@ class TestConv3dTransposeOp(OpTest):
self.pad = [0, 0, 0] self.pad = [0, 0, 0]
self.stride = [1, 1, 1] self.stride = [1, 1, 1]
self.dilations = [1, 1, 1] self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCHW self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1] f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3] self.filter_size = [f_c, 6, 3, 3, 3]
...@@ -93,5 +88,25 @@ class TestConv3dTransposeOp(OpTest): ...@@ -93,5 +88,25 @@ class TestConv3dTransposeOp(OpTest):
self.op_type = "conv3d_transpose" self.op_type = "conv3d_transpose"
class TestWithPad(TestConv3dTransposeOp):
    """Conv3d-transpose case: padding of 1, unit stride and dilation."""

    def init_test_case(self):
        """Set pad, stride, dilations and the input/filter sizes."""
        self.pad = [1, 1, 1]
        self.stride = [1, 1, 1]
        self.dilations = [1, 1, 1]
        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
        # The filter's first dimension equals the input channel count.
        in_channels = self.input_size[1]
        self.filter_size = [in_channels, 6, 3, 3, 3]
class TestWithStride(TestConv3dTransposeOp):
    """Conv3d-transpose case: stride of 2 with padding of 1."""

    def init_test_case(self):
        """Set pad, stride, dilations and the input/filter sizes."""
        self.pad = [1, 1, 1]
        self.stride = [2, 2, 2]
        self.dilations = [1, 1, 1]
        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
        # The filter's first dimension equals the input channel count.
        in_channels = self.input_size[1]
        self.filter_size = [in_channels, 6, 3, 3, 3]
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
import unittest import unittest
from paddle.v2.fluid.framework import g_main_program from paddle.v2.fluid.framework import g_main_program
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.io as io
from paddle.v2.fluid.initializer import ConstantInitializer
import numpy as np
class TestParameter(unittest.TestCase): class TestParameter(unittest.TestCase):
def test_param(self): def test_param(self):
b = g_main_program.create_block() shape = [784, 100]
val = 1.0625
b = g_main_program.global_block()
param = b.create_parameter( param = b.create_parameter(
name='fc.w', name='fc.w',
shape=[784, 100], shape=shape,
dtype='float32', dtype='float32',
initialize_attr={ initializer=ConstantInitializer(val))
'type': 'uniform_random',
'seed': 13,
'min': -5.0,
'max': 5.0
})
self.assertIsNotNone(param) self.assertIsNotNone(param)
self.assertEqual('fc.w', param.name) self.assertEqual('fc.w', param.name)
self.assertEqual((784, 100), param.shape) self.assertEqual((784, 100), param.shape)
self.assertEqual(core.DataType.FP32, param.data_type) self.assertEqual(core.DataType.FP32, param.data_type)
self.assertEqual(0, param.block.idx) self.assertEqual(0, param.block.idx)
exe = Executor(core.CPUPlace())
p = exe.run(g_main_program, fetch_list=[param])[0]
self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -2,6 +2,7 @@ import unittest ...@@ -2,6 +2,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.backward import append_backward_ops
import numpy import numpy
...@@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase): ...@@ -16,7 +17,7 @@ class TestWhileOp(unittest.TestCase):
i = layers.zeros(shape=[1], dtype='int64') i = layers.zeros(shape=[1], dtype='int64')
i.stop_gradient = True i.stop_gradient = True
init = layers.zeros(shape=[10], dtype='float32') init = layers.zeros(shape=[10], dtype='float32')
mem_array = layers.array_write(init, i=i) mem_array = layers.array_write(x=init, i=i)
data_array = layers.array_write(x=d0, i=i) data_array = layers.array_write(x=d0, i=i)
i = layers.increment(i) i = layers.increment(i)
...@@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase): ...@@ -29,17 +30,23 @@ class TestWhileOp(unittest.TestCase):
i.stop_gradient = True i.stop_gradient = True
array_len = layers.fill_constant(shape=[1], dtype='int64', value=3) array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
array_len.stop_gradient = True
cond = layers.less_than(x=i, y=array_len) cond = layers.less_than(x=i, y=array_len)
while_op = layers.While(cond=cond) while_op = layers.While(cond=cond)
with while_op.block(): with while_op.block():
d = layers.array_read(array=data_array, i=i) d = layers.array_read(array=data_array, i=i)
prev = layers.array_read(array=mem_array, i=i) prev = layers.array_read(array=mem_array, i=i)
i = layers.increment(x=i, in_place=True)
result = layers.sums(input=[d, prev]) result = layers.sums(input=[d, prev])
i = layers.increment(x=i, in_place=True)
layers.array_write(result, i=i, array=mem_array) layers.array_write(result, i=i, array=mem_array)
layers.less_than(x=i, y=array_len, cond=cond) layers.less_than(x=i, y=array_len, cond=cond)
sum_result = layers.array_read(mem_array, i=array_len)
sum_result = layers.array_read(array=mem_array, i=i)
loss = layers.mean(x=sum_result)
append_backward_ops(loss)
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
......
import unittest
import numpy as np
from paddle.v2.framework.op import Operator
import paddle.v2.framework.core as core
def create_tensor(scope, name, np_data):
    """Create variable `name` in `scope` and load `np_data` into its tensor.

    The tensor's dims are set to the array's shape before the data is set
    with a CPU place. The tensor object is returned.
    """
    variable = scope.var(name)
    cpu_tensor = variable.get_tensor()
    cpu_tensor.set_dims(np_data.shape)
    cpu_tensor.set(np_data, core.CPUPlace())
    return cpu_tensor
class TestIsEmptyOp(unittest.TestCase):
    """Checks the is_empty operator on one populated and one emptied tensor."""

    def setUp(self):
        """Create inputs X0 (three elements) and X1 (dims reset to [0]), plus 'out'."""
        self.scope = core.Scope()

        # X0 keeps its three elements.
        create_tensor(self.scope, "X0", np.array([0, 1, 2]))

        # X1 is created with one element, then its dims are set to [0].
        emptied = create_tensor(self.scope, "X1", np.array([1]))
        emptied.set_dims([0])

        # The output variable only needs to exist before the op runs.
        self.scope.var("out")

    def test_no_empty(self):
        """X0 is expected to be reported as not empty."""
        self.one_case("X0", False)

    def test_empty(self):
        """X1 is expected to be reported as empty."""
        self.one_case("X1", True)

    def one_case(self, input, target):
        """Run is_empty on variable `input` and compare out[0] with `target`."""
        is_empty_op = Operator(type="is_empty", X=input, Out="out")
        cpu_ctx = core.DeviceContext.create(core.CPUPlace())
        is_empty_op.run(self.scope, cpu_ctx)
        result = np.array(self.scope.var("out").get_tensor())
        self.assertEqual(result[0], target)
# Run the tests in this file when it is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import errno
import uuid
import paddle.v2.master
# Names exported by `from <module> import *`.
__all__ = ["save_model", "load_model"]

# Random identifier generated once at import time. save_model passes it to the
# master client and uses it as the sub-directory name for the saved model.
trainer_id = str(uuid.uuid4())
def mkdir_p(path):
    """Create directory `path` and any missing parents, like `mkdir -p`.

    An already existing directory is not an error. Any other OSError,
    including `path` existing as a non-directory, is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = (err.errno == errno.EEXIST and
                               os.path.isdir(path))
        if not already_a_directory:
            raise
def save_model(parameters, path):
    """Write `parameters` to disk as a tar archive.

    When the KUBERNETES_SERVICE_HOST environment variable is present, the
    master service (reached through etcd at $MASTER_IP:2379) is first asked
    whether this trainer should save. A zero or negative answer returns
    without writing anything; otherwise the archive is written to
    `<path>/<trainer_id>/model.tar`. Without that environment variable,
    `path` itself is used as the archive file name.

    Args:
        parameters: object providing `to_tar(file_object)`.
        path: base directory (Kubernetes case) or archive file path.

    Raises:
        Exception: if running under Kubernetes and MASTER_IP is not set.
        OSError: if the parent directory cannot be created.
    """
    need_request = "KUBERNETES_SERVICE_HOST" in os.environ

    if need_request:
        # TODO(helin): figure out how MPI trains, since MPI only save
        # model when trainer_id == "0", we can consolidate the logic
        # here.

        # TODO(helin): change this environment variable name from
        # MASTER_IP to ETCD_IP
        etcd_name = "MASTER_IP"
        if etcd_name not in os.environ:
            raise Exception('not find ' + etcd_name +
                            ' in environment variable.')

        etcd_ip = os.environ.get(etcd_name)
        client = paddle.v2.master.client("http://" + etcd_ip + ":2379", 5, 0)
        r = client.request_save_model(trainer_id, 5000)
        if r == 0:
            # do not need to save
            return
        elif r < 0:
            # error
            # NOTE(review): the failure is dropped silently here; confirm
            # whether callers should be told about it.
            return
        else:
            # save model
            path = os.path.join(path, trainer_id, "model.tar")

    # Create only the directory that will contain the archive. Creating
    # `path` itself would leave a directory where the file must go and make
    # the open() below fail.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        mkdir_p(parent_dir)

    with open(path, 'wb') as f:
        parameters.to_tar(f)
def load_model(parameters, path):
    """Read the tar archive at `path` into `parameters`.

    The file is opened in binary mode and handed to
    `parameters.from_tar`; it is closed even if loading raises.
    """
    tar_file = open(path, 'rb')
    try:
        parameters.from_tar(tar_file)
    finally:
        tar_file.close()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册