Commit 770aff2c authored by Yibing Liu

Merge the update in profiling tool

...@@ -93,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz ...@@ -93,6 +93,15 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | | MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | | MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
- Alexnet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | | | | | |
| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
chart TBD
### Laptop ### Laptop
TBD TBD
...@@ -19,7 +19,11 @@ args = { ...@@ -19,7 +19,11 @@ args = {
'num_samples': num_samples 'num_samples': num_samples
} }
define_py_data_sources2( define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args) "train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings( settings(
batch_size=batch_size, batch_size=batch_size,
......
...@@ -8,15 +8,19 @@ function clock_to_seconds() { ...@@ -8,15 +8,19 @@ function clock_to_seconds() {
} }
function infer() { function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1 topology=$1
layer_num=$2 layer_num=$2
bs=$3 bs=$3
thread=`nproc` trainers=`nproc`
if [ $thread -gt $bs ]; then if [ $trainers -gt $bs ]; then
thread=$bs trainers=$bs
fi fi
log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log" log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
threads=$((`nproc` / trainers))
if [ $threads -eq 0 ]; then
threads=1
fi
export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/" models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then if [ ! -d $models_in ]; then
...@@ -28,7 +32,7 @@ function infer() { ...@@ -28,7 +32,7 @@ function infer() {
--config="${topology}.py" \ --config="${topology}.py" \
--use_mkldnn=False \ --use_mkldnn=False \
--use_gpu=False \ --use_gpu=False \
--trainer_count=$thread \ --trainer_count=$trainers \
--log_period=$log_period \ --log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \ --init_model_path=$models_in \
......
set -e set -e
function train() { function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY export OPENBLAS_NUM_THREADS=1
topology=$1 topology=$1
layer_num=$2 layer_num=$2
bs=$3 bs=$3
......
...@@ -252,6 +252,11 @@ first_seq ...@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq .. autoclass:: paddle.v2.layer.first_seq
:noindex: :noindex:
sub_seq
---------
.. autoclass:: paddle.v2.layer.sub_seq
:noindex:
concat concat
------ ------
.. autoclass:: paddle.v2.layer.concat .. autoclass:: paddle.v2.layer.concat
......
...@@ -68,12 +68,6 @@ scale ...@@ -68,12 +68,6 @@ scale
:noindex: :noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
transpose transpose
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.transpose .. autofunction:: paddle.v2.fluid.layers.transpose
......
# Backward Building
## Motivation
In neural networks, most models are currently trained with the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function and then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So the framework needs a mechanism that can complete the model's backward part automatically according to the given forward part.
When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients flow from the end to the beginning of the model, in other words, from the loss to the parameters.
## Challenges
The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested, which means that `op`s and `variable`s are scattered across different blocks rather than gathered in a single graph. Our backward building algorithm shall visit blocks recursively and be able to insert `grad_op`s and newly created `variable`s into the right places.
## Usage
Although the whole algorithm is composed of many functions, only one is exposed as the API:
```python
def append_backward(loss, parameter_list=None, no_grad_set=None):
"""
Append backward part to main_program
Args:
loss(Variable): The variable generated by the cost function.
parameter_list(list): Parameters that need to be updated by optimizers.
If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0.
If None, the set will be generated inside the function and
contains all variables with `stop_gradient=True` from all blocks.
Return:
(list[Variable]): list of (parameters, gradients) pairs.
"""
```
By invoking this API, the framework appends the backward part to the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar, is usually the output of the loss layer, and is where the gradient is generated and backpropagation starts. `parameter_list` marks all the parameters that need updating. If it is `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
This API is invoked automatically before optimizer building.
As a result, in most cases, users do not need to invoke the API by themselves to append the backward part.
## Implementation
The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
### Creating `grad_op`s
The creation of `grad_op`s is implemented by:
```python
def _append_backward_ops_(target,
block,
target_block,
no_grad_dict,
grad_to_var):
"""
Create all grad ops, and insert them into given block
Args:
target(Variable): the target variable of forward pass
block(Block): the block where forward ops are
target_block(Block): the block which is going to hold newly generated grad ops
no_grad_dict(dict):
key(int) block index
val(set) a set of variable names. These variables have no gradient
grad_to_var(dict)(output argument):
key(str): grad variable name
val(str): corresponding forward variable name
"""
```
Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation has to be recursive.
During the reverse traversal, we check whether each `op` has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in the `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in the `op`'s attribute to `block`. The *pseudo-code* shows this process:
```
******* pseudo-code ********
for op in reversed(block.ops):
if op has an attribute named 'sub_block':
Get the sub-block(`s_block`) from op's attribute.
Create a new block(`grad_s_block`), whose father is `s_block`.
Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
Invoke `core.get_grad_op_desc()` to get op's grad_op.
Insert the name correspondence between the grad_op's variables and their gradients into grad_to_var
Assign grad_s_block to grad_op as its 'sub_block' attribute.
Append grad_op to current target_block.
```
The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which both parameters `block` and `target_block` are assigned the root block (the block with index 0).
### Corner Cases of `grad_op` Creating
In the previous section, we showed the regular process of `grad_op` creating. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
#### Shared Variables
If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the final gradient the sum of all the `grad_op`s' outputs instead of that of the last one to run, we assign each output to a temporary variable and then add a `sum_op` to add them up.
For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`, and so on.
See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
#### No Gradient Variables
In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
Another situation is that all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all the gradient inputs are zeros, so such a `grad_op` can also be skipped.
It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs risk using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them to all zeros.
These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes those that can be skipped, and inserts `fill_zeros_like_op` when necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict`, or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
### Creating Backward Variables
Up to now, we have completed all the creation and adjustment of `grad_op`s. However, the backward variables themselves have not been created yet; they are only referenced by the `grad_op`s' input and output arguments. The backward variable creation job will be done by:
```python
def _append_backward_vars_(block,
start_op_idx,
grad_to_var,
grad_info_map):
"""
Create new variables required by backward pass.
Args:
block(Block): the block where new variables will be created
start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
grad_to_var(dict):
key(str): grad variable name
val(str): corresponding forward variable name
In most cases, this dict is generated by _append_backward_ops_()
grad_info_map(dict)(output argument):
key(str): forward variable name
val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
"""
```
Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the `grad_op` sequence starts) and creates all of their not-yet-created outputs. The *pseudo-code* shows this process:
```
for op in block.ops[start_op_idx : ]:
if op has an attribute named 'sub_block':
Get the sub-block(`s_block`) from op's attribute.
Invoke _append_backward_vars_(), with `block=s_block`
for var_name in op.all_output_names():
if block.has_var_recursive(var_name) or var_name is the name of empty variable:
continue
create a new variable named 'var_name' in block
if grad_to_var.has_key(var_name):
set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
do op's var type inference
do op's shape inference
```
...@@ -79,7 +79,7 @@ class Optimizer(object): ...@@ -79,7 +79,7 @@ class Optimizer(object):
def minimize(self, loss, parameter_list): def minimize(self, loss, parameter_list):
"""Add operations to minimize `loss` by updating `parameter_list`. """Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward_ops()` and This method combines interface `append_backward()` and
`create_optimization_pass()` into one. `create_optimization_pass()` into one.
""" """
params_grads = self.create_backward_pass(loss, parameter_list) params_grads = self.create_backward_pass(loss, parameter_list)
......
## Introduction
There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Like most of them, PaddlePaddle uses C++, CUDA and Python as its basic programming languages and runs on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyze CUDA programs. We also have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) that uses [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python parts of a program. But in [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit, and developers usually want to collect the time of each operator to locate bottlenecks. `nvprof` collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets, CUDA API calls, and events or metrics for CUDA kernels, while `yep` and Google's perftools cannot collect the timeline of a CUDA program. None of these tools can collect time at the operator level, so we designed this profiling tool.
## Architecture
The workflow for most tasks is as follows. Each operator runs many times over all the iterations, so the profiler must accumulate the total time of each operator across iterations. Moreover, developers sometimes want to collect more detailed time spans inside an operator, or record time spans elsewhere, which requires the profiler to support nested time spans. In addition, to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline of each thread. The profiler itself also occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
```python
for i in xrange(M): # M is the iteration number
for op in operator_lists: # The `operator_lists` contains all the operators in the network.
op.run();
```
In summary, the profiler should have the following features:
- record time spans in loops.
- support nested time spans.
- support multiple threads/multiple GPUs.
- support being enabled and disabled by users.
But how do we record the time for a mixed C++ and CUDA program? There are many C++ APIs to get the current wall-clock time in the host program. But on the GPU, CUDA kernels may execute concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program unless there is an explicit synchronization after them. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, and then summarize and present statistics based on these events.
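As a minimal, self-contained sketch of this timing primitive, the following program measures a piece of device work with plain CUDA runtime events; it is generic CUDA usage rather than the profiler's own code, and the buffer size and the `cudaMemsetAsync` workload are arbitrary choices for illustration.
```c++
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  float* d_buf = nullptr;
  cudaMalloc(&d_buf, 1 << 20);  // 1 MB device buffer, just something to work on

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, 0);              // mark the start on the default stream
  cudaMemsetAsync(d_buf, 0, 1 << 20, 0);  // asynchronous device work to be timed
  cudaEventRecord(stop, 0);               // mark the end on the same stream
  cudaEventSynchronize(stop);             // block the host until `stop` completes

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);  // elapsed device time between the two events
  std::printf("device work took %.3f ms\n", ms);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(d_buf);
  return 0;
}
```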
The overall flow is shown in the following figure.
<img src="./images/profiler.png" align="center"/><br/>
### Event
In the above workflow, a pair of events is needed before and after a piece of code to collect its time, so each event has a flag to mark whether it is a starting or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify the start or end of profiling. So there are three kinds of events:
```c++
enum EventKind {
  kMark,
  kPushRange,
  kPopRange
};
```
- kMark: only a marker, without a time range.
- kPushRange: marks the starting event of a time range.
- kPopRange: marks the ending event of a time range.
For CPU code, the events only need to record the current time. For CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. Since many pieces of code may be profiled, event lists are used to record each piece.
```c++
class Event {
public:
// The DeviceContext is used to get current CUDA stream.
Event(EventKind kind, std::string name, uint32_t thread_id,
const platform::DeviceContext* dev_ctx = nullptr);
double CpuElapsedUs(const Event& e) const;
double CudaElapsedUs(const Event& e) const;
private:
EventKind kind_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_ = nullptr;
int device_ = -1;
#endif
};
struct EventList {
std::forward_list<std::vector<Event>> event_blocks;
};
```
As mentioned above, there is no need to record the timeline when the profiler is disabled, so a global state is used to enable or disable the profiler.
```c++
enum ProfilerState {
kDisabled,
kCPU,
kCUDA
};
ProfilerState g_state;
```
- kDisabled: the disabled state.
- kCPU: CPU profiling state.
- kCUDA: GPU profiling state.
A pair of starting and ending events is pushed to the event lists in the constructor and destructor of `RecordEvent`, so the timeline is recorded for the code within the lifetime of a `RecordEvent` object.
```c++
struct RecordEvent {
explicit RecordEvent(const std::string name,
platform::DeviceContext* dev_ctx = nullptr) {
if (g_state == ProfilerState::kDisabled) return;
// push the starting event to the event lists.
}
~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
// push the ending event to the event lists.
}
};
```
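As a rough, self-contained analogue of this RAII pattern, the sketch below times nested CPU scopes with `std::chrono`; the `ScopedTimer` class and its printed output are hypothetical stand-ins rather than the framework's `RecordEvent`, but the way nested constructor/destructor pairs delimit nested time spans is the same idea described above.
```c++
#include <chrono>
#include <cstdio>
#include <string>

// Hypothetical stand-in for RecordEvent: the constructor marks the start of a
// span and the destructor marks its end, so nesting scopes nests the spans.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedTimer() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::printf("%s took %lld us\n", name_.c_str(), static_cast<long long>(us));
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  ScopedTimer whole_op("fc_op");        // outer span, popped last
  {
    ScopedTimer inner("fc_op/matmul");  // nested span, popped first
    volatile double acc = 0;
    for (int i = 0; i < 1000000; ++i) acc = acc + i * 0.5;  // some work to time
  }
  return 0;
}
```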
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像 获取PaddlePaddle的Docker镜像
------------------------------ ------------------------------
执行下面的命令获取最新的PaddlePaddle Docker镜像 执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:
.. code-block:: bash .. code-block:: bash
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
docker pull docker.paddlepaddle.org/paddle docker pull docker.paddlepaddle.org/paddle
下载GPU版本的Docker镜像: 下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像:
.. code-block:: bash .. code-block:: bash
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
.. _docker_run: .. _docker_run:
在Docker中执行PaddlePaddle训练程序 在Docker中执行PaddlePaddle训练程序
------------------------------ ----------------------------------
假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ `PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
...@@ -82,7 +82,7 @@ ...@@ -82,7 +82,7 @@
.. _docker_run_book: .. _docker_run_book:
使用Docker启动PaddlePaddle Book教程 使用Docker启动PaddlePaddle Book教程
------------------------------ -----------------------------------
使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
......
...@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps. ...@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image Pull PaddlePaddle Docker Image
------------------------------ ------------------------------
Run the following command to download the latest Docker images: Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
.. code-block:: bash .. code-block:: bash
...@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror: ...@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
docker pull docker.paddlepaddle.org/paddle docker pull docker.paddlepaddle.org/paddle
Download GPU version images: Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
.. code-block:: bash .. code-block:: bash
...@@ -58,7 +58,7 @@ and run: ...@@ -58,7 +58,7 @@ and run:
.. _docker_run: .. _docker_run:
Launch your training program in Docker Launch your training program in Docker
------------------------------ --------------------------------------
Assume that you have already written a PaddlePaddle program Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to named :code:`train.py` under directory :code:`/home/work` (refer to
......
...@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具 ...@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------ ------------------------------
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas
.. code-block:: bash .. code-block:: bash
pip install paddlepaddle pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行: 如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash .. code-block:: bash
......
...@@ -12,14 +12,14 @@ Install Using pip ...@@ -12,14 +12,14 @@ Install Using pip
------------------------------ ------------------------------
Run the following command to install PaddlePaddle on the current Run the following command to install PaddlePaddle on the current
machine, it will also download requirements. machine, it will also download requirements, the version is cpu_avx_openblas.
.. code-block:: bash .. code-block:: bash
pip install paddlepaddle pip install paddlepaddle
If you wish to install GPU version, just run: If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
.. code-block:: bash .. code-block:: bash
......
...@@ -7,13 +7,13 @@ ...@@ -7,13 +7,13 @@
++++++++ ++++++++
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。 PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装: 执行下面的命令完成快速安装,版本为cpu_avx_openblas
.. code-block:: bash .. code-block:: bash
pip install paddlepaddle pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行: 如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash .. code-block:: bash
......
...@@ -8,13 +8,13 @@ Quick Install ...@@ -8,13 +8,13 @@ Quick Install
You can use pip to install PaddlePaddle with a single command, supports You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed. CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install: Simply run the following command to install, the version is cpu_avx_openblas:
.. code-block:: bash .. code-block:: bash
pip install paddlepaddle pip install paddlepaddle
If you need to install GPU version, run: If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash .. code-block:: bash
......
...@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) ...@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) if (WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif ()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) if (WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
...@@ -21,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc) ...@@ -21,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc DEPS glog) cc_library(scope SRCS scope.cc DEPS glog)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto) cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context) cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto)
...@@ -37,7 +45,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) ...@@ -37,7 +45,7 @@ cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto) py_proto_compile(framework_py_proto SRCS framework.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module. # Generate an empty __init__.py to make framework_py_proto as a valid python module.
......
# Operator/expression's Backward
## Motivation
In neural networks, most models are currently trained with the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function and then propagates it back through the network following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass.
## Implementation
In this design doc, we export only one API for generating the backward pass.
```c++
std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
```
The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
### Backward Operator Registry
A backward network is built up with several backward operators. Backward operators take the forward operators' inputs, outputs, and output gradients, and then calculate the gradients of the forward operators' inputs.
| | forward operator | backward operator |
| ---------------------- | ---------------- |------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients |
In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded in a global hash map (`OpInfoMap`). To follow the philosophy of a minimum core and to make operators pluggable, the registry mechanism is introduced.
For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro:
```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
```
`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
### Backward Operator Creating
Given a certain forward operator, we can get its corresponding backward operator by calling:
```cpp
OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
```
The function `BuildGradOp` sequentially executes the following steps (a toy sketch of steps 2 and 3 is given after the list):
1. Get the `type_` of the given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` into map `inputs`, except those that are not necessary for gradient computing.
3. Add the forward inputs' gradient variables into map `outputs`, and add the forward outputs' gradient variables into map `inputs`.
4. Build the backward operator with `inputs`, `outputs` and the forward operator's attributes.
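As a toy, self-contained illustration of steps 2 and 3 for a `mul`-like operator, the sketch below derives a backward operator's input/output name maps from the forward ones; the `VarMap` alias, the variable names and the `@GRAD` suffix handling here are simplified assumptions for illustration, while the real logic lives in `BuildGradOp` in the C++ core.
```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// A name map like Operator::inputs_/outputs_: slot name -> variable names.
using VarMap = std::map<std::string, std::vector<std::string>>;

const char kGradSuffix[] = "@GRAD";

int main() {
  // Forward mul-like op: Out = X * Y, with concrete variable names.
  VarMap fwd_inputs = {{"X", {"x"}}, {"Y", {"w"}}};
  VarMap fwd_outputs = {{"Out", {"out"}}};

  // Step 2: the backward op keeps the forward inputs and outputs ...
  VarMap bwd_inputs = fwd_inputs;
  for (const auto& kv : fwd_outputs) {
    bwd_inputs[kv.first] = kv.second;
    // ... plus the gradients of the forward outputs as extra inputs.
    std::vector<std::string> grads;
    for (const auto& name : kv.second) grads.push_back(name + kGradSuffix);
    bwd_inputs[kv.first + kGradSuffix] = grads;
  }

  // Step 3: the backward op outputs the gradients of the forward inputs.
  VarMap bwd_outputs;
  for (const auto& kv : fwd_inputs) {
    std::vector<std::string> grads;
    for (const auto& name : kv.second) grads.push_back(name + kGradSuffix);
    bwd_outputs[kv.first + kGradSuffix] = grads;
  }

  for (const auto& kv : bwd_inputs)
    for (const auto& name : kv.second)
      std::printf("mul_grad input  slot %s -> %s\n", kv.first.c_str(), name.c_str());
  for (const auto& kv : bwd_outputs)
    for (const auto& name : kv.second)
      std::printf("mul_grad output slot %s -> %s\n", kv.first.c_str(), name.c_str());
  return 0;
}
```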
### Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is to create backward operators in the reverse order of the forward operators and append them together one by one. There are some corner cases that need special processing.
1. Op
When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in the no-gradient set, then return a special `NOP` instead.
2. NetOp
In our design, the network itself is also a kind of operator (**NetOp**), so the operators contained in a big network may themselves be smaller networks. When the input forward network is a NetOp, we need to call the backward function of its sub NetOps/Operators recursively. During this process, we need to collect the `OutputGradients` names according to the forward NetOp.
3. RnnOp
RnnOp is a nested stepnet operator. The backward module needs to recursively call `Backward` for every stepnet.
4. Sharing Variables
As illustrated in Figure 1 and Figure 2, two operators share the same gradient variable name **W@GRAD**, so the later one will overwrite the shared variable.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
Figure 1. Sharing variables in operators.
</p>
Sharing a variable between operators, or using the same input variable in multiple operators, can lead to duplicate gradient variables. As illustrated in Figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting.
<p align="center">
<img src="images/duplicate_op2.png" width="40%" ><br/>
Figure 2. Replace the shared variable's gradient with an `Add` operator.
</p>
Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent their positions in the clockwise direction.
5. Part of the Gradient is Zero.
In the whole graph, there are cases where one operator's gradient is not needed, but the gradient of its input is a dependency of another operator. In such a position we need to fill in a gradient matrix of the same shape, so in our implementation, we insert a special `fillZeroLike` operator.
Following these rules, we then collect the sub-graph's `OutputGradients`/`InputGradients` as the NetOp's and return it.
...@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <functional>
#include "paddle/framework/data_transform.h" #include "paddle/framework/data_transform.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -22,5 +25,111 @@ DataTransformFnMap& DataTransformFnMap::Instance() { ...@@ -22,5 +25,111 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
return data_transform_map; return data_transform_map;
} }
auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNCHW, LibraryType::kPlain);
void TransDataType(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
"TransDataType Only Support DataType transform on same place!");
auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>();
auto dims = src.dims();
dst->Resize(dims);
auto dst_type = kernel_pair.second.data_type_;
auto src_type = kernel_pair.first.data_type_;
switch (src_type) {
case proto::DataType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(src, dst, ctx));
break;
case proto::DataType::FP64:
framework::VisitDataType(dst_type, CastDataType<double>(src, dst, ctx));
break;
case proto::DataType::INT32:
framework::VisitDataType(dst_type, CastDataType<int>(src, dst, ctx));
break;
case proto::DataType::INT64:
framework::VisitDataType(dst_type, CastDataType<int64_t>(src, dst, ctx));
break;
case proto::DataType::BOOL:
framework::VisitDataType(dst_type, CastDataType<bool>(src, dst, ctx));
break;
default:
PADDLE_THROW("Not support type %d", src_type);
}
}
void TransDataLayout(const std::vector<int>& axis,
const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
"TransDataLayout only support DataLayout transform on same place!");
PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
"TransDataLayout only support Datatype are same!");
auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>();
PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
auto place = kernel_pair.second.place_;
CopyFrom(src, place, *ctx, dst);
auto src_dim = src.dims();
std::vector<int64_t> dst_dim;
dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]];
}
dst->Resize(make_ddim(dst_dim));
auto src_type = kernel_pair.first.data_type_;
framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
dst->set_layout(kernel_pair.second.data_layout_);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
namespace f = paddle::framework;
namespace {
std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
}
REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
std::bind(f::TransDataLayout, NHWC2NCHW,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
std::bind(f::TransDataLayout, NCHW2NHWC,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
...@@ -21,17 +21,20 @@ limitations under the License. */ ...@@ -21,17 +21,20 @@ limitations under the License. */
#include "paddle/framework/op_kernel_type.h" #include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/variable.h" #include "paddle/framework/variable.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
#include "paddle/platform/transform.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
using DataTransformFN =
std::function<void(const std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out)>;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>; using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
using DataTransformFn =
std::function<void(const platform::DeviceContext*, const KernelTypePair&,
const Variable&, Variable*)>;
struct KernelTypePairHash { struct KernelTypePairHash {
static void HashCombine(const OpKernelType& t, std::size_t* seed) { static void HashCombine(const OpKernelType& t, std::size_t* seed) {
OpKernelType::Hash kernel_type_hasher; OpKernelType::Hash kernel_type_hasher;
...@@ -46,8 +49,69 @@ struct KernelTypePairHash { ...@@ -46,8 +49,69 @@ struct KernelTypePairHash {
} }
}; };
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const {
return static_cast<OutType>(in);
}
};
template <typename InType>
struct CastDataType {
CastDataType(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx)
: in_(in), out_(out), ctx_(ctx) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
template <typename OutType>
void operator()() {
auto place = ctx_->GetPlace();
auto* in_begin = in_.data<InType>();
auto numel = in_.numel();
auto* in_end = in_begin + numel;
auto* out_begin = out_->mutable_data<OutType>(place);
if (platform::is_cpu_place(place)) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
} else {
// TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!");
}
}
};
struct CastDataLayout {
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
const std::vector<int> axis_;
template <typename T>
void operator()() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans4(*context, in_, out_, axis_);
} else {
PADDLE_THROW("Unsupport CPU <-> GPU!");
}
}
};
using DataTransformMap = using DataTransformMap =
std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>; std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
class DataTransformFnMap { class DataTransformFnMap {
public: public:
...@@ -58,25 +122,25 @@ class DataTransformFnMap { ...@@ -58,25 +122,25 @@ class DataTransformFnMap {
} }
void Insert(const OpKernelType& left, const OpKernelType& right, void Insert(const OpKernelType& left, const OpKernelType& right,
const DataTransformFN& data_tranform_fn) { const DataTransformFn& data_tranform_fn) {
Insert(std::make_pair(left, right), data_tranform_fn); Insert(std::make_pair(left, right), data_tranform_fn);
} }
void Insert(const KernelTypePair& kernel_type_pair, void Insert(const KernelTypePair& kernel_type_pair,
const DataTransformFN& data_tranform_fn) { const DataTransformFn& data_tranform_fn) {
PADDLE_ENFORCE(!Has(kernel_type_pair), PADDLE_ENFORCE(!Has(kernel_type_pair),
"KernelTypePair %s has been registered", ""); "KernelTypePair %s has been registered", "");
map_.insert({kernel_type_pair, data_tranform_fn}); map_.insert({kernel_type_pair, data_tranform_fn});
} }
const DataTransformFN& Get(const KernelTypePair& key_pair) const { const DataTransformFn& Get(const KernelTypePair& key_pair) const {
auto data_transformer = GetNullable(key_pair); auto data_transformer = GetNullable(key_pair);
PADDLE_ENFORCE_NOT_NULL(data_transformer, PADDLE_ENFORCE_NOT_NULL(data_transformer,
"DataTransformFN should not be NULL"); "DataTransformFn should not be NULL");
return *data_transformer; return *data_transformer;
} }
const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const { const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
auto it = map_.find(key_pair); auto it = map_.find(key_pair);
if (it == map_.end()) { if (it == map_.end()) {
return nullptr; return nullptr;
......
...@@ -11,36 +11,67 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,36 +11,67 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <array>
#include <vector>
#include "paddle/framework/data_transform.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/framework/data_transform.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
using namespace platform; using namespace platform;
/**
* @brief cross validation of different kernel type transform
* We use four bit map represent different combination.
* If the field has multiple possible value, only choose two of them.
* For DataType, only test the FP32(float), FP64(double).
* e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
* 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
*/
std::array<proto::DataType, 2> kDataType = {
{proto::DataType::FP32, proto::DataType::FP64}};
std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
std::array<DataLayout, 2> kDataLayout = {{
DataLayout::kNHWC, DataLayout::kNCHW,
}};
std::array<LibraryType, 2> kLibraryType = {{
LibraryType::kPlain, LibraryType::kMKLDNN,
}};
OpKernelType GenFromBit(const std::vector<bool> bits) {
return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
kLibraryType[bits[3]]);
}
int test_value = 0; int test_value = 0;
OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW, auto kernel0 = GenFromBit({0, 0, 0, 0});
LibraryType::kCUDNN); auto kernel1 = GenFromBit({0, 0, 0, 1});
OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0), auto kernel2 = GenFromBit({0, 0, 1, 0});
DataLayout::kNCHW, LibraryType::kCUDNN); auto kernel3 = GenFromBit({0, 0, 1, 1});
OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
DataLayout::kNCHW, LibraryType::kCUDNN);
void type1_to_type2(std::vector<platform::DeviceContext*> ctx, void TransDataType_t(const platform::DeviceContext* ctx,
const Variable& in, Variable* out) { const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value++; test_value++;
} }
void type2_to_type3(std::vector<platform::DeviceContext*> ctx, void TransDataLayout_t(const platform::DeviceContext* ctx,
const Variable& in, Variable* out) { const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value--; test_value--;
} }
void type1_to_type3(std::vector<platform::DeviceContext*> ctx, void TransLibraryType_t(const platform::DeviceContext* ctx,
const Variable& in, Variable* out) { const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value += 2; test_value += 2;
} }
...@@ -49,30 +80,89 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx, ...@@ -49,30 +80,89 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
namespace frw = paddle::framework; namespace frw = paddle::framework;
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2, REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
frw::type1_to_type2); REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3, REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);
frw::type2_to_type3);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
frw::type1_to_type3);
TEST(DataTransform, Register) { TEST(DataTransform, Register) {
using namespace paddle::framework; using namespace paddle::framework;
using namespace paddle::platform; using namespace paddle::platform;
auto& instance = DataTransformFnMap::Instance(); auto& instance = DataTransformFnMap::Instance();
ASSERT_EQ(instance.Map().size(), 3UL);
std::vector<DeviceContext*> ctx;
paddle::framework::Variable in; paddle::framework::Variable in;
paddle::framework::Variable out; paddle::framework::Variable out;
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in, DeviceContext* ctx = new CPUDeviceContext();
&out); auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
instance.Get(pair0)(ctx, pair0, in, &out);
ASSERT_EQ(test_value, 1); ASSERT_EQ(test_value, 1);
instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
&out); auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
instance.Get(pair1)(ctx, pair1, in, &out);
ASSERT_EQ(test_value, 0); ASSERT_EQ(test_value, 0);
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
&out); auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
instance.Get(pair3)(ctx, pair3, in, &out);
ASSERT_EQ(test_value, 2); ASSERT_EQ(test_value, 2);
} }
TEST(DataTransform, DataLayout) {
using namespace paddle::framework;
using namespace paddle::platform;
auto& instance = DataTransformFnMap::Instance();
Variable in;
Variable out;
Tensor* src = in.GetMutable<Tensor>();
src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
src->set_layout(DataLayout::kNHWC);
DeviceContext* ctx = new CPUDeviceContext();
{
auto kernel1 = GenFromBit({1, 0, 0, 0});
auto kernel2 = GenFromBit({1, 0, 1, 0});
auto pair0 = std::make_pair(kernel1, kernel2);
instance.Get(pair0)(ctx, pair0, in, &out);
}
Tensor dst = out.Get<Tensor>();
EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
{
auto kernel1 = GenFromBit({1, 0, 1, 0});
auto kernel2 = GenFromBit({1, 0, 0, 0});
auto pair0 = std::make_pair(kernel1, kernel2);
instance.Get(pair0)(ctx, pair0, out, &in);
}
EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
}
TEST(DataTransform, DataType) {
using namespace paddle::framework;
using namespace paddle::platform;
auto& instance = DataTransformFnMap::Instance();
DeviceContext* ctx = new CPUDeviceContext();
Variable in;
Variable out;
Tensor* src = in.GetMutable<Tensor>();
float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
for (int i = 0; i < 6; ++i) {
ptr[i] = i / 3;
}
{
auto kernel1 = GenFromBit({0, 0, 0, 0});
auto kernel2 = GenFromBit({1, 0, 0, 0});
auto pair0 = std::make_pair(kernel1, kernel2);
instance.Get(pair0)(ctx, pair0, in, &out);
}
Tensor dst = out.Get<Tensor>();
EXPECT_TRUE(dst.data<double>() != nullptr);
}
...@@ -14,18 +14,17 @@ limitations under the License. */ ...@@ -14,18 +14,17 @@ limitations under the License. */
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <set> #include <set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { ...@@ -58,6 +57,19 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
} }
} }
static void CheckTensorNANOrInf(const std::string& name,
const framework::Tensor& tensor) {
if (tensor.memory_size() == 0) {
return;
}
if (tensor.type().hash_code() != typeid(float).hash_code() &&
tensor.type().hash_code() != typeid(double).hash_code()) {
return;
}
PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) { bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail): // TODO(tonyyang-svail):
...@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -101,8 +113,17 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString(); VLOG(3) << op->DebugString();
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
} }
if (create_local_scope) { if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
} }
......
...@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) { ...@@ -71,7 +71,7 @@ bool InitDevices(const std::vector<std::string> &devices) {
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified CPU device, create CPU by Default."; LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
} }
platform::DeviceContextPool::Create(places); platform::DeviceContextPool::Init(places);
return true; return true;
} }
......
...@@ -20,7 +20,11 @@ namespace framework { ...@@ -20,7 +20,11 @@ namespace framework {
// For more details about the design of LibraryType, Please refer to // For more details about the design of LibraryType, Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; enum class LibraryType {
kPlain = 0,
kMKLDNN = 1,
kCUDNN = 2,
};
inline std::string LibraryTypeToString(const LibraryType& library_type) { inline std::string LibraryTypeToString(const LibraryType& library_type) {
switch (library_type) { switch (library_type) {
...@@ -31,7 +35,26 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { ...@@ -31,7 +35,26 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
case LibraryType::kCUDNN: case LibraryType::kCUDNN:
return "CUDNN"; return "CUDNN";
default: default:
PADDLE_THROW("unknown LibraryType %d", library_type); PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
}
}
inline LibraryType StringToLibraryType(const char* ctype) {
std::string s(ctype);
if (s == std::string("PLAIN")) {
return LibraryType::kPlain;
} else if (s == std::string("MKLDNN")) {
return LibraryType::kMKLDNN;
} else if (s == std::string("CUDNN")) {
return LibraryType::kCUDNN;
// To be compatible with register macro.
// CPU, CUDA, PLAIN are same library type.
} else if (s == std::string("CPU")) {
return LibraryType::kPlain;
} else if (s == std::string("CUDA")) {
return LibraryType::kPlain;
} else {
PADDLE_THROW("Unknown LibraryType %s", s.c_str());
} }
} }
......
...@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { ...@@ -189,62 +189,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void SerializeToStream(std::ostream &os, const LoDTensor &tensor, void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) { const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream { // the 1st field, uint32_t version for LoDTensor
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0; constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version)); os.write(reinterpret_cast<const char *>(&version), sizeof(version));
} }
{ // the 2nd field, tensor description {
// int32_t size // the 2st field, LoD information
// void* protobuf message // uint64_t lod_level
proto::TensorDesc desc; // uint64_t lod_level_1 size in byte.
desc.set_data_type(framework::ToDataType(tensor.type())); // int* lod_level_1 data
auto dims = framework::vectorize(tensor.dims()); // ...
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod(); auto lod = tensor.lod();
uint64_t size = lod.size(); uint64_t size = lod.size();
os.write(reinterpret_cast<const char *>(&size), sizeof(size)); os.write(reinterpret_cast<const char *>(&size), sizeof(size));
...@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, ...@@ -256,49 +210,19 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
static_cast<std::streamsize>(size)); static_cast<std::streamsize>(size));
} }
} }
// the 3st field, Tensor
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
} }
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
uint32_t version; {
is.read(reinterpret_cast<char *>(&version), sizeof(version)); // the 1st field, unit32_t version for SelectedRows
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); uint32_t version;
proto::TensorDesc desc; is.read(reinterpret_cast<char *>(&version), sizeof(version));
{ // int32_t size PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
// proto buffer }
int32_t size; {
is.read(reinterpret_cast<char *>(&size), sizeof(size)); // the 2nd field, LoD information
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
uint64_t lod_level; uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level)); is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod(); auto &lod = *tensor->mutable_lod();
...@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { ...@@ -312,6 +236,8 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
lod[i] = tmp; lod[i] = tmp;
} }
} }
// the 3rd field, Tensor
DeserializeFromStream(is, static_cast<Tensor *>(tensor));
} }
} // namespace framework } // namespace framework
......
...@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) { ...@@ -126,6 +126,20 @@ TEST_F(LoDTensorTester, ShrinkInLevel) {
EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>()); EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
} }
TEST_F(LoDTensorTester, SerializeAndDeserialize) {
LoDTensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
std::ostringstream oss;
SerializeToStream(oss, lod_tensor_, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < kLodTensorSize; ++i) {
EXPECT_EQ(dst_ptr[i], i);
}
EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, ...@@ -88,6 +88,14 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
need_update_ = true; need_update_ = true;
} }
void OpDesc::CopyFrom(const OpDesc &op_desc) {
desc_.set_type(op_desc.Type());
inputs_ = op_desc.inputs_;
outputs_ = op_desc.outputs_;
attrs_ = op_desc.attrs_;
need_update_ = true;
}
OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
: desc_(desc), need_update_(false) { : desc_(desc), need_update_(false) {
// restore inputs_ // restore inputs_
...@@ -252,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -252,7 +260,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(int v) const { attr_->set_i(v); } void operator()(int v) const { attr_->set_i(v); }
void operator()(float v) const { attr_->set_f(v); } void operator()(float v) const { attr_->set_f(v); }
void operator()(const std::string &v) const { attr_->set_s(v); } void operator()(const std::string &v) const { attr_->set_s(v); }
void operator()(bool b) const { attr_->set_b(b); }
// Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
template <class T,
class = typename std::enable_if<std::is_same<bool, T>::value>::type>
void operator()(T b) const {
attr_->set_b(b);
}
void operator()(const std::vector<int> &v) const { void operator()(const std::vector<int> &v) const {
VectorToRepeated(v, attr_->mutable_ints()); VectorToRepeated(v, attr_->mutable_ints());
...@@ -266,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -266,9 +280,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(const std::vector<bool> &v) const { void operator()(const std::vector<bool> &v) const {
VectorToRepeated(v, attr_->mutable_bools()); VectorToRepeated(v, attr_->mutable_bools());
} }
void operator()(proto::BlockDesc *desc) const { void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
attr_->set_block_idx(desc->idx());
}
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
}; };
......
...@@ -35,6 +35,8 @@ class OpDesc { ...@@ -35,6 +35,8 @@ class OpDesc {
OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
void CopyFrom(const OpDesc &op_desc);
proto::OpDesc *Proto(); proto::OpDesc *Proto();
std::string Type() const { return desc_.type(); } std::string Type() const { return desc_.type(); }
......
...@@ -68,6 +68,8 @@ struct OpKernelType { ...@@ -68,6 +68,8 @@ struct OpKernelType {
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_; library_type_ == o.library_type_;
} }
bool operator!=(const OpKernelType& o) const { return !(*this == o); }
}; };
inline std::ostream& operator<<(std::ostream& os, inline std::ostream& operator<<(std::ostream& os,
...@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os, ...@@ -78,5 +80,11 @@ inline std::ostream& operator<<(std::ostream& os,
return os; return os;
} }
inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
std::ostringstream stream;
stream << kernel_key;
return stream.str();
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) { ...@@ -26,10 +26,8 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN); LibraryType::kCUDNN);
std::ostringstream stream;
stream << op_kernel_type;
ASSERT_EQ( ASSERT_EQ(
stream.str(), paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
} }
...@@ -48,4 +46,4 @@ TEST(OpKernelType, Hash) { ...@@ -48,4 +46,4 @@ TEST(OpKernelType, Hash) {
OpKernelType::Hash hasher; OpKernelType::Hash hasher;
ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
} }
\ No newline at end of file
...@@ -37,8 +37,8 @@ class Registrar { ...@@ -37,8 +37,8 @@ class Registrar {
public: public:
// In our design, various kinds of classes, e.g., operators and kernels, // In our design, various kinds of classes, e.g., operators and kernels,
// have their corresponding registry and registrar. The action of // have their corresponding registry and registrar. The action of
// registration is in the constructor of a global registrar variable, which, // registration is in the constructor of a global registrar variable, which
// however, are not used in the code that calls package framework, and would // is not used in the code that uses the framework package, and would
// be removed from the generated binary file by the linker. To avoid such // be removed from the generated binary file by the linker. To avoid such
// removal, we add Touch to all registrar classes and make USE_OP macros to // removal, we add Touch to all registrar classes and make USE_OP macros to
// call this method. So, as long as the callee code calls USE_OP, the global // call this method. So, as long as the callee code calls USE_OP, the global
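A hedged illustration of the pattern this comment describes, assuming the USE_OP macro family defined alongside this registry (the operator name "mul" and the file name are examples only):
// some_caller.cc -- links against the operator library but never names the
// registrar directly; USE_OP emits a reference to the generated Touch*
// functions, so the linker keeps the global registrar objects (and their
// registering constructors) alive.
#include "paddle/framework/op_registry.h"

USE_OP(mul);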
...@@ -79,30 +79,31 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -79,30 +79,31 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE = using KERNEL_TYPE =
typename std::tuple_element<I, std::tuple<KernelTypes...>>::type; typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
void operator()(const char* op_type) const { void operator()(const char* op_type, const char* library_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType()); OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
DataLayout::kAnyLayout, StringToLibraryType(library_type));
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE); OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...> OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func; func;
func(op_type); func(op_type, library_type);
} }
}; };
template <typename PlaceType, size_t I, typename... KernelType> template <typename PlaceType, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type) const {} void operator()(const char* op_type, const char* library_type) const {}
}; };
// User can register many kernel in one place. The data type could be different. // User can register many kernel in one place. The data type could be different.
template <typename PlaceType, typename... KernelType> template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar { class OpKernelRegistrar : public Registrar {
public: public:
explicit OpKernelRegistrar(const char* op_type) { explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func; OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
func(op_type); func(op_type, library_type);
} }
}; };
...@@ -181,7 +182,8 @@ class OpKernelRegistrar : public Registrar { ...@@ -181,7 +182,8 @@ class OpKernelRegistrar : public Registrar {
__reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \ __reg_op_kernel_##op_type##_##DEVICE_TYPE##__, \
"REGISTER_OP_KERNEL must be called in global namespace"); \ "REGISTER_OP_KERNEL must be called in global namespace"); \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \ static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type); \ __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type, \
#DEVICE_TYPE); \
int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \ int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() { \
__op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \ __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch(); \
return 0; \ return 0; \
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -182,3 +196,71 @@ TEST(OperatorRegistrar, Test) { ...@@ -182,3 +196,71 @@ TEST(OperatorRegistrar, Test) {
using namespace paddle::framework; using namespace paddle::framework;
OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos"); OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
} }
namespace paddle {
namespace framework {
class OpKernelTestMaker : public OpProtoAndCheckerMaker {
public:
OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddComment("NoGradOp, same input output. no Grad");
}
};
class OpWithKernelTest : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(InferShapeContext* ctx) const override {}
framework::OpKernelType GetActualKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
}
};
template <typename DeviceContext, typename T>
class OpKernelTest : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {}
};
} // namespace framework
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestMaker);
REGISTER_OP_CPU_KERNEL(
op_with_kernel,
paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(op_with_kernel,
paddle::framework::OpKernelTest<
paddle::platform::CUDADeviceContext, float>);
TEST(OperatorRegistrar, CPU) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
op_desc.set_type("op_with_kernel");
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_place);
}
#ifdef PADDLE_WITH_CUDA
TEST(OperatorRegistrar, CUDA) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0);
paddle::framework::Scope scope;
op_desc.set_type("op_with_kernel");
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cuda_place);
}
#endif
...@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -384,12 +384,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
const Scope& scope_; const Scope& scope_;
}; };
const platform::DeviceContext* GetDeviceContext(
framework::KernelTypePair& kernel_pair) {
auto& actual_kernel_key = kernel_pair.first;
auto& expected_kernel_key = kernel_pair.second;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
if (platform::is_gpu_place(actual_kernel_key.place_) &&
platform::is_cpu_place(expected_kernel_key.place_)) {
return pool.Get(actual_kernel_key.place_);
} else if (platform::is_cpu_place(actual_kernel_key.place_) &&
platform::is_gpu_place(expected_kernel_key.place_)) {
return pool.Get(expected_kernel_key.place_);
} else {
PADDLE_THROW(
"Currently, model parallelism is only supported between CPU and CUDA");
}
}
void OperatorWithKernel::Run(const Scope& scope, void OperatorWithKernel::Run(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Borrow(place); auto dev_ctx = pool.Get(place);
// check if op[type] has kernel registered. // check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels(); auto& all_op_kernels = AllOpKernels();
...@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -413,37 +431,47 @@ void OperatorWithKernel::Run(const Scope& scope,
} }
if (actual_kernel_key == expected_kernel_key) { if (actual_kernel_key == expected_kernel_key) {
kernel_iter->second->Compute(ctx); PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
"Currently, model parallelism is only supported between "
"CPU and other devices. For example, multi-GPU model "
"parallelism will failed.");
} else { } else {
Scope& op_scope = scope.NewScope(); auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
auto input_vars = this->InputVars(); const DataTransformFn* trans_fun =
for (auto var_name : input_vars) { DataTransformFnMap::Instance().GetNullable(kernel_pair);
op_scope.Var(var_name); if (trans_fun) {
} auto input_vars = this->InputVars();
// TODO(qijun) filter the input vars that do not need to be transformed
// TODO(qijun) get appropriate DeviceContext from DeviceContext pool
platform::DeviceContext* trans_dev_ctx = nullptr; // filter vars that has been transformed
std::vector<platform::DeviceContext*> trans_dev_ctx_vec{trans_dev_ctx}; std::vector<std::string> need_trans;
for (auto var_name : input_vars) {
auto var_name_trans =
var_name + framework::KernelTypeToString(expected_kernel_key);
if (!scope.FindVar(var_name_trans)) {
const_cast<Scope&>(scope).Var(var_name_trans);
need_trans.push_back(var_name);
}
}
// TODO(qijun) get appropriate DataTransformFN from global map if (!need_trans.empty()) {
framework::DataTransformFN trans_fun = nullptr; auto trans_dev_ctx = GetDeviceContext(kernel_pair);
// Wait for transform starting // Wait for transform starting
dev_ctx->Wait(); dev_ctx->Wait();
for (auto var_name : input_vars) { for (auto var_name : need_trans) {
trans_fun(trans_dev_ctx_vec, *(scope.FindVar(var_name)), (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
op_scope.FindVar(var_name)); scope.FindVar(var_name + framework::KernelTypeToString(
} expected_kernel_key)));
// Wait for data transform finishing }
for (auto ctx : trans_dev_ctx_vec) { // Wait for data transform finishing
ctx->Wait(); trans_dev_ctx->Wait();
}
} }
// Create a new ExecutionContext
ExecutionContext op_ctx(*this, op_scope, *dev_ctx);
kernel_iter->second->Compute(op_ctx);
} }
kernel_iter->second->Compute(ctx);
} }
OpKernelType OperatorWithKernel::GetActualKernelType( OpKernelType OperatorWithKernel::GetActualKernelType(
......
...@@ -12,5 +12,58 @@ limitations under the License. */ ...@@ -12,5 +12,58 @@ limitations under the License. */
#include "paddle/framework/selected_rows.h" #include "paddle/framework/selected_rows.h"
namespace paddle { namespace paddle {
namespace framework {} // namespace framework namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{
// the 2nd field, rows information
auto& rows = selected_rows.rows();
uint64_t size = rows.size();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
for (uint64_t i = 0; i < size; ++i) {
os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
}
}
{
// the 3rd field, the height of SelectedRows
int64_t height = selected_rows.height();
os.write(reinterpret_cast<const char*>(&height), sizeof(height));
}
// the 4th field, Tensor data
SerializeToStream(os, selected_rows.value(), dev_ctx);
}
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
auto* tensor = selected_rows->mutable_value();
{
// the 1st field, uint32_t version for SelectedRows
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{
// the 2nd field, rows information
uint64_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
auto& rows = *selected_rows->mutable_rows();
rows.resize(size);
for (uint64_t i = 0; i < size; ++i) {
is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
}
}
{
// the 3rd field, the height of the SelectedRows
int64_t height;
is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
selected_rows->set_height(height);
}
// the 4th field, tensor which contains the data
DeserializeFromStream(is, tensor);
}
} // namespace framework
} // namespace paddle } // namespace paddle
...@@ -59,5 +59,14 @@ class SelectedRows { ...@@ -59,5 +59,14 @@ class SelectedRows {
int64_t height_; int64_t height_;
}; };
/*
* Serialize/Deserialize SelectedRows to/from std::ostream.
* You can pass std::ofstream or std::ostringstream to serialize to a file
* or to an in-memory string. A GPU tensor will be copied to CPU first.
*/
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
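A minimal usage sketch for the pair of functions declared above (the file name, helper name, and CPU-only context are illustrative assumptions):
#include <fstream>

#include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h"

void SaveAndLoadSelectedRows(const paddle::framework::SelectedRows& src,
                             paddle::framework::SelectedRows* dst) {
  paddle::platform::CPUDeviceContext cpu_ctx((paddle::platform::CPUPlace()));
  std::ofstream fout("selected_rows.bin", std::ios::binary);  // illustrative path
  paddle::framework::SerializeToStream(fout, src, cpu_ctx);
  fout.close();

  std::ifstream fin("selected_rows.bin", std::ios::binary);
  paddle::framework::DeserializeFromStream(fin, dst);
}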
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) { ...@@ -43,5 +43,19 @@ TEST_F(SelectedRowsTester, complete_dims) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
} }
TEST_F(SelectedRowsTester, SerializeAndDeserialize) {
SelectedRows dst_tensor;
platform::CPUDeviceContext cpu_ctx(place_);
std::ostringstream oss;
SerializeToStream(oss, *selected_rows_, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -178,7 +178,7 @@ class Tensor { ...@@ -178,7 +178,7 @@ class Tensor {
DDim dims_; DDim dims_;
/** /**
* @brief the layout of memory block, default is NCHW. * @brief the layout of memory block, default is NHWC.
* *
* @note the memory allocation order, describe how weight/data is stored * @note the memory allocation order, describe how weight/data is stored
* For example, in 4-D Tensor(rank=4), there are three commonly * For example, in 4-D Tensor(rank=4), there are three commonly
......
...@@ -15,12 +15,13 @@ ...@@ -15,12 +15,13 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string> #include <string>
namespace framework = paddle::framework;
namespace platform = paddle::platform;
TEST(Tensor, Dims) { TEST(Tensor, Dims) {
using namespace paddle::framework; framework::Tensor tt;
using namespace paddle::platform;
Tensor tt;
tt.Resize({2, 3, 4}); tt.Resize({2, 3, 4});
DDim dims = tt.dims(); framework::DDim dims = tt.dims();
ASSERT_EQ(arity(dims), 3); ASSERT_EQ(arity(dims), 3);
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
EXPECT_EQ(i + 2, dims[i]); EXPECT_EQ(i + 2, dims[i]);
...@@ -28,12 +29,12 @@ TEST(Tensor, Dims) { ...@@ -28,12 +29,12 @@ TEST(Tensor, Dims) {
} }
TEST(Tensor, DataAssert) { TEST(Tensor, DataAssert) {
paddle::framework::Tensor src_tensor; framework::Tensor src_tensor;
bool caught = false; bool caught = false;
try { try {
src_tensor.data<double>(); src_tensor.data<double>();
} catch (paddle::platform::EnforceNotMet err) { } catch (platform::EnforceNotMet err) {
caught = true; caught = true;
std::string msg = std::string msg =
"holder_ should not be null\nTensor holds no memory. Call " "holder_ should not be null\nTensor holds no memory. Call "
...@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) { ...@@ -50,61 +51,65 @@ TEST(Tensor, DataAssert) {
because Memory::Alloc() and Memory::Free() have not been ready. because Memory::Alloc() and Memory::Free() have not been ready.
*/ */
TEST(Tensor, MutableData) { TEST(Tensor, MutableData) {
using namespace paddle::framework;
using namespace paddle::platform;
{ {
Tensor src_tensor; framework::Tensor src_tensor;
float* p1 = nullptr; float* p1 = nullptr;
float* p2 = nullptr; float* p2 = nullptr;
// initialization // initialization
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace()); p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::CPUPlace());
EXPECT_NE(p1, nullptr); EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size // set src_tensor a new dim with large size
// memory is supposed to be re-allocated // memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace()); p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
platform::CPUPlace());
EXPECT_NE(p2, nullptr); EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2); EXPECT_NE(p1, p2);
// set src_tensor a new dim with same size // set src_tensor a new dim with same size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace()); p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::CPUPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size // set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace()); p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CPUPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; framework::Tensor src_tensor;
float* p1 = nullptr; float* p1 = nullptr;
float* p2 = nullptr; float* p2 = nullptr;
// initialization // initialization
p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CUDAPlace()); p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
platform::CUDAPlace());
EXPECT_NE(p1, nullptr); EXPECT_NE(p1, nullptr);
// set src_tensor a new dim with large size // set src_tensor a new dim with large size
// memory is supposed to be re-allocated // memory is supposed to be re-allocated
p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CUDAPlace()); p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
platform::CUDAPlace());
EXPECT_NE(p2, nullptr); EXPECT_NE(p2, nullptr);
EXPECT_NE(p1, p2); EXPECT_NE(p1, p2);
// set src_tensor a new dim with same size // set src_tensor a new dim with same size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CUDAPlace()); p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
platform::CUDAPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
// set src_tensor a new dim with smaller size // set src_tensor a new dim with smaller size
// memory block is supposed to be unchanged // memory block is supposed to be unchanged
p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CUDAPlace()); p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
platform::CUDAPlace());
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
#endif #endif
} }
TEST(Tensor, ShareDataWith) { TEST(Tensor, ShareDataWith) {
using namespace paddle::framework;
using namespace paddle::platform;
{ {
Tensor src_tensor; framework::Tensor src_tensor;
Tensor dst_tensor; framework::Tensor dst_tensor;
// Try to share data from uninitialized tensor // Try to share data from uninitialized tensor
bool caught = false; bool caught = false;
try { try {
...@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) { ...@@ -121,16 +126,18 @@ TEST(Tensor, ShareDataWith) {
} }
ASSERT_TRUE(caught); ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace()); src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::CPUPlace());
dst_tensor.ShareDataWith(src_tensor); dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; framework::Tensor src_tensor;
Tensor dst_tensor; framework::Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CUDAPlace()); src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
platform::CUDAPlace());
dst_tensor.ShareDataWith(src_tensor); dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
...@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) { ...@@ -138,13 +145,12 @@ TEST(Tensor, ShareDataWith) {
} }
TEST(Tensor, Slice) { TEST(Tensor, Slice) {
using namespace paddle::framework;
using namespace paddle::platform;
{ {
Tensor src_tensor; framework::Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace()); src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
Tensor slice_tensor = src_tensor.Slice(1, 3); platform::CPUPlace());
DDim slice_dims = slice_tensor.dims(); framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3); ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2); EXPECT_EQ(slice_dims[0], 2);
EXPECT_EQ(slice_dims[1], 3); EXPECT_EQ(slice_dims[1], 3);
...@@ -153,11 +159,12 @@ TEST(Tensor, Slice) { ...@@ -153,11 +159,12 @@ TEST(Tensor, Slice) {
uintptr_t src_data_address = uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<int>()); reinterpret_cast<uintptr_t>(src_tensor.data<int>());
uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace())); src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
uintptr_t slice_data_address = uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<int>()); reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t slice_mutable_data_address =
slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace())); reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
slice_tensor.dims(), platform::CPUPlace()));
EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
...@@ -165,22 +172,25 @@ TEST(Tensor, Slice) { ...@@ -165,22 +172,25 @@ TEST(Tensor, Slice) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; framework::Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), CUDAPlace()); src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
Tensor slice_tensor = src_tensor.Slice(2, 6); platform::CUDAPlace());
DDim slice_dims = slice_tensor.dims(); framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
framework::DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2); ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4); EXPECT_EQ(slice_dims[0], 4);
EXPECT_EQ(slice_dims[1], 9); EXPECT_EQ(slice_dims[1], 9);
uintptr_t src_data_address = uintptr_t src_data_address =
reinterpret_cast<uintptr_t>(src_tensor.data<double>()); reinterpret_cast<uintptr_t>(src_tensor.data<double>());
uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t src_mutable_data_address =
src_tensor.mutable_data<double>(src_tensor.dims(), CUDAPlace())); reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
src_tensor.dims(), platform::CUDAPlace()));
uintptr_t slice_data_address = uintptr_t slice_data_address =
reinterpret_cast<uintptr_t>(slice_tensor.data<double>()); reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>( uintptr_t slice_mutable_data_address =
slice_tensor.mutable_data<double>(slice_tensor.dims(), CUDAPlace())); reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
slice_tensor.dims(), platform::CUDAPlace()));
EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(src_data_address, src_mutable_data_address);
EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address);
EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
...@@ -189,23 +199,19 @@ TEST(Tensor, Slice) { ...@@ -189,23 +199,19 @@ TEST(Tensor, Slice) {
} }
TEST(Tensor, ReshapeToMatrix) { TEST(Tensor, ReshapeToMatrix) {
using namespace paddle::framework; framework::Tensor src;
using namespace paddle::platform; int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
Tensor src;
int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i; src_ptr[i] = i;
} }
Tensor res = ReshapeToMatrix(src, 2); framework::Tensor res = framework::ReshapeToMatrix(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9); ASSERT_EQ(res.dims()[1], 4 * 9);
} }
TEST(Tensor, Layout) { TEST(Tensor, Layout) {
using namespace paddle::framework; framework::Tensor src;
using namespace paddle::platform; ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
Tensor src; src.set_layout(framework::DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), DataLayout::kNHWC); ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
src.set_layout(DataLayout::kAnyLayout);
ASSERT_EQ(src.layout(), DataLayout::kAnyLayout);
} }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_util.h"
namespace paddle {
namespace framework {
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void operator()() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// out is true if any element of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
CopyFrom(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct HasNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is nan.
return eigen_vec.isnan();
}
};
bool HasNAN(const framework::Tensor& tensor) {
HasNANPredicate predicate;
return Any(tensor, predicate);
}
struct HasInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool HasInf(const framework::Tensor& tensor) {
HasInfPredicate predicate;
return Any(tensor, predicate);
}
} // namespace framework
} // namespace paddle
./tensor_util.cc
\ No newline at end of file
...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/data_type.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) { ...@@ -205,5 +209,109 @@ inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size); src_ptr, size);
} }
// Returns true if a tensor contains NAN, i.e., Not A Number.
bool HasNAN(const framework::Tensor& tensor);
// Returns true if a tensor contains Inf, i.e., Infinity.
bool HasInf(const framework::Tensor& tensor);
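As a small hedged convenience sketch combining the two checks above (the wrapper name is not part of the API; the unit tests later in this diff exercise the helpers individually):
// True if the tensor holds any NaN or Inf value; simply composes the two
// helpers declared above.
inline bool HasInvalidValue(const paddle::framework::Tensor& tensor) {
  return paddle::framework::HasNAN(tensor) || paddle::framework::HasInf(tensor);
}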
inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
platform::Place cpu = platform::CPUPlace();
// TODO(Yancey1989): use VisitDataType instead of the DataType switch
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include "paddle/framework/tensor_util.h" #include "paddle/framework/tensor_util.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <cmath>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) { ...@@ -230,5 +231,78 @@ TEST(CopyToVector, Tensor) {
#endif #endif
} }
TEST(HasNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
float* buf = src.mutable_data<float>({3}, CPUPlace());
buf[0] = 0.0;
buf[1] = NAN;
buf[2] = 0.0;
ASSERT_TRUE(HasNAN(src));
}
TEST(HasInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
double* buf = src.mutable_data<double>({3}, CPUPlace());
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
ASSERT_TRUE(HasInf(src));
}
TEST(Tensor, SerializeAndDeserialize) {
framework::Tensor src_tensor;
int array[6] = {1, 2, 3, 4, 5, 6};
src_tensor.Resize({2, 3});
int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
src_ptr[i] = array[i];
}
{
framework::Tensor dst_tensor;
auto place = new platform::CPUPlace();
platform::CPUDeviceContext cpu_ctx(*place);
std::ostringstream oss;
SerializeToStream(oss, src_tensor, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete place;
}
#ifdef PADDLE_WITH_CUDA
{
Tensor gpu_tensor;
gpu_tensor.Resize({2, 3});
Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
std::ostringstream oss;
SerializeToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
delete gpu_place;
}
#endif
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
static __global__ void FillNAN(float* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2] = NAN;
}
static __global__ void FillInf(float* buf) {
buf[0] = 0.0;
buf[1] = INFINITY;
buf[2] = 0.5;
}
TEST(HasNAN, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasNAN(tensor));
}
TEST(HasInf, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(HasInf(tensor));
}
} // namespace framework
} // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <condition_variable> #include <condition_variable>
#include <functional> #include <functional>
#include <future>
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <thread> #include <thread>
...@@ -25,10 +26,11 @@ limitations under the License. */ ...@@ -25,10 +26,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
typedef std::function<void()> Task;
class ThreadPool { class ThreadPool {
public: public:
typedef std::packaged_task<void()> Task;
typedef std::function<void()> Fun;
/** /**
* @brief Get a instance of threadpool, the thread number will * @brief Get a instance of threadpool, the thread number will
* be specified as the number of hardware thread contexts * be specified as the number of hardware thread contexts
...@@ -61,13 +63,18 @@ class ThreadPool { ...@@ -61,13 +63,18 @@ class ThreadPool {
/** /**
* @brief Push a function to the queue, and will be scheduled and * @brief Push a function to the queue, and will be scheduled and
* executed if a thread is available. * executed if a thread is available.
* @param[in] Task will be pushed to the task queue. * @param[in] fn, the function to be pushed to the task queue.
* @return std::future<void>, which can be used to wait until the task
* finishes by calling f.wait().
*/ */
void Run(const Task& fn) { std::future<void> Run(const Fun& fn) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
tasks_.push(fn); Task task(std::bind(fn));
std::future<void> f = task.get_future();
tasks_.push(std::move(task));
lock.unlock(); lock.unlock();
scheduled_.notify_one(); scheduled_.notify_one();
return f;
} }
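A minimal sketch of the new Run() contract (the lambda body is a placeholder; GetInstance() is the existing singleton accessor used by the tests below):
auto* pool = paddle::framework::ThreadPool::GetInstance();
std::future<void> done = pool->Run([] { /* enqueue some work */ });
done.wait();  // blocks until the submitted task has finished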
/** /**
...@@ -110,7 +117,7 @@ class ThreadPool { ...@@ -110,7 +117,7 @@ class ThreadPool {
break; break;
} }
// pop a task from the task queue // pop a task from the task queue
auto task = tasks_.front(); auto task = std::move(tasks_.front());
tasks_.pop(); tasks_.pop();
--available_; --available_;
......
...@@ -20,16 +20,21 @@ limitations under the License. */ ...@@ -20,16 +20,21 @@ limitations under the License. */
namespace framework = paddle::framework; namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) { void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
pool->Run([&sum]() { sum.fetch_add(1); }); auto f = pool->Run([&sum]() { sum.fetch_add(1); });
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
} }
} }
TEST(ThreadPool, ConcurrentInit) { TEST(ThreadPool, ConcurrentInit) {
framework::ThreadPool* pool; framework::ThreadPool* pool;
int concurrent_cnt = 50; int n = 50;
std::vector<std::thread> threads; std::vector<std::thread> threads;
for (int i = 0; i < concurrent_cnt; ++i) { for (int i = 0; i < n; ++i) {
std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) { ...@@ -38,13 +43,13 @@ TEST(ThreadPool, ConcurrentInit) {
} }
} }
TEST(ThreadPool, ConcurrentStart) { TEST(ThreadPool, ConcurrentRun) {
framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
std::atomic<int> sum(0); std::atomic<int> sum(0);
std::vector<std::thread> threads; std::vector<std::thread> threads;
int concurrent_cnt = 50; int n = 50;
// sum = (n * (n + 1)) / 2 // sum = (n * (n + 1)) / 2
for (int i = 1; i <= concurrent_cnt; ++i) { for (int i = 1; i <= n; ++i) {
std::thread t(do_sum, pool, std::ref(sum), i); std::thread t(do_sum, pool, std::ref(sum), i);
threads.push_back(std::move(t)); threads.push_back(std::move(t));
} }
...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) { ...@@ -52,5 +57,5 @@ TEST(ThreadPool, ConcurrentStart) {
t.join(); t.join();
} }
pool->Wait(); pool->Wait();
EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); EXPECT_EQ(sum, ((n + 1) * n) / 2);
} }
...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const { ...@@ -74,7 +74,7 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
case proto::VarDesc::LOD_TENSOR_ARRAY: case proto::VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().tensor(); return desc_.tensor_array().tensor();
default: default:
PADDLE_THROW("Unexpected branch."); PADDLE_THROW("The type of var '", this->Name(), "' is unsupported.");
} }
} }
......
...@@ -126,14 +126,165 @@ public: ...@@ -126,14 +126,165 @@ public:
inputData += inputChannels * inputHeight * inputWidth; inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth; outputData += outputChannels * outputHeight * outputWidth;
} }
}
};
#ifdef PADDLE_MOBILE_INFERENCE #ifdef PADDLE_MOBILE_INFERENCE
if (Device == DEVICE_TYPE_CPU) {
memory_.reset(); /*
* \brief Forward calculation of convolution, optimized for mobile.
*/
template <DeviceType Device>
class GemmConvMobileFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
// TODO(hedaoyuan): Need to define some index macros,
// to avoid using 0 and 1.
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
real beta;
if (outputs[0].getArgType() == ADD_TO) {
beta = 1.0;
} else {
beta = 0.0;
} }
#endif
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
bool needIm2col = isNeedIm2col(filter);
TensorShape imShape =
TensorShape({inputChannels / groups_, inputHeight, inputWidth});
TensorShape colShape;
real* colData = NULL;
size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
size_t colWidth = outputHeight * outputWidth;
// Max col matrix height 256, Max col matrix width 2048
size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
if (needIm2col) {
colShape = TensorShape({inputChannels / groups_,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
colData = reinterpret_cast<real*>(memory_->getBuf());
}
Im2ColMobileFunctor<real> im2col;
size_t inputOffset = imShape.getElements();
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
int nStride = colWidth;
int kStride = colHeight;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
if (needIm2col) {
real beta_ = beta;
for (size_t colHeightStart = 0; colHeightStart < colHeight;
colHeightStart += stepColHeight) {
for (size_t colWidthStart = 0; colWidthStart < colWidth;
colWidthStart += stepColWidth) {
int N = std::min(colWidth - colWidthStart, stepColWidth);
int K = std::min(colHeight - colHeightStart, stepColHeight);
// im2col
im2col(inputData + g * inputOffset,
imShape,
colData,
colShape,
strideH(),
strideW(),
paddingH(),
paddingW(),
dilationH(),
dilationW(),
colHeightStart,
K,
colWidthStart,
N);
// gemm
int M = outputChannels / groups_;
BlasGemm<Device, real>::compute(
false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset + colHeightStart,
kStride,
colData,
N,
beta_,
outputData + g * outputOffset + colWidthStart,
nStride);
}
beta_ = 1.0;
}
} else {
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
BlasGemm<Device, real>::compute(false,
false,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
inputData + g * inputOffset,
N,
beta,
outputData + g * outputOffset,
N);
}
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
memory_.reset();
} }
}; };
#endif
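Rough arithmetic on the tiling above, assuming a 4-byte real (the constants are stepColHeight and stepColWidth from the code):
#include <cstddef>
// The tiled im2col scratch buffer is bounded by a layer-independent constant:
//   256 * 2048 * sizeof(float) = 2,097,152 bytes (2 MB),
// whereas the untiled buffer used by GemmConvFunction needs
//   (inputChannels / groups) * filterH * filterW * outputH * outputW * 4 bytes,
// which grows with the layer size.
constexpr size_t kTiledColBufferBytes = 256 * 2048 * sizeof(float);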
/* /*
* \brief Backward input calculation of convolution. * \brief Backward input calculation of convolution.
*/ */
...@@ -348,7 +499,11 @@ public: ...@@ -348,7 +499,11 @@ public:
} }
}; };
#ifdef PADDLE_MOBILE_INFERENCE
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
#else
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
#endif
REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -98,4 +98,54 @@ public: ...@@ -98,4 +98,54 @@ public:
int dilationWidth = 1); int dilationWidth = 1);
}; };
template <class T>
class Im2ColMobileFunctor {
public:
void operator()(const T* imData,
const TensorShape& imShape,
T* colData,
const TensorShape& colShape,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int dilationHeight,
int dilationWidth,
int colHeightStart,
int colHeightSize,
int colWidthStart,
int colWidthSize) {
int inputHeight = imShape[1];
int inputWidth = imShape[2];
int filterHeight = colShape[1];
int filterWidth = colShape[2];
int outputWidth = colShape[4];
for (int colh = 0; colh < colHeightSize; colh++) {
int wOffset = (colHeightStart + colh) % filterWidth;
int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
for (int colw = 0; colw < colWidthSize; colw++) {
int h = (colWidthStart + colw) / outputWidth;
int w = (colWidthStart + colw) % outputWidth;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[colh * colWidthSize + colw] = static_cast<T>(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[colh * colWidthSize + colw] =
imData[imRowIdx * inputWidth + imColIdx];
}
}
}
}
};
} // namespace paddle } // namespace paddle
...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); } ...@@ -138,4 +138,86 @@ TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
#endif #endif
template <class T>
void TestIm2ColMobileFunctor() {
for (size_t channels : {32}) {
for (size_t inputHeight : {33, 100}) {
for (size_t inputWidth : {32, 96}) {
for (size_t filterHeight : {5}) {
for (size_t filterWidth : {7}) {
for (size_t stride : {2}) {
for (size_t padding : {1}) {
for (size_t dilation : {1, 3}) {
size_t filterSizeH = (filterHeight - 1) * dilation + 1;
size_t filterSizeW = (filterWidth - 1) * dilation + 1;
if (inputHeight + 2 * padding < filterSizeH ||
inputWidth + 2 * padding < filterSizeW)
break;
if (padding >= filterSizeH || padding >= filterSizeW) break;
size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
size_t outputWidth =
(inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape imShape =
TensorShape({channels, inputHeight, inputWidth});
TensorShape colShape1 = TensorShape({channels,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
size_t height = channels * filterHeight * filterWidth;
size_t width = outputHeight * outputWidth;
VectorPtr input1 =
Vector::create(imShape.getElements(), false);
VectorPtr input2 =
Vector::create(imShape.getElements(), false);
MatrixPtr output1 =
Matrix::create(height, width, false, false);
MatrixPtr output2 =
Matrix::create(height, width, false, false);
input1->uniform(0.001, 1);
input2->copyFrom(*input1);
Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
Im2ColMobileFunctor<T> im2Col2;
im2Col1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation);
im2Col2(input2->getData(),
imShape,
output2->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation,
0,
height,
0,
width);
autotest::TensorCheckEqual(*output1, *output2);
}
}
}
}
}
}
}
}
}
TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
} // namespace paddle
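For readers unfamiliar with im2col, the following standalone sketch (not part of this commit) reproduces the index mapping used by Im2ColMobileFunctor above for a single channel, with made-up sizes (4x4 input, 3x3 filter, stride 1, padding 1). It only illustrates how each image patch becomes one column of the col matrix; it does not use the Paddle types.

// Standalone sketch: same im2col index arithmetic as Im2ColMobileFunctor,
// specialized to one channel, on a tiny made-up input.
#include <cstdio>
#include <vector>

int main() {
  const int inputH = 4, inputW = 4, filterH = 3, filterW = 3;
  const int stride = 1, padding = 1, dilation = 1;
  const int outputH = (inputH + 2 * padding - filterH) / stride + 1;  // 4
  const int outputW = (inputW + 2 * padding - filterW) / stride + 1;  // 4
  std::vector<float> im(inputH * inputW);
  for (int i = 0; i < inputH * inputW; ++i) im[i] = static_cast<float>(i);

  // col has (filterH * filterW) rows and (outputH * outputW) columns.
  std::vector<float> col(filterH * filterW * outputH * outputW, 0.f);
  for (int colh = 0; colh < filterH * filterW; ++colh) {
    int wOffset = colh % filterW;   // column offset inside the filter window
    int hOffset = colh / filterW;   // row offset inside the filter window
    for (int colw = 0; colw < outputH * outputW; ++colw) {
      int h = colw / outputW, w = colw % outputW;    // output position
      int imRow = h * stride + hOffset * dilation - padding;
      int imCol = w * stride + wOffset * dilation - padding;
      if (imRow >= 0 && imRow < inputH && imCol >= 0 && imCol < inputW)
        col[colh * outputH * outputW + colw] = im[imRow * inputW + imCol];
      // out-of-bounds positions stay zero, matching the padding branch above
    }
  }
  std::printf("col matrix: %d x %d\n", filterH * filterW, outputH * outputW);
  return 0;
}

The extra colHeightStart/colHeightSize/colWidthStart/colWidthSize parameters of the mobile functor simply restrict these two loops to a tile of the col matrix, which is what GemmConvMobileFunction and the test above exercise.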
...@@ -34,6 +34,16 @@ else()
message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
endif()
if(NOT WITH_MKLML)
file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
message(STATUS "Skip compiling with MKLPackedLayers")
else()
message(STATUS "Compile with MKLPackedLayers")
endif()
if(NOT WITH_GPU)
list(REMOVE_ITEM GSERVER_HEADER
layers/CudnnConvBaseLayer.h
......
...@@ -29,7 +29,7 @@ bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
}
/* the size of inputs for norm-layer is 1 */
CHECK_EQ(config_.inputs_size(), 1UL);
CHECK_EQ(config_.inputs_size(), 1);
const NormConfig& conf = config_.inputs(0).norm_conf();
localSize_ = conf.size();
alpha_ = conf.scale();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLPackedRecurrentLayer.h"
namespace paddle {
REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
packed_weight_->pack();
if (needGradient_) {
packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
packed_weightT_->pack();
}
return true;
}
void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
RecurrentLayer::backward(callback);
packed_weight_->pack();
if (needGradient_) {
packed_weightT_->pack();
}
}
void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
size_t numSequences,
const int* starts) {
if (!batchValue_) {
batchValue_.reset(new SequenceToBatch(useGpu_));
}
batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
batchValue_->copyFromSeq(*output_.value);
{
REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
/* forward one batch */
for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
MatrixPtr batchValue = batchValue_->getBatchValue(n);
if (n != 0) {
MatrixPtr preBatchValue =
batchValue_->getBatchValue(n - 1, batchValue->getHeight());
packed_weight_->gemm_compute(preBatchValue, batchValue);
}
Argument arg;
arg.value = batchValue;
activation_->forward(arg).check();
}
}
batchValue_->copyBackSeq(*output_.value);
}
void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
size_t numSequences,
const int* starts) {
if (!batchGrad_) {
batchGrad_.reset(new SequenceToBatch(useGpu_));
}
batchGrad_->shareIndexWith(*batchValue_);
size_t numBatch = batchGrad_->getNumBatch();
bool backwardByBatch = numBatch < numSequences;
batchGrad_->copyFromSeq(*output_.grad);
{
REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
/* backward one batch */
for (int n = (int)numBatch - 1; n >= 0; n--) {
MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
MatrixPtr batchValue =
batchValue_->getBatchValue(n, batchGrad->getHeight());
Argument arg;
arg.value = batchValue;
arg.grad = batchGrad;
activation_->backward(arg).check();
if (n != 0) {
batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
packed_weightT_->gemm_compute(batchGrad, batchValue);
}
if (backwardByBatch && weight_->getWGrad()) {
if (n != 0) {
/* backward weight */
batchValue =
batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
weight_->getWGrad()->mul(
*batchValue->getTranspose(), *batchGrad, 1, 1);
}
}
}
}
batchGrad_->copyBackSeq(*output_.grad);
if (!backwardByBatch && weight_->getWGrad()) {
REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
for (size_t seq = 0; seq < numSequences; ++seq) {
int len = starts[seq + 1] - starts[seq];
weight_->getWGrad()->mul(
*output_.value
->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
->getTranspose(),
*output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
len - 1),
1,
1);
}
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLPackedWeight.h"
#include "RecurrentLayer.h"
DECLARE_bool(rnn_use_batch);
namespace paddle {
/**
* @brief MKLPackedRecurrentLayer is almost the same as RecurrentLayer,
* but is optimized with MKL cblas packed gemm.
* More details:
* https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
*/
class MKLPackedRecurrentLayer : public RecurrentLayer {
public:
explicit MKLPackedRecurrentLayer(const LayerConfig& config)
: RecurrentLayer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void backward(const UpdateCallback& callback) override;
protected:
void forwardBatch(int batchSize,
size_t numSequences,
const int* starts) override;
void backwardBatch(int batchSize,
size_t numSequences,
const int* starts) override;
protected:
/// packed_weight_ contains the same data as
/// RecurrentLayer::weight_, but stored in packed format
std::unique_ptr<MKLPackedWeight> packed_weight_;
/// packed_weightT_ is the packed transpose of the same weight
std::unique_ptr<MKLPackedWeight> packed_weightT_;
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/math/MathFunctions.h"
#include "paddle/parameter/Parameter.h"
#include "paddle/parameter/Weight.h"
namespace paddle {
class MKLPackedWeight {
protected:
/// Pointer to the original weight data
real *weight_;
/// Pointer to the cblas-packed copy of the weight
real *packedWeight_;
size_t height_;
size_t width_;
bool transW_;
public:
explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
packedWeight_ = nullptr;
weight_ = weight->getData();
height_ = weight->getHeight();
width_ = weight->getWidth();
transW_ = transW;
}
~MKLPackedWeight() { free_(); }
void pack() { pack_(weight_); }
void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
cblas_sgemm_compute(CblasRowMajor,
CblasNoTrans,
CblasPacked,
src->getHeight(),
transW_ ? height_ : width_,
transW_ ? width_ : height_,
src->getData(),
src->getWidth(),
packedWeight_,
width_,
1.0,
dst->getData(),
dst->getWidth());
}
protected:
void pack_(real *src) {
if (!packedWeight_) {
packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
}
cblas_sgemm_pack(CblasRowMajor,
CblasBMatrix,
transW_ ? CblasTrans : CblasNoTrans,
1,
transW_ ? height_ : width_,
transW_ ? width_ : height_,
1.0,
src,
width_,
packedWeight_);
}
void free_() {
if (packedWeight_) {
cblas_sgemm_free(packedWeight_);
}
}
};
} // namespace paddle
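As a rough illustration of the packed-GEMM idea behind MKLPackedWeight (pack the weight once, reuse it for many small GEMMs, and re-pack only after the weight changes), here is a standalone sketch using the same MKL routines that appear above (cblas_sgemm_alloc/pack/compute/free). The sizes and the link flag are assumptions for the example; this is not part of the patch.

// Minimal sketch, assuming MKL is available (e.g. compile with -lmkl_rt).
#include <mkl.h>
#include <vector>

int main() {
  const int M = 4;  // rows of A (e.g. batch size per time step)
  const int K = 8;  // width of A == height of the weight B
  const int N = 8;  // width of the weight B (layer size)
  std::vector<float> A(M * K, 1.0f), B(K * N, 0.5f), C(M * N, 0.0f);

  // Pack B once (identifier CblasBMatrix, no transpose), as pack_() does.
  float* packedB = cblas_sgemm_alloc(CblasBMatrix, M, N, K);
  cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans, M, N, K,
                   /*alpha=*/1.0f, B.data(), /*ld=*/N, packedB);

  // Reuse the packed B for many products, as gemm_compute() does per batch.
  for (int step = 0; step < 10; ++step) {
    cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked, M, N, K,
                        A.data(), /*lda=*/K, packedB, /*ldb=*/N,
                        /*beta=*/1.0f, C.data(), /*ldc=*/N);
  }
  cblas_sgemm_free(packedB);
  return 0;
}

This mirrors why MKLPackedRecurrentLayer::backward() calls pack() again: once the weight is updated, the packed buffer has to be rebuilt before the next forward pass.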
...@@ -12,119 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include "Layer.h"
#include "SequenceToBatch.h"
#include "paddle/utils/Stat.h"
#include "RecurrentLayer.h"
DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
namespace paddle {
/**
* @brief RecurrentLayer takes 1 input layer. The output size is the same with
* input layer.
* For each sequence [start, end] it performs the following computation:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\
* out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
*
* \f]
* If reversed is true, the order is reversed:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\
* out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
* \f]
* There are two methods to calculate rnn. One way is to compute rnn one
* sequence by one sequence. The other way is to reorganize the input
* into batches, then compute rnn one batch by one batch. Users can select
* them by rnn_use_batch flag.
*/
class RecurrentLayer : public Layer {
public:
explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override;
void resetState() override;
void setState(LayerStatePtr state) override;
LayerStatePtr getState() override;
protected:
/**
* @brief If the user does not set --rnn_use_batch=true, the rnn
* forward pass is computed one sequence at a time by default.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void forwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn forward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the words
* number of this sequence.
*/
void forwardOneSequence(int start, int length);
/**
* @brief Compute rnn backward one sequence at a time.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void backwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn backward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the words
* number of this sequence.
*/
void backwardOneSequence(int start, int length);
/**
* @brief Reorganize input into batches and compute rnn forward batch
* by batch. It will convert batch shape to sequence after finishing forward.
* The batch info can refer to SequenceToBatch class.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void forwardBatch(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Reorganize input into batches and compute rnn forward batch
* by batch.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void backwardBatch(int batchSize, size_t numSequences, const int* starts);
protected:
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> bias_;
/// frameOutput_[i] is used to hold the i-th sample of output_
std::vector<Argument> frameOutput_;
MatrixPtr prevOutput_;
/// Whether compute rnn by reverse.
bool reversed_;
/// If compute batch by batch, batchValue_ will be used to save the
/// reorganized input value.
std::unique_ptr<SequenceToBatch> batchValue_;
/// If compute batch by batch, batchGrad_ will be used to save the
/// gradient with respect to reorganized input value.
std::unique_ptr<SequenceToBatch> batchGrad_;
};
REGISTER_LAYER(recurrent, RecurrentLayer);
bool RecurrentLayer::init(const LayerMap& layerMap,
...@@ -260,7 +153,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) {
bias_->getWGrad()->collectBias(*output_.grad, 1);
bias_->getParameterPtr()->incUpdate(callback);
}
weight_->getParameterPtr()->incUpdate(callback);
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <gflags/gflags.h>
#include "Layer.h"
#include "SequenceToBatch.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief RecurrentLayer takes 1 input layer. The output size is the same with
* input layer.
* For each sequence [start, end] it performs the following computation:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\
* out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
*
* \f]
* If reversed is true, the order is reversed:
* \f[
* out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\
* out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
* \f]
* There are two methods to calculate rnn. One way is to compute rnn one
* sequence by one sequence. The other way is to reorganize the input
* into batches, then compute rnn one batch by one batch. Users can select
* them by rnn_use_batch flag.
*/
class RecurrentLayer : public Layer {
public:
explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override;
void resetState() override;
void setState(LayerStatePtr state) override;
LayerStatePtr getState() override;
protected:
/**
* @brief If the user does not set --rnn_use_batch=true, the rnn
* forward pass is computed one sequence at a time by default.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void forwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn forward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the words
* number of this sequence.
*/
void forwardOneSequence(int start, int length);
/**
* @brief Compute rnn backward one sequence at a time.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
void backwardSequence(int batchSize, size_t numSequences, const int* starts);
/**
* @brief Compute rnn backward by one sequence.
* @param start The start position of this sequence (or sample).
* @param length The length of this sequence (or sample), namely the words
* number of this sequence.
*/
void backwardOneSequence(int start, int length);
/**
* @brief Reorganize input into batches and compute rnn forward batch
* by batch. It will convert batch shape to sequence after finishing forward.
* The batch info can refer to SequenceToBatch class.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
virtual void forwardBatch(int batchSize,
size_t numSequences,
const int* starts);
/**
* @brief Reorganize input into batches and compute rnn forward batch
* by batch.
* @param batchSize Total words number of all samples in this batch.
* @param numSequences The sample number.
* @param starts Each start position of each samples.
*/
virtual void backwardBatch(int batchSize,
size_t numSequences,
const int* starts);
protected:
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> bias_;
/// frameOutput_[i] is used to hold the i-th sample of output_
std::vector<Argument> frameOutput_;
MatrixPtr prevOutput_;
/// Whether compute rnn by reverse.
bool reversed_;
/// If compute batch by batch, batchValue_ will be used to save the
/// reorganized input value.
std::unique_ptr<SequenceToBatch> batchValue_;
/// If compute batch by batch, batchGrad_ will be used to save the
/// gradient with respect to reorganized input value.
std::unique_ptr<SequenceToBatch> batchGrad_;
};
} // namespace paddle
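To make the recurrence in the class comment concrete, here is a standalone sketch (illustration only, not the layer's implementation) of out_0 = act(in_0) and out_i = act(in_i + out_{i-1} * W) over one sequence, using tanh as a stand-in activation and made-up sizes.

// Standalone sketch of the recurrence described above (non-reversed case).
#include <cmath>
#include <vector>

int main() {
  const int size = 3, steps = 4;
  std::vector<float> W(size * size, 0.1f);    // size x size recurrent weight
  std::vector<float> in(steps * size, 1.0f);  // steps x size input
  std::vector<float> out(steps * size, 0.0f);
  for (int i = 0; i < steps; ++i) {
    for (int j = 0; j < size; ++j) {
      float pre = in[i * size + j];
      if (i > 0)  // add out_{i-1} * W for every step except the first
        for (int k = 0; k < size; ++k)
          pre += out[(i - 1) * size + k] * W[k * size + j];
      out[i * size + j] = std::tanh(pre);
    }
  }
  return 0;
}

The batch path (forwardBatch/backwardBatch) computes the same values; it only reorders the sequences so that each step multiplies a whole batch of rows against W at once.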
...@@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) {
for (auto reversed : {false, true}) {
config.layerConfig.set_reversed(reversed);
config.testState = !reversed;
testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
testLayerGrad(
config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
}
}
}
...@@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) {
for (auto reversed : {false, true}) {
config.layerConfig.set_reversed(reversed);
config.testState = !reversed;
testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
testLayerGrad(
config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
}
}
for (auto useGpu : {true}) {
......
...@@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) {
#define protected public
#include "paddle/gserver/layers/GatedRecurrentLayer.h"
#include "paddle/gserver/layers/LstmLayer.h"
#include "paddle/gserver/layers/RecurrentLayer.h"
template <class T>
class TestRecurrentLayer {
public:
...@@ -420,12 +421,151 @@ TEST(Layer, LstmLayer) {
}
}
#ifdef PADDLE_WITH_MKLML
#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h"
LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
bool reversed,
int layerSize,
LayerPtr dataLayer,
ParameterPtr para,
ParameterPtr bias = nullptr) {
LayerMap layerMap;
ParameterMap parameterMap;
layerMap[dataLayer->getName()] = dataLayer;
parameterMap[para->getName()] = para;
if (bias) {
parameterMap[bias->getName()] = bias;
layerConfig.set_bias_parameter_name("bias_0");
}
layerConfig.set_size(layerSize);
layerConfig.set_reversed(reversed);
layerConfig.add_inputs();
LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
input.set_input_layer_name("layer_0");
input.set_input_parameter_name("para_0");
LayerPtr testLayer = Layer::create(layerConfig);
layerMap[testLayer->getName()] = testLayer;
testLayer->init(layerMap, parameterMap);
testLayer->setNeedGradient(true);
return testLayer;
}
void checkMKLPackedLayer(LayerConfig layerConfig1,
LayerConfig layerConfig2,
bool reversed,
int layerSize,
int batchSize,
bool useBatch1,
bool useBatch2) {
LayerPtr dataLayer;
ParameterPtr para, bias;
if (layerConfig1.type() == "recurrent") {
dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
para = creatParameter("para_0", 0, layerSize * layerSize, false);
bias = nullptr;
} else if (layerConfig1.type() == "gated_recurrent") {
dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
}
LayerPtr testLayer1 = initMKLPackedLayer(
layerConfig1, reversed, layerSize, dataLayer, para, bias);
LayerPtr testLayer2 = initMKLPackedLayer(
layerConfig2, reversed, layerSize, dataLayer, para, bias);
const VectorPtr& weightGrad =
(testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
CpuVector wgt_grad1(weightGrad->getSize());
CpuVector wgt_grad2(weightGrad->getSize());
CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
for (int i = 0; i < 2; i++) {
FLAGS_rnn_use_batch = useBatch1;
testLayer1->forward(PASS_GC);
FLAGS_rnn_use_batch = useBatch2;
testLayer2->forward(PASS_GC);
testLayer1->getOutputGrad()->randomizeUniform();
testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
weightGrad->zero();
inputGrad->zero();
FLAGS_rnn_use_batch = useBatch1;
testLayer1->backward(nullptr);
wgt_grad1.copyFrom(*weightGrad);
input_grad1.copyFrom(*inputGrad);
weightGrad->zero();
inputGrad->zero();
FLAGS_rnn_use_batch = useBatch2;
testLayer2->backward(nullptr);
wgt_grad2.copyFrom(*weightGrad);
input_grad2.copyFrom(*inputGrad);
checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
checkError(wgt_grad1, wgt_grad2);
checkError(input_grad1, input_grad2);
}
}
TEST(MKLPackedLayer, RecurrentLayer) {
LayerConfig layerConfig1;
LayerConfig layerConfig2;
layerConfig1.set_name("paddle-rnn");
layerConfig1.set_type("recurrent");
layerConfig1.set_active_type("relu");
layerConfig2.set_name("mkl-packed-rnn");
layerConfig2.set_type("mkl_packed_recurrent");
layerConfig2.set_active_type("relu");
FLAGS_use_gpu = false;
for (auto layerSize : {32, 64, 128, 256, 512}) {
for (auto batchSize : {1, 5, 100, 500}) {
for (auto reversed : {true, false}) {
for (auto paddle_use_batch : {true, false}) {
for (auto MKLPacked_use_batch : {true, false}) {
LOG(INFO) << " layerSize=" << layerSize
<< " batchSize=" << batchSize << " reversed=" << reversed
<< " paddle_use_batch=" << paddle_use_batch
<< " MKLPacked_use_batch=" << MKLPacked_use_batch;
checkMKLPackedLayer(layerConfig1,
layerConfig2,
reversed,
layerSize,
batchSize,
paddle_use_batch,
MKLPacked_use_batch);
}
}
}
}
}
}
#endif
int main(int argc, char** argv) {
if (version::isWithGpu()) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
} else {
return 0;
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
if (!version::isWithGpu()) {
testing::GTEST_FLAG(filter) = "-Layer.*";
}
return RUN_ALL_TESTS();
}
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET)
...@@ -48,6 +49,10 @@ function(op_library TARGET)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif()
list(LENGTH op_library_DEPS op_library_DEPS_len)
if (${op_library_DEPS_len} GREATER 0)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
...@@ -56,106 +61,28 @@ function(op_library TARGET)
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
# net_op doesn't need pybind
if ("${TARGET}" STREQUAL "net_op")
set(pybind_flag 1)
endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# conv_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "conv_cudnn_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
endif()
# pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif()
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
endif()
if ("${TARGET}" STREQUAL "logical_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
endif()
# pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif()
# conv_transpose_op contains several operators
if ("${TARGET}" STREQUAL "conv_transpose_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# conv_transpose_cudnn_op contains two operators
if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
endif()
# save_restore_op contains several operators
if ("${TARGET}" STREQUAL "save_restore_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
endif()
# activation_op contains several operators
if ("${TARGET}" STREQUAL "activation_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
endif()
# nccl_op contains several operators
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()
# reduce_op contains several operators
if ("${TARGET}" STREQUAL "reduce_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
endif()
if ("${TARGET}" STREQUAL "tensor_array_read_write_op") # The registration of USE_OP, please refer to paddle/framework/op_registry.h.
set(pybind_flag 1) # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n") # And for detail pybind information, please see generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
endif()
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
...@@ -166,7 +93,6 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
...@@ -181,58 +107,31 @@ endfunction()
add_subdirectory(math)
add_subdirectory(nccl)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
set(DEPS_OPS
cond_op
cross_entropy_op
recurrent_op
softmax_with_cross_entropy_op
softmax_op
sequence_softmax_op
sum_op
pool_op
maxout_op
unpool_op
pool_with_index_op
conv_op
conv_transpose_op
nccl_op
sequence_conv_op
sequence_pool_op
lod_rank_table_op
lod_tensor_to_array_op
array_to_lod_tensor_op
max_sequence_len_op
lstm_op
tensor_array_read_write_op
gru_op
adagrad_op
sgd_op
save_op
load_op
send_op
recv_op)
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
send_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
set_source_files_properties(
recv_op.cc
PROPERTIES
COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
endif()
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cond_op DEPS framework_proto tensor net_op)
op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
op_library(detection_output_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor)
...@@ -242,21 +141,17 @@ op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting)
op_library(unpool_op DEPS unpooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_rank_table_op DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table)
op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
endif()
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
op_library(recurrent_op DEPS executor)
op_library(cos_sim_op DEPS cos_sim_functor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
...@@ -265,9 +160,10 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
...@@ -276,6 +172,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
...@@ -105,48 +105,18 @@ struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
// 1. g_m.rows = set(g.rows)
auto grad_rows = grad.rows();
std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto grad_width = grad.value().dims()[1];
math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto& merge_rows = grad_merge.rows();
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
std::unique_ptr<framework::SelectedRows> grad_merge{
new framework::SelectedRows()};
grad_merge->set_rows(merge_rows);
grad_merge->set_height(grad.height());
grad_merge->mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
auto* grad_data = grad.value().data<T>();
for (size_t i = 0; i < grad_rows.size(); i++) {
size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]);
for (int64_t j = 0; j < grad_width; j++) {
grad_merge_data[grad_merge_i * grad_width + j] +=
grad_data[i * grad_width + j];
}
}
// 2. m += g_m * g_m
math::scatter::Mul<platform::CPUDeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
std::unique_ptr<framework::SelectedRows> grad_square{
new framework::SelectedRows()};
grad_square->set_rows(grad_merge->rows());
grad_square->set_height(grad_merge->height());
grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
context.GetPlace());
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, *grad_square, moment);
functor(context, grad_square, moment);
// 3. update parameter
auto* lr = learning_rate.data<T>();
......
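Both the removed hand-written loop and the new math::scatter::MergeAdd collapse duplicated rows of the sparse gradient by summing them before the update. A standalone sketch of that merge on plain arrays (the row indices and values below are made up for illustration, and std::map stands in for the SelectedRows bookkeeping):

// Merging duplicated sparse rows by summation:
// rows {3, 1, 3} with width 2 collapse to unique rows {1, 3}.
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<long> rows = {3, 1, 3};
  std::vector<float> values = {1, 2,   // row 3
                               3, 4,   // row 1
                               5, 6};  // row 3 again
  const int width = 2;
  std::map<long, std::vector<float>> merged;  // ordered unique rows
  for (size_t i = 0; i < rows.size(); ++i) {
    auto& acc = merged[rows[i]];
    acc.resize(width, 0.f);
    for (int j = 0; j < width; ++j) acc[j] += values[i * width + j];
  }
  for (auto& kv : merged)
    std::printf("row %ld: %g %g\n", kv.first, kv.second[0], kv.second[1]);
  // prints: row 1: 3 4   and   row 3: 6 8
  return 0;
}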
...@@ -78,62 +78,30 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
// 1. g_m.rows = set(g.rows)
auto grad_rows = grad.rows();
std::set<int64_t> row_set(grad_rows.begin(), grad_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto grad_width = grad.value().dims()[1];
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows();
std::unique_ptr<framework::SelectedRows> grad_merge{
new framework::SelectedRows()};
grad_merge->set_rows(merge_rows);
grad_merge->set_height(grad.height());
grad_merge->mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);
auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
auto* grad_data = grad.value().data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid1(1, grad_rows.size());
MergeGradKernel<
T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_data, grad.rows().data(),
grad_merge_data, grad_merge->rows().data(),
grad_merge->rows().size(), grad_width);
// 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
std::unique_ptr<framework::SelectedRows> grad_square{
new framework::SelectedRows()};
grad_square->set_rows(grad_merge->rows());
grad_square->set_height(grad_merge->height());
grad_square->mutable_value()->mutable_data<T>(grad_merge->value().dims(),
context.GetPlace());
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
gs.device(*context.eigen_device()) = gm * gm;
math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, *grad_square, moment);
functor(context, grad_square, moment);
// 3. update parameter
auto* lr = learning_rate.data<T>();
auto* param_data = param->data<T>();
auto* moment_data = moment->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid2(1, merge_rows.size());
SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge->rows().data(),
.stream()>>>(grad_merge_data, grad_merge.rows().data(),
lr, param_data, moment_data, grad_width,
epsilon);
}
......
...@@ -16,11 +16,14 @@ limitations under the License. */
#include <math.h> // for sqrt in CPU and CUDA
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
#include "paddle/operators/math/selected_rows_functor.h"
#include "paddle/platform/for_range.h"
namespace paddle {
namespace operators {
namespace scatter = paddle::operators::math::scatter;
template <typename T>
struct AdamFunctor {
T beta1_;
...@@ -79,6 +82,69 @@ struct AdamFunctor {
}
};
template <typename T>
struct SparseAdamFunctor {
T beta1_;
T beta2_;
T epsilon_;
const T* beta1_pow_;
const T* beta2_pow_;
const T* moment1_;
T* moment1_out_;
const T* moment2_;
T* moment2_out_;
const T* lr_;
const T* grad_;
const T* param_;
T* param_out_;
const int64_t* rows_;
int64_t row_numel_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows,
int64_t row_numel)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
beta1_pow_(beta1_pow),
beta2_pow_(beta2_pow),
moment1_(mom1),
moment1_out_(mom1_out),
moment2_(mom2),
moment2_out_(mom2_out),
lr_(lr),
grad_(grad),
param_(param),
param_out_(param_out),
rows_(rows),
row_numel_(row_numel) {}
inline HOSTDEVICE void operator()(size_t i) const {
T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_;
for (int64_t j = 0; j < row_numel_; ++j) {
T g = grad_[i * row_numel_ + j];
T mom1 = moment1_[rows_[i] * row_numel_ + j];
T mom2 = moment2_[rows_[i] * row_numel_ + j];
T lr = *lr_;
T p = param_[rows_[i] * row_numel_ + j];
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
mom1 = beta1_ * mom1 + (1 - beta1_) * g;
mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
moment1_out_[rows_[i] * row_numel_ + j] = mom1;
moment2_out_[rows_[i] * row_numel_ + j] = mom2;
param_out_[rows_[i] * row_numel_ + j] = p;
} // for col id
}
};
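Written out as equations, the per-row update that SparseAdamFunctor applies above is (g is one gradient row, m1/m2 the moments, p the parameter, and t the step encoded by beta1_pow/beta2_pow):

\f[
\hat{lr} = lr \cdot \frac{\sqrt{1 - \beta_2^{t}}}{1 - \beta_1^{t}}, \qquad
m_1 \leftarrow \beta_1 m_1 + (1 - \beta_1)\, g, \qquad
m_2 \leftarrow \beta_2 m_2 + (1 - \beta_2)\, g^2, \qquad
p \leftarrow p - \hat{lr}\, \frac{m_1}{\sqrt{m_2} + \epsilon}
\f]

This is the same arithmetic as the dense AdamFunctor, applied only to the rows present in the merged sparse gradient.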
template <typename DeviceContext, typename T>
class AdamOpKernel : public framework::OpKernel<T> {
public:
...@@ -90,7 +156,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
// auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
auto* grad_var = ctx.InputVar("Grad");
auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
auto& lr =
...@@ -108,18 +175,48 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& mom2_out =
Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
AdamFunctor<T> functor(beta1, beta2, epsilon, beta1_pow.template data<T>(),
beta2_pow.template data<T>(),
mom1.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2.template data<T>(),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad.template data<T>(),
param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()));
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()), param.numel());
for_range(functor);
if (grad_var->IsType<framework::LoDTensor>()) {
auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
AdamFunctor<T> functor(
beta1, beta2, epsilon, beta1_pow.template data<T>(),
beta2_pow.template data<T>(), mom1.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2.template data<T>(),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad.template data<T>(),
param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()));
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad =
Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
// merge duplicated rows if any.
scatter::MergeAdd<DeviceContext, T> merge_func;
auto grad_merge =
merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data();
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor(
beta1, beta2, epsilon, beta1_pow.template data<T>(),
beta2_pow.template data<T>(), mom1.template data<T>(),
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2.template data<T>(),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
grad_merge.rows().size());
for_range(functor);
} else {
PADDLE_THROW("Variable type not supported by adam_op");
}
}
};
......
...@@ -35,8 +35,8 @@ class ArrayOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
size_t offset;
if (platform::is_gpu_place(i_tensor.place())) {
......
...@@ -106,8 +106,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
}
auto slice = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
dev_ctx, &slice);
......
...@@ -82,8 +82,8 @@ class AssignOp : public framework::OperatorBase {
out != nullptr,
"The Output(Out) should not be null if the Input(X) is set.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
auto &dev_ctx = *pool.Borrow(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
}
......
...@@ -50,10 +50,6 @@ class BatchNormOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
const float epsilon = ctx->Attrs().Get<float>("epsilon");
PADDLE_ENFORCE_GE(epsilon, 0.0, "epsilon should be larger than 0");
PADDLE_ENFORCE_LE(epsilon, 0.001, "epsilon should not be too large");
// make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
"Mean and MeanOut should share the same memory");
...@@ -91,7 +87,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<bool>("is_test", "").SetDefault(false);
AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "").SetDefault(1e-5);
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001.");
});
AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
AddInput("X", "The input tensor");
AddInput("Scale",
......
...@@ -57,8 +57,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto& dev_ctx = *pool.Borrow(dev_place);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(dev_place);
framework::ExecutionContext ctx(*this, scope, dev_ctx);
......
...@@ -195,8 +195,8 @@ void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
void CondOp::Run(const Scope& scope, const platform::Place& place) const {
// get device context from pool
platform::DeviceContextPool& pool = platform::DeviceContextPool::Get();
auto& dev_ctx = *pool.Borrow(place);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& dev_ctx = *pool.Get(place);
PrepareDataForSubnet(scope, dev_ctx);
std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
......
...@@ -315,6 +315,10 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace,
paddle::operators::CudnnConvOpKernel<float>,
paddle::operators::CudnnConvOpKernel<double>);
REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
paddle::operators::CudnnConvOpKernel<float>,
paddle::operators::CudnnConvOpKernel<double>);
......
...@@ -31,8 +31,6 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
int groups = ctx->Attrs().Get<int>("groups");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
int input_channels = in_dims[1];
int output_channels = filter_dims[0];
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"Conv intput should be 4-D or 5-D tensor."); "Conv intput should be 4-D or 5-D tensor.");
...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -45,9 +43,13 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddings.size(), strides.size(), paddings.size(), strides.size(),
"Conv paddings dimension and Conv strides dimension should be the same."); "Conv paddings dimension and Conv strides dimension should be the same.");
int input_channels = in_dims[1];
PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups, PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
"The number of input channels should be equal to filter " "The number of input channels should be equal to filter "
"channels * groups."); "channels * groups.");
int output_channels = filter_dims[0];
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
output_channels % groups, 0, output_channels % groups, 0,
"The number of output channels should be divided by groups."); "The number of output channels should be divided by groups.");
......
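A concrete instance of the two channel checks moved above (illustrative numbers, not taken from this commit): with input shape $[N, C_{in}, H, W] = [8, 16, 32, 32]$, groups $= 4$, and filter shape $[C_{out}, C_{in}/\text{groups}, k_h, k_w] = [32, 4, 3, 3]$, the first enforce holds because $16 = 4 \times 4$, and the second because $32 \bmod 4 = 0$.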
...@@ -13,19 +13,15 @@ See the License for the specific language governing permissions and ...@@ -13,19 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/cos_sim_functor.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class CosSimKernel : public framework::OpKernel<T> { class CosSimKernel : public framework::OpKernel<T> {
...@@ -41,28 +37,25 @@ class CosSimKernel : public framework::OpKernel<T> { ...@@ -41,28 +37,25 @@ class CosSimKernel : public framework::OpKernel<T> {
out_x_norm->mutable_data<T>(context.GetPlace()); out_x_norm->mutable_data<T>(context.GetPlace());
out_y_norm->mutable_data<T>(context.GetPlace()); out_y_norm->mutable_data<T>(context.GetPlace());
// convert Tensor to Eigen Tensor
int rows_x = in_x->dims()[0]; int rows_x = in_x->dims()[0];
int rows_y = in_y->dims()[0]; int rows_y = in_y->dims()[0];
auto x = EigenMatrix<T>::Reshape(*in_x, 1);
auto y = EigenMatrix<T>::Reshape(*in_y, 1);
auto z = EigenVector<T>::Flatten(*out_z);
auto x_norm = EigenVector<T>::Flatten(*out_x_norm);
auto y_norm = EigenVector<T>::Flatten(*out_y_norm);
// compute int cols = framework::product(in_x->dims()) / rows_x;
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto row_along = Eigen::array<int, 1>({{1}});
x_norm.device(place) = x.square().sum(row_along).sqrt();
y_norm.device(place) = y.square().sum(row_along).sqrt();
if (rows_x == rows_y) { if (rows_x == rows_y) {
auto xy = (x * y).sum(Eigen::array<int, 1>({{1}})); math::CosSimFunctor<T, true> functor(
z.device(place) = xy / x_norm / y_norm; in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
out_y_norm->data<T>(), out_z->data<T>(), cols);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(context.device_context()), rows_x);
for_range(functor);
} else { } else {
Eigen::DSizes<int, 2> bcast(rows_x, 1); math::CosSimFunctor<T, false> functor(
auto xy = (x * y.broadcast(bcast)).sum(row_along); in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
z.device(place) = xy / x_norm / y_norm.broadcast(bcast); out_y_norm->data<T>(), out_z->data<T>(), cols);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(context.device_context()), rows_x);
for_range(functor);
} }
} }
}; };
...@@ -81,62 +74,54 @@ class CosSimGradKernel : public framework::OpKernel<T> { ...@@ -81,62 +74,54 @@ class CosSimGradKernel : public framework::OpKernel<T> {
auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y")); auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out")); auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
// convert Tensor to Eigen Tensor
auto x = EigenMatrix<T>::Reshape(*in_x, 1);
auto y = EigenMatrix<T>::Reshape(*in_y, 1);
auto z = EigenMatrix<T>::Reshape(*in_z, 1);
auto x_norm = EigenMatrix<T>::Reshape(*in_x_norm, 1);
auto y_norm = EigenMatrix<T>::Reshape(*in_y_norm, 1);
auto dz = EigenMatrix<T>::Reshape(*in_grad_z, 1);
// compute gradient // compute gradient
int rows_x = in_x->dims()[0]; int rows_x = in_x->dims()[0];
int rows_y = in_y->dims()[0]; int rows_y = in_y->dims()[0];
int cols = framework::product(in_x->dims()) / rows_x; int cols = framework::product(in_x->dims()) / rows_x;
Eigen::DSizes<int, 2> bcast_cols(1, cols);
auto z_bcast = z.broadcast(bcast_cols);
auto dz_bcast = dz.broadcast(bcast_cols);
auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
if (rows_x == rows_y) { if (rows_x == rows_y) {
auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
// compute dx
if (out_grad_x) { if (out_grad_x) {
out_grad_x->mutable_data<T>(context.GetPlace()); math::CosSimGradFunctor<T> functor(
auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1); in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast; in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
dx.device(place) = dz_bcast * grad; out_grad_x->mutable_data<T>(context.GetPlace()), cols);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(context.device_context()),
rows_x);
for_range(functor);
} }
// compute dy
if (out_grad_y) { if (out_grad_y) {
out_grad_y->mutable_data<T>(context.GetPlace()); math::CosSimGradFunctor<T> functor(
auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1); in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast; in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
dy.device(place) = dz_bcast * grad; out_grad_y->mutable_data<T>(context.GetPlace()), cols);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(context.device_context()),
rows_x);
for_range(functor);
} }
} else { } else {
Eigen::DSizes<int, 2> bcast_rows(rows_x, 1);
Eigen::DSizes<int, 2> bcast_rows_cols(rows_x, cols);
auto y_bcast = y.broadcast(bcast_rows);
auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols);
auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows))
.eval()
.broadcast(bcast_cols);
// compute dx
if (out_grad_x) { if (out_grad_x) {
out_grad_x->mutable_data<T>(context.GetPlace()); math::CosSimDxFunctor<T> functor(
auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1); in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast; in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
dx.device(place) = dz_bcast * grad; out_grad_x->mutable_data<T>(context.GetPlace()), cols);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(context.device_context()),
rows_x);
for_range(functor);
} }
// compute dy
if (out_grad_y) { if (out_grad_y) {
out_grad_y->mutable_data<T>(context.GetPlace()); out_grad_y->mutable_data<T>(context.GetPlace());
auto dy = EigenVector<T>::Flatten(*out_grad_y); math::SetConstant<DeviceContext, T> set_zero;
auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast; auto& dev_ctx = context.template device_context<DeviceContext>();
dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({{0}})); set_zero(dev_ctx, out_grad_y, static_cast<T>(0));
math::CosSimDyFunctor<DeviceContext, T> functor;
functor(dev_ctx, in_x_norm->data<T>(), in_y_norm->data<T>(),
in_x->data<T>(), in_y->data<T>(), in_z->data<T>(),
in_grad_z->data<T>(), static_cast<size_t>(rows_x),
static_cast<size_t>(cols), out_grad_y->data<T>());
} }
} }
} }
......
...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -114,15 +114,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", AddInput("X",
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, " "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
"where N is the batch size and D is the number of classes. " " where N is the batch size and D is the number of classes. "
"This input is a probability computed by the previous operator, " "This input is a probability computed by the previous operator, "
"which is almost always the result of a softmax operator."); "which is almost always the result of a softmax operator.");
AddInput("Label", AddInput("Label",
"(Tensor), the ground truth which is a 2-D tensor. When " "(Tensor), the ground truth which is a 2-D tensor. When "
"soft_label is set to false, Label is a Tensor<int64> with shape " "soft_label is set to false, Label is a Tensor<int64> with shape "
"[N x 1]. When soft_label is set to true, Label is a " "[N x 1]. When soft_label is set to true, Label is a "
"Tensor<float/double> with shape [N x K]."); "Tensor<float/double> with shape [N x D].");
AddOutput("Y", AddOutput("Y",
"(Tensor, default Tensor<float>), a 2-D tensor with shape " "(Tensor, default Tensor<float>), a 2-D tensor with shape "
"[N x 1]. The cross entropy loss."); "[N x 1]. The cross entropy loss.");
......
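For reference, the loss these doc strings describe (a standard cross-entropy sketch, not text from this commit): given predicted probabilities $X \in \mathbb{R}^{N \times D}$, the hard-label case (Label of shape $[N \times 1]$ with entries $l_i \in \{0, \dots, D-1\}$) computes

$$Y_i = -\log X_{i,\,l_i},$$

while the soft-label case (Label of shape $[N \times D]$ with rows summing to one) computes

$$Y_i = -\sum_{d=1}^{D} L_{i,d}\,\log X_{i,d},$$

both yielding the $[N \times 1]$ output Y mentioned above.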
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/detection_output_op.h"
namespace paddle {
namespace operators {
class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
public:
DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Loc",
"(Tensor) The input tensor of detection_output operator."
"The input predict locations"
"The format of input tensor is kNCHW. Where K is priorbox point "
"numbers,"
"N is How many boxes are there on each point, "
"C is 4, H and W both are 1.");
AddInput("Conf",
"(Tensor) The input tensor of detection_output operator."
"The input priorbox confidence."
"The format of input tensor is kNCHW. Where K is priorbox point "
"numbers,"
"N is How many boxes are there on each point, "
"C is the number of classes, H and W both are 1.");
AddInput("PriorBox",
"(Tensor) The input tensor of detection_output operator."
"The format of input tensor is the position and variance "
"of the boxes");
AddOutput("Out",
"(Tensor) The output tensor of detection_output operator.");
AddAttr<int>("background_label_id", "(int), The background class index.");
AddAttr<int>("num_classes", "(int), The number of the classification.");
AddAttr<float>("nms_threshold",
"(float), The Non-maximum suppression threshold.");
AddAttr<float>("confidence_threshold",
"(float), The classification confidence threshold.");
AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
AddAttr<int>("nms_top_k",
"(int), The bbox number kept of the NMS’s output.");
AddComment(R"DOC(
Detection output for SSD (Single Shot MultiBox Detector).
Applies NMS to the network output and computes the predicted
bounding box locations. The output of this layer may be empty
if there is no valid bounding box.
)DOC");
}
};
class DetectionOutputOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Loc"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Conf"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
"Input(X) of DetectionOutputOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of DetectionOutputOp should not be null.");
std::vector<int64_t> output_shape({1, 7});
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
ops::DetectionOutputOpMaker);
REGISTER_OP_CPU_KERNEL(
detection_output,
ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/detection_output_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
detection_output,
ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>,
ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor.h"
#include "paddle/operators/math/detection_util.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/softmax.h"
#include "paddle/operators/strided_memcpy.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
inline void transpose_fun(const framework::ExecutionContext& context,
const framework::Tensor& src,
framework::Tensor* dst) {
int input_nums = src.dims()[0];
int offset = 0;
for (int j = 0; j < input_nums; ++j) {
framework::Tensor in_p_tensor = src.Slice(j, j + 1);
std::vector<int64_t> shape_vec(
{in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3],
in_p_tensor.dims()[4], in_p_tensor.dims()[2]});
framework::DDim shape(framework::make_ddim(shape_vec));
framework::Tensor in_p_tensor_transpose;
in_p_tensor_transpose.mutable_data<T>(shape, context.GetPlace());
std::vector<int> shape_axis({0, 1, 3, 4, 2});
math::Transpose<DeviceContext, T, 5> trans5;
trans5(context.template device_context<DeviceContext>(), in_p_tensor,
&in_p_tensor_transpose, shape_axis);
auto dst_stride = framework::stride(dst->dims());
auto src_stride = framework::stride(in_p_tensor_transpose.dims());
StridedMemcpy<T>(context.device_context(), in_p_tensor_transpose.data<T>(),
src_stride, in_p_tensor_transpose.dims(), dst_stride,
dst->data<T>() + offset);
offset += in_p_tensor_transpose.dims()[4] * src_stride[4];
}
}
template <typename DeviceContext, typename T>
class DetectionOutputKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
const framework::Tensor* in_priorbox =
context.Input<framework::Tensor>("PriorBox");
auto* out = context.Output<framework::Tensor>("Out");
int num_classes = context.template Attr<int>("num_classes");
int top_k = context.template Attr<int>("top_k");
int nms_top_k = context.template Attr<int>("nms_top_k");
int background_label_id = context.template Attr<int>("background_label_id");
float nms_threshold = context.template Attr<float>("nms_threshold");
float confidence_threshold =
context.template Attr<float>("confidence_threshold");
size_t batch_size = in_conf->dims()[1];
int conf_sum_size = in_conf->numel();
// for softmax
std::vector<int64_t> conf_shape_softmax_vec(
{conf_sum_size / num_classes, num_classes});
framework::DDim conf_shape_softmax(
framework::make_ddim(conf_shape_softmax_vec));
// for knchw => nhwc
std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
in_loc->dims()[4],
in_loc->dims()[2] * in_loc->dims()[0]});
std::vector<int64_t> conf_shape_vec(
{1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
in_conf->dims()[2] * in_conf->dims()[0]});
framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
framework::Tensor loc_tensor;
framework::Tensor conf_tensor;
loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
// for cpu
framework::Tensor loc_cpu;
framework::Tensor conf_cpu;
framework::Tensor priorbox_cpu;
const T* priorbox_data = in_priorbox->data<T>();
transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
conf_tensor.Resize(conf_shape_softmax);
math::SoftmaxFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &conf_tensor,
&conf_tensor);
T* loc_data = loc_tensor.data<T>();
T* conf_data = conf_tensor.data<T>();
if (platform::is_gpu_place(context.GetPlace())) {
loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
framework::CopyFrom(loc_tensor, platform::CPUPlace(),
context.device_context(), &loc_cpu);
loc_data = loc_cpu.data<T>();
conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
framework::CopyFrom(conf_tensor, platform::CPUPlace(),
context.device_context(), &conf_cpu);
conf_data = conf_cpu.data<T>();
priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
framework::CopyFrom(*in_priorbox, platform::CPUPlace(),
context.device_context(), &priorbox_cpu);
priorbox_data = priorbox_cpu.data<T>();
}
// get decode bboxes
size_t num_priors = in_priorbox->numel() / 8;
std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
for (size_t n = 0; n < batch_size; ++n) {
std::vector<operators::math::BBox<T>> decoded_bboxes;
for (size_t i = 0; i < num_priors; ++i) {
size_t prior_offset = i * 8;
size_t loc_pred_offset = n * num_priors * 4 + i * 4;
std::vector<math::BBox<T>> prior_bbox_vec;
math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
prior_bbox_vec);
std::vector<std::vector<T>> prior_bbox_var;
math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
prior_bbox_var);
std::vector<T> loc_pred_data;
for (size_t j = 0; j < 4; ++j)
loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
decoded_bboxes.push_back(bbox);
}
all_decoded_bboxes.push_back(decoded_bboxes);
}
std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
int num_kept = math::GetDetectionIndices<T>(
conf_data, num_priors, num_classes, background_label_id, batch_size,
confidence_threshold, nms_top_k, nms_threshold, top_k,
all_decoded_bboxes, &all_indices);
if (num_kept <= 0) {
std::vector<int64_t> out_shape_vec({0, 0});
framework::DDim out_shape(framework::make_ddim(out_shape_vec));
out->Resize(out_shape);
return;
}
std::vector<int64_t> out_shape_vec({num_kept, 7});
framework::DDim out_shape(framework::make_ddim(out_shape_vec));
out->mutable_data<T>(out_shape, context.GetPlace());
framework::Tensor out_cpu;
T* out_data = out->data<T>();
if (platform::is_gpu_place(context.GetPlace())) {
out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
out_data = out_cpu.data<T>();
}
math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
batch_size, all_indices, all_decoded_bboxes,
out_data);
if (platform::is_gpu_place(context.GetPlace())) {
framework::CopyFrom(out_cpu, platform::CUDAPlace(),
context.device_context(), out);
}
}
};
} // namespace operators
} // namespace paddle
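The decoding helpers used above (math::GetBBoxFromPriorData and math::DecodeBBoxWithVar, with 8 values stored per prior: four box coordinates plus four variances) are not part of this commit. Assuming the usual SSD center-size encoding, the decode would be roughly

$$\hat c_x = p_{c_x} + \sigma_1 l_1 p_w, \quad \hat c_y = p_{c_y} + \sigma_2 l_2 p_h, \quad \hat w = p_w e^{\sigma_3 l_3}, \quad \hat h = p_h e^{\sigma_4 l_4},$$

where $p$ is the prior box in center-size form, $l$ the four predicted offsets read from Loc, and $\sigma$ the prior variances. This is the standard SSD formulation, not a transcription of the helper's implementation.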
...@@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel { ...@@ -25,8 +25,6 @@ class DropoutOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", x_dims); ctx->SetOutputDim("Out", x_dims);
...@@ -47,7 +45,11 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -47,7 +45,11 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate(); AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
AddAttr<float>("dropout_prob", "Probability of setting units to zero.") AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
.SetDefault(.5f); .SetDefault(.5f)
.AddCustomChecker([](const float& drop_p) {
PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
"'dropout_prob' must be between 0.0 and 1.0.");
});
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0); AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
...@@ -78,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel { ...@@ -78,8 +80,6 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) must not be null."); "Input(Out@GRAD) must not be null.");
PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(x_dims, out_dims, PADDLE_ENFORCE_EQ(x_dims, out_dims,
......
...@@ -30,16 +30,15 @@ struct MaskGenerator { ...@@ -30,16 +30,15 @@ struct MaskGenerator {
__host__ __device__ MaskGenerator(AttrType dropout_prob, int seed) __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
: dropout_prob(dropout_prob), seed(seed) {} : dropout_prob(dropout_prob), seed(seed) {}
__host__ __device__ T operator()(const unsigned int n) const { inline __host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng; thrust::minstd_rand rng;
rng.seed(seed); rng.seed(seed);
thrust::uniform_real_distribution<AttrType> dist(0, 1); thrust::uniform_real_distribution<AttrType> dist(0, 1);
rng.discard(n); rng.discard(n);
if (dist(rng) < dropout_prob) { if (dist(rng) < dropout_prob) {
return static_cast<T>(0); return static_cast<T>(0);
} else {
return static_cast<T>(1);
} }
return static_cast<T>(1);
} }
}; };
......
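What the MaskGenerator functor above computes, per element index $n$, with dropout probability $p$ and the op's seed (the element-wise multiply of mask and input happens elsewhere in the kernel and is not part of this hunk):

$$u_n \sim \mathrm{Uniform}(0, 1), \qquad m_n = \begin{cases} 0, & u_n < p \\ 1, & \text{otherwise.} \end{cases}$$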
...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FeedOp : public framework::OperatorBase {
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(feed_item, place, dev_ctx, out_item); framework::CopyFrom(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
......
...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase { ...@@ -52,8 +52,8 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate // FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs? // CPU outputs?
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait(); dev_ctx.Wait();
......
...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase { ...@@ -49,8 +49,8 @@ class FillConstantOp : public framework::OperatorBase {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type)); out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
} }
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value); math::set_constant(dev_ctx, &out, value);
} }
}; };
......
...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase { ...@@ -69,8 +69,9 @@ class FillOp : public framework::OperatorBase {
if (!force_cpu && platform::is_gpu_place(place)) { if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out // Copy tensor to out
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(tensor, place, dev_ctx, &out); framework::CopyFrom(tensor, place, dev_ctx, &out);
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/gru_compute.h" #include "paddle/operators/math/gru_compute.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h" #include "paddle/operators/math/sequence2batch.h"
...@@ -70,7 +71,7 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -70,7 +71,7 @@ class GRUKernel : public framework::OpKernel<T> {
} }
int frame_size = hidden_dims[1]; int frame_size = hidden_dims[1];
math::hl_gru_value<T> gru_value; math::GRUMetaValue<T> gru_value;
gru_value.gate_weight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.state_weight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
...@@ -89,6 +90,10 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -89,6 +90,10 @@ class GRUKernel : public framework::OpKernel<T> {
} }
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto active_node = math::detail::GetActivationType(
context.Attr<std::string>("activation"));
auto active_gate = math::detail::GetActivationType(
context.Attr<std::string>("gate_activation"));
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -101,9 +106,8 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -101,9 +106,8 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.gate_value = gate_t.data<T>(); gru_value.gate_value = gate_t.data<T>();
gru_value.reset_output_value = reset_hidden_prev_t.data<T>(); gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
math::GRUUnitFunctor<DeviceContext, T>::compute( math::GRUUnitFunctor<DeviceContext, T>::compute(
dev_ctx, gru_value, frame_size, cur_batch_size, dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
math::ActiveType(context.Attr<std::string>("activation")), active_gate);
math::ActiveType(context.Attr<std::string>("gate_activation")));
gru_value.prev_out_value = gru_value.output_value; gru_value.prev_out_value = gru_value.output_value;
} }
...@@ -170,12 +174,12 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -170,12 +174,12 @@ class GRUGradKernel : public framework::OpKernel<T> {
batch_hidden_grad.set_lod(batch_hidden->lod()); batch_hidden_grad.set_lod(batch_hidden->lod());
to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
math::hl_gru_value<T> gru_value; math::GRUMetaValue<T> gru_value;
gru_value.gate_weight = const_cast<T*>(weight_data); gru_value.gate_weight = const_cast<T*>(weight_data);
gru_value.state_weight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
math::hl_gru_grad<T> gru_grad; math::GRUMetaGrad<T> gru_grad;
if (weight_grad) { if (weight_grad) {
gru_grad.gate_weight_grad = gru_grad.gate_weight_grad =
weight_grad->mutable_data<T>(context.GetPlace()); weight_grad->mutable_data<T>(context.GetPlace());
...@@ -189,6 +193,10 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -189,6 +193,10 @@ class GRUGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_hidden_grad.lod()[0]; auto batch_starts = batch_hidden_grad.lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto active_node = math::detail::GetActivationType(
context.Attr<std::string>("activation"));
auto active_gate = math::detail::GetActivationType(
context.Attr<std::string>("gate_activation"));
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -219,9 +227,8 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -219,9 +227,8 @@ class GRUGradKernel : public framework::OpKernel<T> {
} }
math::GRUUnitGradFunctor<DeviceContext, T>::compute( math::GRUUnitGradFunctor<DeviceContext, T>::compute(
dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
math::ActiveType(context.Attr<std::string>("activation")), active_gate);
math::ActiveType(context.Attr<std::string>("gate_activation")));
} }
if (input_grad) { if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
......
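The gate math behind GRUUnitFunctor::compute is not shown in this hunk; assuming the standard GRU formulation (whose last step the gru_finalOutput functor later in this diff matches), with $\phi_g$ the gate_activation and $\phi_n$ the activation attribute resolved above:

$$u_t = \phi_g(W_u x_t + U_u h_{t-1}), \quad r_t = \phi_g(W_r x_t + U_r h_{t-1}), \quad \tilde h_t = \phi_n(W_c x_t + U_c (r_t \odot h_{t-1})), \quad h_t = (1 - u_t) \odot h_{t-1} + u_t \odot \tilde h_t.$$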
...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { ...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
framework::DeserializeFromStream(fin, tensor); DeserializeFromStream(fin, tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
// copy CPU to GPU // copy CPU to GPU
......
...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -88,8 +88,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
auto slice = out[i].Slice(static_cast<int>(offset), auto slice = out[i].Slice(static_cast<int>(offset),
static_cast<int>(offset + len)); static_cast<int>(offset + len));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin), framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)), static_cast<int>(each_range.end)),
......
...@@ -9,13 +9,14 @@ if(WITH_GPU) ...@@ -9,13 +9,14 @@ if(WITH_GPU)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
...@@ -23,13 +24,14 @@ else() ...@@ -23,13 +24,14 @@ else()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context) cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
endif() endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/cos_sim_functor.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
const T* y_norm, const T* x, const T* y, const T* z,
const T* dz, const size_t rows, const size_t cols,
T* dy) const {
for (size_t row_id = 0; row_id < rows; ++row_id) {
auto xy_norm_prod = x_norm[row_id] * y_norm[0];
auto dz_data = dz[row_id];
auto z_data = z[row_id];
auto* x_data = x + cols * row_id;
auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
auto y_norm_square = y_norm[0] * y_norm[0];
auto reciprocal_y_norm_square = 1 / y_norm_square;
for (size_t i = 0; i < cols; ++i) {
dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
z_data * y[i] * reciprocal_y_norm_square);
}
}
}
};
template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/cos_sim_functor.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
const T* y, const T* z, const T* dz,
const size_t rows, const size_t cols, T* dy) {
int grid_size = blockDim.x * gridDim.x;
T y_norm_data = y_norm[0];
for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows;
row_id += grid_size) {
T xy_norm_prod = x_norm[row_id] * y_norm_data;
T dz_data = dz[row_id];
T z_data = z[row_id];
const T* x_data = x + cols * row_id;
T reciprocal_xy_norm_prod = 1 / xy_norm_prod;
T y_norm_square = y_norm_data * y_norm_data;
T reciprocal_y_norm_square = 1 / y_norm_square;
for (size_t i = 0; i < cols; ++i) {
T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
z_data * y[i] * reciprocal_y_norm_square);
platform::CudaAtomicAdd(dy + i, dy_data);
}
}
}
template <typename T>
struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
const T* y_norm, const T* x, const T* y, const T* z,
const T* dz, const size_t rows, const size_t cols,
T* dy) const {
const int block_size = 512;
dim3 threads(block_size, 1);
dim3 grid(1, (rows + block_size - 1) / block_size);
CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
x_norm, y_norm, x, y, z, dz, rows, cols, dy);
}
};
template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <stdlib.h>
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, bool same_row>
struct CosSimFunctor {
CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
: x_norm_(x_norm),
y_norm_(y_norm),
x_(x),
y_(y),
z_(z),
cols_(static_cast<size_t>(cols)) {}
inline HOSTDEVICE void operator()(size_t row_id) const {
auto* x = x_ + cols_ * row_id;
T xx = 0, xy = 0, yy = 0;
if (same_row) {
auto* y = y_ + cols_ * row_id;
T tep_x, tep_y;
for (size_t i = 0; i < cols_; ++i) {
tep_x = x[i];
tep_y = y[i];
xx += tep_x * tep_x;
yy += tep_y * tep_y;
xy += tep_x * tep_y;
}
xx = sqrt(xx);
yy = sqrt(yy);
y_norm_[row_id] = yy;
x_norm_[row_id] = xx;
z_[row_id] = xy / (xx * yy);
    } else {  // This could be written in a better way.
T tep_x, tep_y;
for (size_t i = 0; i < cols_; ++i) {
tep_x = x[i];
tep_y = y_[i];
xx += tep_x * tep_x;
yy += tep_y * tep_y;
xy += tep_x * tep_y;
}
xx = sqrt(xx);
yy = sqrt(yy);
if (row_id == 0) y_norm_[0] = yy;
x_norm_[row_id] = xx;
z_[row_id] = xy / (xx * yy);
}
}
T* x_norm_;
T* y_norm_;
const T* x_;
const T* y_;
T* z_;
const size_t cols_;
};
template <typename T>
struct CosSimGradFunctor {
CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
const T* z, const T* dz, T* dx, int cols)
: x_norm_(x_norm),
y_norm_(y_norm),
x_(x),
y_(y),
z_(z),
dz_(dz),
dx_(dx),
cols_(static_cast<size_t>(cols)) {}
inline HOSTDEVICE void operator()(size_t row_id) const {
auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id];
auto dz = dz_[row_id];
auto z = z_[row_id];
auto* dx = dx_ + cols_ * row_id;
auto* x = x_ + cols_ * row_id;
auto* y = y_ + cols_ * row_id;
auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
auto reciprocal_x_norm_square = 1 / x_norm_square;
for (size_t i = 0; i < cols_; ++i) {
dx[i] = dz * (y[i] * reciprocal_xy_norm_prod -
z * x[i] * reciprocal_x_norm_square);
}
}
const T* x_norm_;
const T* y_norm_;
const T* x_;
const T* y_;
const T* z_;
const T* dz_;
T* dx_;
const size_t cols_;
};
template <typename T>
struct CosSimDxFunctor {
CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
const T* z, const T* dz, T* dx, int cols)
: x_norm_(x_norm),
y_norm_(y_norm),
x_(x),
y_(y),
z_(z),
dz_(dz),
dx_(dx),
cols_(static_cast<size_t>(cols)) {}
inline HOSTDEVICE void operator()(size_t row_id) const {
auto xy_norm_prod = x_norm_[row_id] * y_norm_[0];
auto dz = dz_[row_id];
auto z = z_[row_id];
auto* x = x_ + cols_ * row_id;
auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
auto* dx = dx_ + cols_ * row_id;
auto reciprocal_x_norm_square = 1 / x_norm_square;
for (size_t i = 0; i < cols_; ++i) {
dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod -
z * x[i] * reciprocal_x_norm_square);
}
}
const T* x_norm_;
const T* y_norm_;
const T* x_;
const T* y_;
const T* z_;
const T* dz_;
T* dx_;
const size_t cols_;
};
template <typename DeviceContext, typename T>
struct CosSimDyFunctor {
void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm,
const T* x, const T* y, const T* z, const T* dz,
const size_t rows, const size_t cols, T* dy) const;
};
} // namespace math
} // namespace operators
} // namespace paddle
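For each row $i$ (with $y$ taken per row in the same-row case, or broadcast from the single row $y_0$ otherwise), the functors above implement

$$z_i = \frac{x_i \cdot y_i}{\lVert x_i \rVert\,\lVert y_i \rVert}, \qquad \frac{\partial z_i}{\partial x_i} = \frac{y_i}{\lVert x_i \rVert\,\lVert y_i \rVert} - z_i \frac{x_i}{\lVert x_i \rVert^2}, \qquad \frac{\partial z_i}{\partial y_i} = \frac{x_i}{\lVert x_i \rVert\,\lVert y_i \rVert} - z_i \frac{y_i}{\lVert y_i \rVert^2}.$$

CosSimGradFunctor and CosSimDxFunctor scale the $x$-derivative by the upstream gradient $dz_i$; CosSimDyFunctor does the same with the $y$-derivative and, when $y$ is broadcast, accumulates it over rows (hence the CudaAtomicAdd in the CUDA specialization).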
...@@ -28,7 +28,7 @@ template <class OpResetOutput, typename T> ...@@ -28,7 +28,7 @@ template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value, T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { ActivationType active_gate) {
T r_value_update_gate; T r_value_update_gate;
T r_value_reset_gate; T r_value_reset_gate;
T r_value_reset_output; T r_value_reset_output;
...@@ -56,7 +56,7 @@ template <class OpFinalOutput, typename T> ...@@ -56,7 +56,7 @@ template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *gate_value, T *prev_output_value, T *gate_value, T *prev_output_value,
T *output_value, int frame_size, T *output_value, int frame_size,
activation_mode_t active_node) { ActivationType active_node) {
T r_value_update_gate; T r_value_update_gate;
T r_value_frame_state; T r_value_frame_state;
T r_prev_out = 0; T r_prev_out = 0;
...@@ -83,7 +83,7 @@ template <class OpResetOutput, typename T> ...@@ -83,7 +83,7 @@ template <class OpResetOutput, typename T>
void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value, T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size, T *prev_output_value, int frame_size,
activation_mode_t active_gate) { ActivationType active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_value_update_gate; __m256 r_value_update_gate;
__m256 r_value_reset_gate; __m256 r_value_reset_gate;
...@@ -113,7 +113,7 @@ template <class OpFinalOutput, typename T> ...@@ -113,7 +113,7 @@ template <class OpFinalOutput, typename T>
void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
T *gate_value, T *prev_output_value, T *gate_value, T *prev_output_value,
T *output_value, int frame_size, T *output_value, int frame_size,
activation_mode_t active_node) { ActivationType active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_value_update_gate; __m256 r_value_update_gate;
__m256 r_value_frame_state; __m256 r_value_frame_state;
...@@ -140,9 +140,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -140,9 +140,8 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
template <class OpResetOutput, typename T> template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput op_reset_output, inline void forward_reset_output(OpResetOutput op_reset_output,
hl_gru_value<T> value, int frame_size, GRUMetaValue<T> value, int frame_size,
int batch_size, int batch_size, ActivationType active_gate) {
activation_mode_t active_gate) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output( hl_avx_gru_forward_reset_output(
...@@ -164,9 +163,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output, ...@@ -164,9 +163,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
template <class OpFinalOutput, typename T> template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output, inline void forward_final_output(OpFinalOutput op_final_output,
hl_gru_value<T> value, int frame_size, GRUMetaValue<T> value, int frame_size,
int batch_size, int batch_size, ActivationType active_node) {
activation_mode_t active_node) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value, hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
...@@ -191,7 +189,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -191,7 +189,7 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *output_grad, T *prev_out_grad, T *output_grad,
int frame_size, int frame_size,
activation_mode_t active_node) { ActivationType active_node) {
T r_update_gate_value; T r_update_gate_value;
T r_update_gate_grad; T r_update_gate_grad;
T r_frame_state_value; T r_frame_state_value;
...@@ -232,7 +230,7 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -232,7 +230,7 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *reset_output_grad, T *prev_out_grad, T *reset_output_grad,
int frame_size, int frame_size,
activation_mode_t active_gate) { ActivationType active_gate) {
T r_update_gate_value; T r_update_gate_value;
T r_update_gate_grad; T r_update_gate_grad;
T r_reset_gate_value; T r_reset_gate_value;
...@@ -277,7 +275,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, ...@@ -277,7 +275,7 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *output_grad, T *prev_out_grad, T *output_grad,
int frame_size, int frame_size,
activation_mode_t active_node) { ActivationType active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_update_gate_value; __m256 r_update_gate_value;
__m256 r_update_gate_grad; __m256 r_update_gate_grad;
...@@ -320,7 +318,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -320,7 +318,7 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *reset_output_grad, T *prev_out_grad, T *reset_output_grad,
int frame_size, int frame_size,
activation_mode_t active_gate) { ActivationType active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_update_gate_value; __m256 r_update_gate_value;
__m256 r_update_gate_grad; __m256 r_update_gate_grad;
...@@ -364,9 +362,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -364,9 +362,9 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
template <class OpStateGrad, typename T> template <class OpStateGrad, typename T>
inline void backward_state_grad(OpStateGrad op_state_grad, inline void backward_state_grad(OpStateGrad op_state_grad,
hl_gru_value<T> value, hl_gru_grad<T> grad, GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_node) { ActivationType active_node) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_state_grad( hl_avx_gru_backward_state_grad(
...@@ -393,9 +391,9 @@ inline void backward_state_grad(OpStateGrad op_state_grad, ...@@ -393,9 +391,9 @@ inline void backward_state_grad(OpStateGrad op_state_grad,
template <class OpResetGrad, typename T> template <class OpResetGrad, typename T>
inline void backward_reset_grad(OpResetGrad op_reset_grad, inline void backward_reset_grad(OpResetGrad op_reset_grad,
hl_gru_value<T> value, hl_gru_grad<T> grad, GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_gate) { ActivationType active_gate) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
hl_avx_gru_backward_reset_grad( hl_avx_gru_backward_reset_grad(
......
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -35,7 +33,7 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, ...@@ -35,7 +33,7 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value, T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size, T *prev_output_value, int frame_size,
int batch_size, int batch_size,
activation_mode_t active_gate) { ActivationType active_gate) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
...@@ -74,7 +72,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, ...@@ -74,7 +72,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
T *gate_value, T *prev_output_value, T *gate_value, T *prev_output_value,
T *output_value, int frame_size, T *output_value, int frame_size,
int batch_size, int batch_size,
activation_mode_t active_node) { ActivationType active_node) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
int batch_idx = 0; int batch_idx = 0;
...@@ -111,7 +109,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, ...@@ -111,7 +109,7 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *output_grad, T *prev_out_grad, T *output_grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_node) { ActivationType active_node) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
int batch_idx = 0; int batch_idx = 0;
...@@ -159,7 +157,7 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, ...@@ -159,7 +157,7 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
T *gate_grad, T *prev_out_value, T *gate_grad, T *prev_out_value,
T *prev_out_grad, T *reset_output_grad, T *prev_out_grad, T *reset_output_grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_gate) { ActivationType active_gate) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return; if (frame_idx >= frame_size) return;
int batch_idx = 0; int batch_idx = 0;
......
...@@ -30,7 +30,7 @@ class gru_resetOutput { ...@@ -30,7 +30,7 @@ class gru_resetOutput {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
T &prev_out, T &value_reset_output, T &prev_out, T &value_reset_output,
activation_mode_t act_gate) { ActivationType act_gate) {
value_update_gate = activation(value_update_gate, act_gate); value_update_gate = activation(value_update_gate, act_gate);
value_reset_gate = activation(value_reset_gate, act_gate); value_reset_gate = activation(value_reset_gate, act_gate);
value_reset_output = prev_out * value_reset_gate; value_reset_output = prev_out * value_reset_gate;
...@@ -43,7 +43,7 @@ class gru_resetOutput { ...@@ -43,7 +43,7 @@ class gru_resetOutput {
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &value_reset_gate, __m256 &prev_out, __m256 &value_reset_gate, __m256 &prev_out,
__m256 &value_reset_output, __m256 &value_reset_output,
activation_mode_t act_gate) { ActivationType act_gate) {
value_update_gate = activation(value_update_gate, act_gate); value_update_gate = activation(value_update_gate, act_gate);
value_reset_gate = activation(value_reset_gate, act_gate); value_reset_gate = activation(value_reset_gate, act_gate);
value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
...@@ -57,7 +57,7 @@ class gru_finalOutput { ...@@ -57,7 +57,7 @@ class gru_finalOutput {
public: public:
HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
T &prev_out, T &value_output, T &prev_out, T &value_output,
activation_mode_t act_input) { ActivationType act_input) {
value_frame_state = activation(value_frame_state, act_input); value_frame_state = activation(value_frame_state, act_input);
value_output = prev_out - (value_update_gate * prev_out) + value_output = prev_out - (value_update_gate * prev_out) +
(value_update_gate * value_frame_state); (value_update_gate * value_frame_state);
...@@ -69,8 +69,7 @@ class gru_finalOutput { ...@@ -69,8 +69,7 @@ class gru_finalOutput {
static const bool avx = true; static const bool avx = true;
HOSTDEVICE void operator()(__m256 &value_update_gate, HOSTDEVICE void operator()(__m256 &value_update_gate,
__m256 &value_frame_state, __m256 &prev_out, __m256 &value_frame_state, __m256 &prev_out,
__m256 &value_output, __m256 &value_output, ActivationType act_input) {
activation_mode_t act_input) {
value_frame_state = activation(value_frame_state, act_input); value_frame_state = activation(value_frame_state, act_input);
value_output = _mm256_add_ps( value_output = _mm256_add_ps(
_mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
...@@ -89,7 +88,7 @@ class gru_stateGrad { ...@@ -89,7 +88,7 @@ class gru_stateGrad {
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
T &value_frame_state, T &grad_frame_state, T &value_frame_state, T &grad_frame_state,
T &value_prev_out, T &grad_prev_out, T &value_prev_out, T &grad_prev_out,
T &grad_output, activation_mode_t act_input) { T &grad_output, ActivationType act_input) {
grad_update_gate = (grad_output * value_frame_state); grad_update_gate = (grad_output * value_frame_state);
grad_update_gate -= (grad_output * value_prev_out); grad_update_gate -= (grad_output * value_prev_out);
grad_prev_out -= (grad_output * value_update_gate); grad_prev_out -= (grad_output * value_update_gate);
...@@ -107,7 +106,7 @@ class gru_stateGrad { ...@@ -107,7 +106,7 @@ class gru_stateGrad {
__m256 &value_frame_state, __m256 &value_frame_state,
__m256 &grad_frame_state, __m256 &value_prev_out, __m256 &grad_frame_state, __m256 &value_prev_out,
__m256 &grad_prev_out, __m256 &grad_output, __m256 &grad_prev_out, __m256 &grad_output,
activation_mode_t act_input) { ActivationType act_input) {
grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
grad_update_gate = _mm256_sub_ps( grad_update_gate = _mm256_sub_ps(
grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
...@@ -128,7 +127,7 @@ class gru_resetGrad { ...@@ -128,7 +127,7 @@ class gru_resetGrad {
HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
T &value_reset_gate, T &grad_reset_gate, T &value_reset_gate, T &grad_reset_gate,
T &value_prev_out, T &grad_prev_out, T &value_prev_out, T &grad_prev_out,
T &grad_reset_output, activation_mode_t act_gate) { T &grad_reset_output, ActivationType act_gate) {
grad_reset_gate = (grad_reset_output * value_prev_out); grad_reset_gate = (grad_reset_output * value_prev_out);
grad_prev_out += (grad_reset_output * value_reset_gate); grad_prev_out += (grad_reset_output * value_reset_gate);
grad_update_gate = grad_update_gate =
...@@ -144,7 +143,7 @@ class gru_resetGrad { ...@@ -144,7 +143,7 @@ class gru_resetGrad {
__m256 &grad_update_gate, __m256 &value_reset_gate, __m256 &grad_update_gate, __m256 &value_reset_gate,
__m256 &grad_reset_gate, __m256 &value_prev_out, __m256 &grad_reset_gate, __m256 &value_prev_out,
__m256 &grad_prev_out, __m256 &grad_reset_output, __m256 &grad_prev_out, __m256 &grad_reset_output,
activation_mode_t act_gate) { ActivationType act_gate) {
grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
grad_prev_out = _mm256_add_ps( grad_prev_out = _mm256_add_ps(
grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
......
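For reference, the arithmetic the gru_resetOutput / gru_finalOutput functors above perform can be checked with a small standalone scalar sketch. It assumes sigmoid for the gates and tanh for the candidate state purely for illustration (the real kernels use whichever activation ActivationType selects, and run over SIMD/CUDA lanes rather than single floats):

#include <cmath>
#include <cstdio>

// Scalar sketch of one GRU step as computed by gru_resetOutput /
// gru_finalOutput (illustrative activations: sigmoid gates, tanh candidate).
int main() {
  float prev_out = 0.5f;     // h_{t-1}
  float update_gate = 0.2f;  // pre-activation update gate u
  float reset_gate = -0.1f;  // pre-activation reset gate r
  float frame_state = 0.3f;  // pre-activation candidate state

  auto sigmoid = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };

  // gru_resetOutput: activate the gates and gate the previous output.
  update_gate = sigmoid(update_gate);
  reset_gate = sigmoid(reset_gate);
  float reset_output = prev_out * reset_gate;

  // gru_finalOutput: activate the candidate and interpolate with prev_out.
  // (In the real code the candidate pre-activation already incorporates
  // reset_output through the state weights.)
  frame_state = std::tanh(frame_state);
  float output = prev_out - update_gate * prev_out + update_gate * frame_state;

  std::printf("reset_output=%.4f output=%.4f\n", reset_output, output);
  return 0;
}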
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <map>
#include <utility>
#include <vector>
#include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct BBox {
BBox(T x_min, T y_min, T x_max, T y_max)
: x_min(x_min),
y_min(y_min),
x_max(x_max),
y_max(y_max),
is_difficult(false) {}
BBox() {}
T get_width() const { return x_max - x_min; }
T get_height() const { return y_max - y_min; }
T get_center_x() const { return (x_min + x_max) / 2; }
T get_center_y() const { return (y_min + y_max) / 2; }
T get_area() const { return get_width() * get_height(); }
// coordinate of bounding box
T x_min;
T y_min;
T x_max;
T y_max;
  // whether this is a difficult object (e.g. an object with heavy occlusion)
bool is_difficult;
};
// KNCHW ==> NHWC
// template <typename T>
template <typename T>
void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
std::vector<BBox<T>>& bbox_vec);
template <typename T>
void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
std::vector<std::vector<T>>& var_vec);
template <typename T>
BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
const std::vector<T>& prior_bbox_var,
const std::vector<T>& loc_pred_data);
template <typename T1, typename T2>
bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
const std::pair<T1, T2>& pair2);
template <typename T>
bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
const std::pair<T, BBox<T>>& pair2);
template <typename T>
T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
template <typename T>
void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
size_t class_idx, size_t top_k, T conf_threshold,
T nms_threshold, size_t num_priors, size_t num_classes,
std::vector<size_t>* indices);
template <typename T>
int GetDetectionIndices(
const T* conf_data, const size_t num_priors, const size_t num_classes,
const size_t background_label_id, const size_t batch_size,
const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
const size_t top_k,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
template <typename T>
BBox<T> ClipBBox(const BBox<T>& bbox);
template <typename T>
void GetDetectionOutput(
const T* conf_data, const size_t num_kept, const size_t num_priors,
const size_t num_classes, const size_t batch_size,
const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
template <typename T>
void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
std::vector<BBox<T>>& bbox_vec) {
size_t out_offset = bbox_vec.size();
bbox_vec.resize(bbox_vec.size() + num_bboxes);
for (size_t i = 0; i < num_bboxes; ++i) {
BBox<T> bbox;
bbox.x_min = *(prior_data + i * 8);
bbox.y_min = *(prior_data + i * 8 + 1);
bbox.x_max = *(prior_data + i * 8 + 2);
bbox.y_max = *(prior_data + i * 8 + 3);
bbox_vec[out_offset + i] = bbox;
}
}
template <typename T>
void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
std::vector<std::vector<T>>& var_vec) {
size_t out_offset = var_vec.size();
var_vec.resize(var_vec.size() + num);
for (size_t i = 0; i < num; ++i) {
std::vector<T> var;
var.push_back(*(prior_data + i * 8 + 4));
var.push_back(*(prior_data + i * 8 + 5));
var.push_back(*(prior_data + i * 8 + 6));
var.push_back(*(prior_data + i * 8 + 7));
var_vec[out_offset + i] = var;
}
}
template <typename T>
BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
const std::vector<T>& prior_bbox_var,
const std::vector<T>& loc_pred_data) {
T prior_bbox_width = prior_bbox.get_width();
T prior_bbox_height = prior_bbox.get_height();
T prior_bbox_center_x = prior_bbox.get_center_x();
T prior_bbox_center_y = prior_bbox.get_center_y();
T decoded_bbox_center_x =
prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width +
prior_bbox_center_x;
T decoded_bbox_center_y =
prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height +
prior_bbox_center_y;
T decoded_bbox_width =
std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width;
T decoded_bbox_height =
std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height;
BBox<T> decoded_bbox;
decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2;
decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2;
decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2;
decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2;
return decoded_bbox;
}
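// Worked example (illustrative numbers only): for a prior box
// (x_min, y_min, x_max, y_max) = (0.2, 0.2, 0.6, 0.8) we have
// width = 0.4, height = 0.6, center = (0.4, 0.5). With variances
// (0.1, 0.1, 0.2, 0.2) and location predictions (1.0, -1.0, 0.0, 0.0):
//   center_x = 0.1 *  1.0 * 0.4 + 0.4 = 0.44
//   center_y = 0.1 * -1.0 * 0.6 + 0.5 = 0.44
//   width    = exp(0.2 * 0.0) * 0.4   = 0.4
//   height   = exp(0.2 * 0.0) * 0.6   = 0.6
// so the decoded box is (0.24, 0.14, 0.64, 0.74).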
template <typename T1, typename T2>
bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
const std::pair<T1, T2>& pair2) {
return pair1.first > pair2.first;
}
template <typename T>
T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
return 0.0;
} else {
T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
    T inter_x_max = std::min(bbox1.x_max, bbox2.x_max);
    T inter_y_max = std::min(bbox1.y_max, bbox2.y_max);
    T inter_width = inter_x_max - inter_x_min;
    T inter_height = inter_y_max - inter_y_min;
T inter_area = inter_width * inter_height;
T bbox_area1 = bbox1.get_area();
T bbox_area2 = bbox2.get_area();
return inter_area / (bbox_area1 + bbox_area2 - inter_area);
}
}
template <typename T>
void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
size_t class_idx, size_t top_k, T conf_threshold,
T nms_threshold, size_t num_priors, size_t num_classes,
std::vector<size_t>* indices) {
std::vector<std::pair<T, size_t>> scores;
for (size_t i = 0; i < num_priors; ++i) {
size_t conf_offset = i * num_classes + class_idx;
if (conf_score_data[conf_offset] > conf_threshold)
scores.push_back(std::make_pair(conf_score_data[conf_offset], i));
}
std::stable_sort(scores.begin(), scores.end(),
SortScorePairDescend<T, size_t>);
if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
while (scores.size() > 0) {
const size_t idx = scores.front().second;
bool keep = true;
for (size_t i = 0; i < indices->size(); ++i) {
if (keep) {
const size_t saved_idx = (*indices)[i];
T overlap = jaccard_overlap<T>(bboxes[idx], bboxes[saved_idx]);
keep = overlap <= nms_threshold;
} else {
break;
}
}
if (keep) indices->push_back(idx);
scores.erase(scores.begin());
}
}
template <typename T>
int GetDetectionIndices(
const T* conf_data, const size_t num_priors, const size_t num_classes,
const size_t background_label_id, const size_t batch_size,
const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
const size_t top_k,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
int total_keep_num = 0;
for (size_t n = 0; n < batch_size; ++n) {
const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
size_t num_detected = 0;
std::map<size_t, std::vector<size_t>> indices;
size_t conf_offset = n * num_priors * num_classes;
for (size_t c = 0; c < num_classes; ++c) {
if (c == background_label_id) continue;
ApplyNmsFast<T>(decoded_bboxes, conf_data + conf_offset, c, nms_top_k,
conf_threshold, nms_threshold, num_priors, num_classes,
&(indices[c]));
num_detected += indices[c].size();
}
if (top_k > 0 && num_detected > top_k) {
// std::vector<pair<T,T>> score_index_pairs;
std::vector<std::pair<T, std::pair<size_t, size_t>>> score_index_pairs;
for (size_t c = 0; c < num_classes; ++c) {
const std::vector<size_t>& label_indices = indices[c];
for (size_t i = 0; i < label_indices.size(); ++i) {
size_t idx = label_indices[i];
score_index_pairs.push_back(
std::make_pair((conf_data + conf_offset)[idx * num_classes + c],
std::make_pair(c, idx)));
}
}
std::sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<T, std::pair<size_t, size_t>>);
score_index_pairs.resize(top_k);
std::map<size_t, std::vector<size_t>> new_indices;
for (size_t i = 0; i < score_index_pairs.size(); ++i) {
size_t label = score_index_pairs[i].second.first;
size_t idx = score_index_pairs[i].second.second;
new_indices[label].push_back(idx);
}
all_detection_indices->push_back(new_indices);
total_keep_num += top_k;
} else {
all_detection_indices->push_back(indices);
total_keep_num += num_detected;
}
}
return total_keep_num;
}
template <typename T>
BBox<T> ClipBBox(const BBox<T>& bbox) {
T one = static_cast<T>(1.0);
T zero = static_cast<T>(0.0);
BBox<T> clipped_bbox;
clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero);
clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero);
clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero);
clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero);
return clipped_bbox;
}
template <typename T>
void GetDetectionOutput(
const T* conf_data, const size_t num_kept, const size_t num_priors,
const size_t num_classes, const size_t batch_size,
const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
size_t count = 0;
for (size_t n = 0; n < batch_size; ++n) {
for (std::map<size_t, std::vector<size_t>>::const_iterator it =
all_indices[n].begin();
it != all_indices[n].end(); ++it) {
size_t label = it->first;
const std::vector<size_t>& indices = it->second;
const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
for (size_t i = 0; i < indices.size(); ++i) {
size_t idx = indices[i];
size_t conf_offset = n * num_priors * num_classes + idx * num_classes;
out_data[count * 7] = n;
out_data[count * 7 + 1] = label;
out_data[count * 7 + 2] = (conf_data + conf_offset)[label];
BBox<T> clipped_bbox = ClipBBox<T>(decoded_bboxes[idx]);
out_data[count * 7 + 3] = clipped_bbox.x_min;
out_data[count * 7 + 4] = clipped_bbox.y_min;
out_data[count * 7 + 5] = clipped_bbox.x_max;
out_data[count * 7 + 6] = clipped_bbox.y_max;
++count;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle
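A minimal standalone sketch of the overlap test and the greedy keep/suppress loop used by ApplyNmsFast above, written against a plain box struct so it compiles outside Paddle; the box coordinates and the 0.5 threshold are illustrative assumptions:

#include <algorithm>
#include <cstdio>
#include <vector>

struct Box { float x_min, y_min, x_max, y_max; };

// Intersection-over-union, mirroring jaccard_overlap above.
float IoU(const Box& a, const Box& b) {
  float ix_min = std::max(a.x_min, b.x_min);
  float iy_min = std::max(a.y_min, b.y_min);
  float ix_max = std::min(a.x_max, b.x_max);
  float iy_max = std::min(a.y_max, b.y_max);
  float iw = std::max(0.0f, ix_max - ix_min);
  float ih = std::max(0.0f, iy_max - iy_min);
  float inter = iw * ih;
  float area_a = (a.x_max - a.x_min) * (a.y_max - a.y_min);
  float area_b = (b.x_max - b.x_min) * (b.y_max - b.y_min);
  return inter > 0 ? inter / (area_a + area_b - inter) : 0.0f;
}

int main() {
  // Boxes already sorted by descending confidence, as ApplyNmsFast assumes
  // after its stable_sort step.
  std::vector<Box> boxes = {{0.10f, 0.10f, 0.50f, 0.50f},
                            {0.12f, 0.12f, 0.52f, 0.52f},
                            {0.60f, 0.60f, 0.90f, 0.90f}};
  const float nms_threshold = 0.5f;
  std::vector<size_t> kept;
  for (size_t i = 0; i < boxes.size(); ++i) {
    bool keep = true;
    for (size_t k : kept) {
      if (IoU(boxes[i], boxes[k]) > nms_threshold) { keep = false; break; }
    }
    if (keep) kept.push_back(i);
  }
  for (size_t k : kept) std::printf("kept box %zu\n", k);  // expect 0 and 2
  return 0;
}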
...@@ -21,9 +21,9 @@ namespace math { ...@@ -21,9 +21,9 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::CPUDeviceContext, T> { struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext &context, static void compute(const platform::CPUDeviceContext &context,
hl_gru_value<T> value, int frame_size, int batch_size, GRUMetaValue<T> value, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate) { const detail::ActivationType active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
if (value.prev_out_value) { if (value.prev_out_value) {
math::gemm<platform::CPUDeviceContext, T>( math::gemm<platform::CPUDeviceContext, T>(
...@@ -51,10 +51,10 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> { ...@@ -51,10 +51,10 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> { struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext &context, static void compute(const platform::CPUDeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate) { const detail::ActivationType active_gate) {
#ifndef __NVCC__ #ifndef __NVCC__
detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value, detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
grad, frame_size, batch_size, active_node); grad, frame_size, batch_size, active_node);
......
...@@ -21,9 +21,9 @@ namespace math { ...@@ -21,9 +21,9 @@ namespace math {
template <typename T> template <typename T>
struct GRUUnitFunctor<platform::CUDADeviceContext, T> { struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext &context, static void compute(const platform::CUDADeviceContext &context,
hl_gru_value<T> value, int frame_size, int batch_size, GRUMetaValue<T> value, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate) { const detail::ActivationType active_gate) {
auto stream = context.stream(); auto stream = context.stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
...@@ -88,10 +88,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> { ...@@ -88,10 +88,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
template <typename T> template <typename T>
struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> { struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext &context, static void compute(const platform::CUDADeviceContext &context,
hl_gru_value<T> value, hl_gru_grad<T> grad, GRUMetaValue<T> value, GRUMetaGrad<T> grad,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate) { const detail::ActivationType active_gate) {
auto stream = context.stream(); auto stream = context.stream();
dim3 threads; dim3 threads;
dim3 grid; dim3 grid;
......
...@@ -11,7 +11,7 @@ limitations under the License. */ ...@@ -11,7 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/operators/math/lstm_compute.h" #include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
...@@ -19,9 +19,8 @@ namespace paddle { ...@@ -19,9 +19,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
// TODO(guosheng): refine code style in gru_compute
template <typename T> template <typename T>
struct hl_gru_value { struct GRUMetaValue {
T *gate_weight; T *gate_weight;
T *state_weight; T *state_weight;
T *gate_value; T *gate_value;
...@@ -31,7 +30,7 @@ struct hl_gru_value { ...@@ -31,7 +30,7 @@ struct hl_gru_value {
}; };
template <typename T> template <typename T>
struct hl_gru_grad { struct GRUMetaGrad {
T *gate_weight_grad; T *gate_weight_grad;
T *state_weight_grad; T *state_weight_grad;
T *gate_grad; T *gate_grad;
...@@ -42,18 +41,18 @@ struct hl_gru_grad { ...@@ -42,18 +41,18 @@ struct hl_gru_grad {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct GRUUnitFunctor { struct GRUUnitFunctor {
static void compute(const DeviceContext &context, hl_gru_value<T> value, static void compute(const DeviceContext &context, GRUMetaValue<T> value,
int frame_size, int batch_size, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate); const detail::ActivationType active_gate);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct GRUUnitGradFunctor { struct GRUUnitGradFunctor {
static void compute(const DeviceContext &context, hl_gru_value<T> value, static void compute(const DeviceContext &context, GRUMetaValue<T> value,
hl_gru_grad<T> grad, int frame_size, int batch_size, GRUMetaGrad<T> grad, int frame_size, int batch_size,
activation_mode_t active_node, const detail::ActivationType active_node,
activation_mode_t active_gate); const detail::ActivationType active_gate);
}; };
} // namespace math } // namespace math
......
...@@ -22,14 +22,6 @@ namespace paddle { ...@@ -22,14 +22,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
typedef enum {
HL_ACTIVATION_SIGMOID = 0,
HL_ACTIVATION_RELU = 1,
HL_ACTIVATION_TANH = 2,
HL_ACTIVATION_LINEAR = 3,
HL_ACTIVATION_END
} activation_mode_t;
template <class T> template <class T>
struct LstmMetaValue { struct LstmMetaValue {
T *gate_value; T *gate_value;
...@@ -54,20 +46,6 @@ struct LstmMetaGrad { ...@@ -54,20 +46,6 @@ struct LstmMetaGrad {
T *check_og_grad; T *check_og_grad;
}; };
inline activation_mode_t ActiveType(const std::string &type) {
if (type == "sigmoid") {
return HL_ACTIVATION_SIGMOID;
} else if (type == "relu") {
return HL_ACTIVATION_RELU;
} else if (type == "tanh") {
return HL_ACTIVATION_TANH;
} else if (type == "linear" || type == "identity" || type == "") {
return HL_ACTIVATION_LINEAR;
} else {
PADDLE_THROW("Do not support activation type.");
}
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LstmUnitFunctor { class LstmUnitFunctor {
public: public:
......
...@@ -245,9 +245,12 @@ template struct SetConstant<platform::CPUDeviceContext, int>; ...@@ -245,9 +245,12 @@ template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>; template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>; template struct SetConstant<platform::CPUDeviceContext, bool>;
#define DEFINE_CPU_TRANS(RANK) \ #define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \ template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2); DEFINE_CPU_TRANS(2);
...@@ -302,8 +305,29 @@ void set_constant(const platform::DeviceContext& context, ...@@ -302,8 +305,29 @@ void set_constant(const platform::DeviceContext& context,
#endif #endif
} }
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>; template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>; template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>; template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>; template struct ColwiseSum<platform::CPUDeviceContext, double>;
......
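The RowwiseAdd functor added above broadcasts a length-`width` vector over every row of a matrix. A tiny standalone sketch of the same semantics, with made-up values, is:

#include <cstdio>
#include <vector>

int main() {
  const int rows = 2, width = 3;
  std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2 x 3, row-major
  std::vector<float> vec = {10, 20, 30};       // one entry per column
  std::vector<float> out(rows * width);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < width; ++j)
      out[i * width + j] = in[i * width + j] + vec[j];
  for (float v : out) std::printf("%.0f ", v);  // 11 22 33 14 25 36
  std::printf("\n");
  return 0;
}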
...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>( ...@@ -273,6 +273,35 @@ void set_constant_with_place<platform::CUDAPlace>(
TensorSetConstantGPU(context, tensor, value)); TensorSetConstantGPU(context, tensor, value));
} }
template <typename T>
__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
int num) {
  // Grid-stride loop: each flattened index i maps to column w = i % width.
  // Integer arithmetic avoids the precision issues of multiplying by a
  // floating-point reciprocal of width.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
       i += blockDim.x * gridDim.x) {
    int w = i % width;
    c[i] = a[i] + b[w];
  }
}
template <typename T>
struct RowwiseAdd<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
input.data<T>(), vector.data<T>(), output->data<T>(),
static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
}
};
template struct RowwiseAdd<platform::CUDADeviceContext, float>; template struct RowwiseAdd<platform::CUDADeviceContext, float>;
template struct RowwiseAdd<platform::CUDADeviceContext, double>; template struct RowwiseAdd<platform::CUDADeviceContext, double>;
template struct ColwiseSum<platform::CUDADeviceContext, float>; template struct ColwiseSum<platform::CUDADeviceContext, float>;
......
...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()( ...@@ -45,25 +45,6 @@ void Transpose<DeviceContext, T, Rank>::operator()(
eigen_out.device(*dev) = eigen_in.shuffle(permute); eigen_out.device(*dev) = eigen_in.shuffle(permute);
} }
template <typename DeviceContext, typename T>
void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenMatrix<T>::From(vector);
auto out = framework::EigenMatrix<T>::From(*output);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
out.device(*context.eigen_device()) =
in + vec.reshape(shape).broadcast(bcast);
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
......
...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/math/selected_rows_functor.h" #include <set>
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -179,6 +181,118 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>; ...@@ -179,6 +181,118 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>; template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>; template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
// This is a separate namespace for manipulating SelectedRows-typed
// data, e.g. merging duplicated rows or adding two SelectedRows.
//
// Another group of functors is called "scatter updates"; these use a
// SelectedRows object to update a dense tensor with different ops,
// such as add or mul.
namespace scatter {
size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
return std::find(rows.begin(), rows.end(), value) - rows.begin();
}
template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
auto input_rows = input.rows();
std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto input_width = input.value().dims()[1];
out.set_rows(merge_rows);
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = FindPos(merge_rows, input_rows[i]);
for (int64_t j = 0; j < input_width; j++) {
out_data[out_i * input_width + j] += input_data[i * input_width + j];
}
}
return out;
}
};
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template <typename T>
struct UpdateToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const ScatterOps& op, const framework::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
    // FIXME(typhoonzero): use a macro to deduplicate the switch below.
switch (op) {
case ScatterOps::ASSIGN:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] =
in1_data[i * in1_row_numel + j];
break;
case ScatterOps::ADD:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] +=
in1_data[i * in1_row_numel + j];
break;
case ScatterOps::SUB:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] -=
in1_data[i * in1_row_numel + j];
break;
case ScatterOps::SUBBY:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] =
in1_data[i * in1_row_numel + j] -
input2_data[in1_rows[i] * in1_row_numel + j];
break;
case ScatterOps::MUL:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] *=
in1_data[i * in1_row_numel + j];
break;
case ScatterOps::DIV:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] /=
in1_data[i * in1_row_numel + j];
break;
case ScatterOps::DIVBY:
INLINE_FOR2(in1_rows.size(), in1_row_numel)
input2_data[in1_rows[i] * in1_row_numel + j] =
in1_data[i * in1_row_numel + j] /
input2_data[in1_rows[i] * in1_row_numel + j];
break;
}
}
};
} // namespace scatter
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
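The row-merging logic of MergeAdd can be restated independently of the Paddle types. This sketch (hypothetical data, double values) de-duplicates the row indices and sums the values of duplicated rows, which is what the CPU functor above does via FindPos:

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main() {
  const int64_t width = 2;
  // A "SelectedRows"-like input: row indices may repeat.
  std::vector<int64_t> rows = {4, 1, 4};
  std::vector<double> values = {1, 1,   // row 4
                                2, 2,   // row 1
                                3, 3};  // row 4 again
  // Merge by adding duplicated rows; std::map keeps the merged rows sorted,
  // matching the std::set-based ordering used by MergeAdd.
  std::map<int64_t, std::vector<double>> merged;
  for (size_t i = 0; i < rows.size(); ++i) {
    auto& row = merged[rows[i]];
    if (row.empty()) row.assign(width, 0.0);
    for (int64_t j = 0; j < width; ++j) row[j] += values[i * width + j];
  }
  for (const auto& kv : merged) {
    std::printf("row %lld:", static_cast<long long>(kv.first));
    for (double v : kv.second) std::printf(" %.0f", v);
    std::printf("\n");  // row 1: 2 2, row 4: 4 4
  }
  return 0;
}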
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <set>
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h" #include "paddle/operators/math/selected_rows_functor.h"
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
...@@ -222,6 +224,157 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>; ...@@ -222,6 +224,157 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>; template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
namespace scatter {
template <typename T, int block_size>
__global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
T* out, const int64_t* out_rows,
size_t out_rows_size, int64_t row_numel) {
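  // One block row (blockIdx.y) per input row: thread 0 looks up where this
  // input row lands in the de-duplicated output row list, then all threads
  // of the block stride over the row's columns and accumulate into the
  // output with atomic adds (several input rows may map to the same output
  // row).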
const int ty = blockIdx.y;
int tid = threadIdx.x;
__shared__ size_t out_idx;
if (tid == 0) {
for (size_t i = 0; i < out_rows_size; i++) {
if (input_rows[ty] == out_rows[i]) {
out_idx = i;
}
}
}
__syncthreads();
input += ty * row_numel;
out += out_idx * row_numel;
for (int index = tid; index < row_numel; index += block_size) {
paddle::platform::CudaAtomicAdd(out + index, input[index]);
}
}
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
auto input_rows = input.rows();
std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
auto input_width = input.value().dims()[1];
out.set_rows(merge_rows);
out.set_height(input.height());
out.mutable_value()->mutable_data<T>(
framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid1(1, input_rows.size());
MergeAddKernel<
T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input_data, input.rows().data(), out_data,
out.rows().data(), out.rows().size(),
input_width);
return out;
}
};
template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
const int64_t* rows, const ScatterOps& op,
T* tensor_out, int64_t row_numel) {
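  // One block row (blockIdx.y) per selected row: threads stride over the
  // row's columns and apply `op` element-wise between the selected-rows
  // slice and the matching row of the dense output tensor.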
const int ty = blockIdx.y;
int tid = threadIdx.x;
selected_rows += ty * row_numel;
tensor_out += rows[ty] * row_numel;
  // FIXME(typhoonzero): use a macro to deduplicate the switch below.
switch (op) {
case ScatterOps::ASSIGN:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] = selected_rows[index];
}
break;
case ScatterOps::ADD:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] += selected_rows[index];
}
break;
case ScatterOps::SUB:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] -= selected_rows[index];
}
break;
case ScatterOps::SUBBY:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] = selected_rows[index] - tensor_out[index];
}
break;
case ScatterOps::MUL:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] *= selected_rows[index];
}
break;
case ScatterOps::DIV:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] /= selected_rows[index];
}
break;
case ScatterOps::DIVBY:
for (int index = tid; index < row_numel; index += block_size) {
tensor_out[index] = selected_rows[index] / tensor_out[index];
}
break;
}
}
template <typename T>
struct UpdateToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const ScatterOps& op, const framework::SelectedRows& input1,
framework::Tensor* input2) {
    // NOTE: prefer SelectedRowsAddToTensor for better performance;
    // it does not need the extra MergeAdd call done here.
MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto merged_in1 = merge_func(context, input1);
auto in1_height = merged_in1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = merged_in1.value();
auto& in1_rows = merged_in1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>();
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size());
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
in2_data, in1_row_numel);
}
};
} // namespace scatter
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/selected_rows.h" #include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#define INLINE_FOR2(sizei, sizej) \
for (int64_t i = 0; i < sizei; i++) \
for (int64_t j = 0; j < sizej; j++)
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -52,6 +57,78 @@ struct SelectedRowsAddToTensor { ...@@ -52,6 +57,78 @@ struct SelectedRowsAddToTensor {
framework::Tensor* input2); framework::Tensor* input2);
}; };
namespace scatter {
// functors for manipulating SelectedRows data
template <typename DeviceContext, typename T>
struct MergeAdd {
  // unary functor: merge duplicated rows of the input SelectedRows
  // object by adding them together.
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input);
};
template <typename DeviceContext, typename T>
struct Add {
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const framework::SelectedRows& input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
e_out.device(*context.eigen_device()) = e_in1 + e_in2;
return out;
}
};
template <typename DeviceContext, typename T>
struct Mul {
// multiply two SelectedRows
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const framework::SelectedRows& input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
e_out.device(*context.eigen_device()) = e_in1 * e_in2;
return out;
}
  // multiply a SelectedRows by a scalar
framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input1,
const T input2) {
framework::SelectedRows out;
out.set_rows(input1.rows());
out.set_height(input1.height());
out.mutable_value()->mutable_data<T>(input1.value().dims(),
context.GetPlace());
auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
e_out.device(*context.eigen_device()) = input2 * e_in1;
return out;
}
};
enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
// Update a dense tensor in place from a SelectedRows input according to the
// given ScatterOps op (e.g. for DIVBY: tensor = selected_rows_in / tensor).
template <typename DeviceContext, typename T>
struct UpdateToTensor {
void operator()(const DeviceContext& context, const ScatterOps& op,
const framework::SelectedRows& input1,
framework::Tensor* input2);
};
} // namespace scatter
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
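The ScatterOps variants differ only in which side of the operator the incoming SelectedRows value sits on. A plain-C++ sketch of the per-element update for one row, mirroring the cases handled by UpdateToTensor (the numeric values are illustrative):

#include <cstdio>

enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };

// Apply one ScatterOps update to a single tensor element.
float Apply(ScatterOps op, float selected, float tensor) {
  switch (op) {
    case ScatterOps::ASSIGN: return selected;
    case ScatterOps::ADD:    return tensor + selected;
    case ScatterOps::SUB:    return tensor - selected;
    case ScatterOps::SUBBY:  return selected - tensor;   // "reversed" subtract
    case ScatterOps::MUL:    return tensor * selected;
    case ScatterOps::DIV:    return tensor / selected;
    case ScatterOps::DIVBY:  return selected / tensor;   // "reversed" divide
  }
  return tensor;
}

int main() {
  float selected = 2.0f, tensor = 8.0f;
  std::printf("SUB=%.1f SUBBY=%.1f DIV=%.1f DIVBY=%.2f\n",
              Apply(ScatterOps::SUB, selected, tensor),    // 6.0
              Apply(ScatterOps::SUBBY, selected, tensor),  // -6.0
              Apply(ScatterOps::DIV, selected, tensor),    // 4.0
              Apply(ScatterOps::DIVBY, selected, tensor)); // 0.25
  return 0;
}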
...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase { ...@@ -30,8 +30,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::Place &dev_place) const override { const platform::Place &dev_place) const override {
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>(); auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>(); auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
......
...@@ -305,7 +305,7 @@ int main(int argc, char **argv) { ...@@ -305,7 +305,7 @@ int main(int argc, char **argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/norm_op.h"
namespace paddle {
namespace operators {
template <typename AttrType>
class NormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"X",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddInput("Scale",
"(Tensor) The input tensor of norm operator. "
"The format of input tensor is C * 1.");
AddAttr<AttrType>("epsilon",
"(float, default 1e-10) Constant "
"for numerical stability.")
.SetDefault(1.0e-10f);
AddOutput("Out",
"(Tensor) The output tensor of norm operator."
"N * M."
"M = C * H * W");
AddComment(R"DOC(
"Input shape: $(N, C, H, W)$
Sclae shape: $(C, 1)$
Output shape: $(N, C, H, W)$
Where
forward
$$
[\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot \cdot \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
$$
backward
$$
\frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
$$
)DOC");
}
};
class NormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of NormOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of NormOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of NormOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim("Out", in_x_dims);
}
};
class NormOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input(X@GRAD) should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
ops::NormOpGrad);
REGISTER_OP_CPU_KERNEL(
norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
REGISTER_OP_CPU_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
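The forward computation of this norm operator, for a single sample and spatial position, is a cross-channel L2 normalization followed by a per-channel scale. A scalar sketch with made-up values (C = 3, epsilon matching the default attribute) is:

#include <cmath>
#include <cstdio>

int main() {
  const int channels = 3;
  float x[channels] = {3.0f, 4.0f, 0.0f};      // one (h, w) position
  float scale[channels] = {1.0f, 0.5f, 2.0f};  // per-channel scale
  const float epsilon = 1.0e-10f;

  float sum_sq = 0.0f;
  for (int c = 0; c < channels; ++c) sum_sq += x[c] * x[c];
  float inv_norm = 1.0f / std::sqrt(sum_sq + epsilon);  // 1 / sqrt(25) = 0.2

  for (int c = 0; c < channels; ++c) {
    float y = scale[c] * x[c] * inv_norm;
    std::printf("y[%d] = %.2f\n", c, y);  // 0.60, 0.40, 0.00
  }
  return 0;
}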
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
REGISTER_OP_CUDA_KERNEL(
norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T, typename AttrType = T>
class NormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
auto* out = context.Output<framework::Tensor>("Out");
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
out->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();
auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor out_batch = out->Slice(n, n + 1);
auto out_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
out_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp = framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
// get colsum and sqrt , inverse
auto dim = Eigen::array<int, 1>({{0}});
tmp.device(*place) = x_square_batch_eigen.sum(dim);
tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
out_batch_eigen.device(*place) =
in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
out_batch_eigen.device(*place) =
out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
}
};
template <typename DeviceContext, typename T, typename AttrType = T>
class NormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
const framework::Tensor* out_grad =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
framework::Tensor* in_x_grad =
context.Output<framework::Tensor>(framework::GradVarName("X"));
in_x_grad->mutable_data<T>(context.GetPlace());
int batch_size = in_x->dims()[0];
int channels = in_x->dims()[1];
int height = in_x->dims()[2];
int width = in_x->dims()[3];
int fea_len = height * width;
auto* place =
context.template device_context<DeviceContext>().eigen_device();
auto scale_eigen =
framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
*scale);
auto x =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
*in_x, framework::make_ddim({batch_size, fea_len * channels}));
// get square
framework::Tensor x_square;
x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
auto x_square_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square, framework::make_ddim({batch_size, fea_len * channels}));
x_square_eigen.device(*place) = x.square();
for (int n = 0; n < batch_size; ++n) {
framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
auto in_x_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_x_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
auto in_g_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
in_g_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
auto x_square_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
x_square_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
auto outg_batch_eigen =
framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
outg_batch, framework::make_ddim({channels, fea_len}));
framework::Tensor tmp_tensor;
tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(tmp_tensor);
auto dim = Eigen::array<int, 1>({{0}});
tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
framework::Tensor norm_tmp_tensor;
norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
context.GetPlace());
auto norm_tmp_eigen =
framework::EigenVector<T, Eigen::RowMajor,
Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
norm_tmp_eigen.device(*place) =
(x_square_batch_eigen.sum(dim) + epsilon).sqrt();
Eigen::array<int, 2> broadcast_dim_col;
broadcast_dim_col[1] = 1;
broadcast_dim_col[0] = channels;
in_g_batch_eigen.device(*place) =
in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen /
(norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
// outg_batch_eigen + (in_g_batch_eigen * -1);
in_g_batch_eigen.device(*place) =
in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
Eigen::array<int, 2> broadcast_dim_row;
broadcast_dim_row[1] = fea_len;
broadcast_dim_row[0] = 1;
in_g_batch_eigen.device(*place) =
in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
}
}
};
} // namespace operators
} // namespace paddle
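As a reading aid, the chain of Eigen expressions in NormGradKernel above evaluates, for each sample, channel c and spatial position i (fea_len = height * width, per-channel scale s_c, incoming gradient dout = Out@GRAD):

$$
n_i = \sqrt{\sum_k x_{k,i}^2 + \epsilon}, \qquad
\frac{\partial L}{\partial x_{c,i}} = \frac{s_c}{n_i}\Big(dout_{c,i} - \frac{x_{c,i}\sum_k x_{k,i}\,dout_{k,i}}{n_i^2}\Big)
$$

Here tmp_tensor holds the per-position inner product over channels and norm_tmp_tensor holds n_i; this summary is derived only from the code above.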
...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase { ...@@ -272,8 +272,9 @@ class RecurrentOp : public RecurrentBase {
false /*create_local_scope*/); false /*create_local_scope*/);
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -326,8 +327,8 @@ class RecurrentGradOp : public RecurrentBase {
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
for (size_t step_id = 0; step_id < seq_len; ++step_id) { for (size_t step_id = 0; step_id < seq_len; ++step_id) {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
......
...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { ...@@ -131,8 +131,8 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
auto x_sliced = x.Slice(x_offset, x_offset + len); auto x_sliced = x.Slice(x_offset, x_offset + len);
auto out_sliced = out->Slice(out_offset, out_offset + len); auto out_sliced = out->Slice(out_offset, out_offset + len);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
out_offset += len; out_offset += len;
return out_offset; return out_offset;
......
...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase { ...@@ -91,8 +91,8 @@ class SaveOp : public framework::OperatorBase {
auto &tensor = var->Get<framework::LoDTensor>(); auto &tensor = var->Get<framework::LoDTensor>();
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
framework::SerializeToStream(fout, tensor, dev_ctx); framework::SerializeToStream(fout, tensor, dev_ctx);
} }
......
...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -79,7 +79,7 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
Recv operator Recv operator
This operator will recv tensor from send_op This operator will send tensor to recv_op.
)DOC"); )DOC");
AddAttr<std::vector<std::string>>("endpoints", AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)" "(string vector, default 127.0.0.1:6164)"
......
...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -106,8 +106,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
dx_tensor.mutable_data(x_tensor.place(), x_tensor.type()); dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
// get device context from pool // get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(place); auto &dev_ctx = *pool.Get(place);
if (dout_var == nullptr) { // dx_tensor fill zero if (dout_var == nullptr) { // dx_tensor fill zero
math::set_constant(dev_ctx, &dx_tensor, 0.0f); math::set_constant(dev_ctx, &dx_tensor, 0.0f);
...@@ -116,9 +116,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { ...@@ -116,9 +116,9 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
auto height = dout_tensor.dims()[0]; auto height = dout_tensor.dims()[0];
auto slice = dx_tensor.Slice(0, static_cast<int>(height)); auto slice = dx_tensor.Slice(0, static_cast<int>(height));
framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
if (dx_tensor.dims()[0] < height) { if (dx_tensor.dims()[0] > height) {
auto rest_tensor = dx_tensor.Slice( auto rest_tensor = dx_tensor.Slice(
static_cast<int>(height), static_cast<int>(dout_tensor.dims()[0])); static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
math::set_constant(dev_ctx, &rest_tensor, 0.0f); math::set_constant(dev_ctx, &rest_tensor, 0.0f);
} }
} }
......
...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase { ...@@ -45,8 +45,8 @@ class SplitLoDTensorOp : public framework::OperatorBase {
auto &x_lod = x.lod(); auto &x_lod = x.lod();
auto &mask_dim = mask.dims(); auto &mask_dim = mask.dims();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Borrow(dev_place); auto &dev_ctx = *pool.Get(dev_place);
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()}; std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) { if (platform::is_cpu_place(mask.place())) {
......
...@@ -37,11 +37,11 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -37,11 +37,11 @@ class SumKernel : public framework::OpKernel<T> {
bool in_place = out_var == in_vars[0]; bool in_place = out_var == in_vars[0];
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
auto *out = context.Output<Tensor>("Out"); auto *out = context.Output<LoDTensor>("Out");
out->mutable_data<T>(context.GetPlace()); if (!in_place) {
out->mutable_data<T>(context.GetPlace());
}
auto result = EigenVector<T>::Flatten(*out); auto result = EigenVector<T>::Flatten(*out);
if (!in_place) { if (!in_place) {
math::SetConstant<DeviceContext, T> constant_functor; math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out, constant_functor(context.template device_context<DeviceContext>(), out,
......
...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp { ...@@ -40,8 +40,9 @@ class WriteToArrayOp : public ArrayOp {
if (x_tensor.memory_size() > 0) { if (x_tensor.memory_size() > 0) {
auto *out_tensor = &out->at(offset); auto *out_tensor = &out->at(offset);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
CopyFrom(x_tensor, place, dev_ctx, out_tensor); CopyFrom(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
...@@ -129,11 +130,12 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -129,11 +130,12 @@ class ReadFromArrayOp : public ArrayOp {
auto &x_array = x->Get<framework::LoDTensorArray>(); auto &x_array = x->Get<framework::LoDTensorArray>();
auto *out = scope.FindVar(Output("Out")); auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(out != nullptr, "Out must be set"); PADDLE_ENFORCE(out != nullptr, "Out must be set");
auto *out_tensor = out->GetMutable<framework::LoDTensor>();
size_t offset = GetOffset(scope, place); size_t offset = GetOffset(scope, place);
if (offset < x_array.size()) { if (offset < x_array.size()) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); auto *out_tensor = out->GetMutable<framework::LoDTensor>();
auto &dev_ctx = *pool.Borrow(place); platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
......
...@@ -25,12 +25,12 @@ namespace operators { ...@@ -25,12 +25,12 @@ namespace operators {
using StepScopeVar = std::vector<framework::Scope *>; using StepScopeVar = std::vector<framework::Scope *>;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
constexpr char kStepBlock[] = "sub_block"; static constexpr char kStepBlock[] = "sub_block";
constexpr char kCondition[] = "Condition"; static constexpr char kCondition[] = "Condition";
constexpr char kStepScopes[] = "StepScopes"; static constexpr char kStepScopes[] = "StepScopes";
constexpr char kParameters[] = "X"; static constexpr char kX[] = "X";
constexpr char kParamGrads[] = "X@GRAD"; static constexpr char kXGRAD[] = "X@GRAD";
constexpr char kOutputs[] = "Out"; static constexpr char kOutputs[] = "Out";
class WhileOp : public framework::OperatorBase { class WhileOp : public framework::OperatorBase {
public: public:
...@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kParameters, AddInput(kX,
"A set of variables, which are required by operators inside the " "A set of variables, which are required by operators inside the "
"block of While Op.") "block of While Op.")
.AsDuplicable(); .AsDuplicable();
...@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase {
executor.Run(*program, *cur_scope_iter, block->ID(), false); executor.Run(*program, *cur_scope_iter, block->ID(), false);
auto &pg_names = Outputs(kParamGrads); auto &pg_names = Outputs(kXGRAD);
auto &p_names = Inputs(kParameters); auto &p_names = Inputs(kX);
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
if (pg_names[param_id] == framework::kEmptyVarName) { if (pg_names[param_id] == framework::kEmptyVarName) {
...@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad = new framework::OpDesc(); auto *grad = new framework::OpDesc();
grad->SetType("while_grad"); grad->SetType("while_grad");
grad->SetInput(kParameters, Input(kParameters)); grad->SetInput(kX, Input(kX));
// Not all of IGs will be generated by inner gradient operators of while op. // Not all of IGs will be generated by inner gradient operators of while op.
// Ignore IGs that is not generated by the inside block. // Ignore IGs that is not generated by the inside block.
auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false); auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
std::unordered_set<std::string> all_outs; std::unordered_set<std::string> all_outs;
for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) { for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
...@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
} }
} }
grad->SetOutput(framework::GradVarName(kParameters), igs); grad->SetOutput(framework::GradVarName(kX), igs);
grad->SetInput(kOutputs, Output(kOutputs)); grad->SetInput(kOutputs, Output(kOutputs));
...@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
std::unordered_set<std::string> block_ins; std::unordered_set<std::string> block_ins;
auto *fwd_block = this->grad_block_[0]->ParentBlock(); auto *fwd_block = this->grad_block_[0]->ParentBlock();
{ {
for (auto &p : Input(kParameters)) { for (auto &p : Input(kX)) {
block_ins.insert(p); block_ins.insert(p);
} }
for (auto &o : Output(kOutputs)) { for (auto &o : Output(kOutputs)) {
...@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { ...@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override { framework::BlockDesc *block) const override {
auto p_names = op_desc.Input(kParameters); auto p_names = op_desc.Input(kX);
auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); auto pg_names = op_desc.Output(framework::GradVarName(kX));
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
...@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { ...@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
class WhileGradOpShapeInference : public framework::InferShapeBase { class WhileGradOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
ctx->HasInputs(kParameters); ctx->HasInputs(kX);
ctx->HasOutputs(framework::GradVarName(kParameters)); ctx->HasOutputs(framework::GradVarName(kX));
ctx->HasInputs(kOutputs); ctx->HasInputs(kOutputs);
ctx->HasInputs(framework::GradVarName(kOutputs)); ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kParameters); auto p_names = ctx->Inputs(kX);
auto pg_names = ctx->Outputs(kParamGrads); auto pg_names = ctx->Outputs(kXGRAD);
auto var_types = ctx->GetInputsVarType(kParameters); auto var_types = ctx->GetInputsVarType(kX);
std::vector<std::string> names_to_set; std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set; std::vector<framework::DDim> dims_to_set;
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
if (pg_names[i] == framework::kEmptyVarName) { if (pg_names[i] == framework::kEmptyVarName) {
continue; continue;
} }
auto dims = ctx->GetInputsElementDim(kParameters, i); auto dims = ctx->GetInputsElementDim(kX, i);
if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]); names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims); dims_to_set.push_back(dims);
......
...@@ -17,7 +17,7 @@ namespace platform { ...@@ -17,7 +17,7 @@ namespace platform {
DeviceContextPool* DeviceContextPool::pool = nullptr; DeviceContextPool* DeviceContextPool::pool = nullptr;
const platform::DeviceContext* DeviceContextPool::Borrow( const platform::DeviceContext* DeviceContextPool::Get(
const platform::Place& place) { const platform::Place& place) {
auto it = device_contexts_.find(place); auto it = device_contexts_.find(place);
if (it == device_contexts_.end()) { if (it == device_contexts_.end()) {
...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow( ...@@ -28,24 +28,6 @@ const platform::DeviceContext* DeviceContextPool::Borrow(
return it->second; return it->second;
} }
std::vector<const platform::DeviceContext*> DeviceContextPool::Borrow(
const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0);
PADDLE_ENFORCE_LE(places.size(), device_contexts_.size());
std::vector<const platform::DeviceContext*> borrowed_contexts;
for (auto& place : places) {
auto it = device_contexts_.find(place);
if (it != device_contexts_.end()) {
borrowed_contexts.emplace_back(it->second);
} else {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
}
return borrowed_contexts;
}
DeviceContextPool::DeviceContextPool( DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
......
...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext { ...@@ -52,6 +52,14 @@ class CPUDeviceContext : public DeviceContext {
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = CPUDeviceContext;
};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class EigenCudaStreamDevice; class EigenCudaStreamDevice;
...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext { ...@@ -90,6 +98,11 @@ class CUDADeviceContext : public DeviceContext {
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
}; };
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext;
};
class CUDNNDeviceContext : public CUDADeviceContext { class CUDNNDeviceContext : public CUDADeviceContext {
public: public:
explicit CUDNNDeviceContext(CUDAPlace place); explicit CUDNNDeviceContext(CUDAPlace place);
...@@ -102,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext { ...@@ -102,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext {
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
}; };
class DeviceGuard {
public:
explicit DeviceGuard(int device) {
original_device_ = platform::GetCurrentDeviceId();
platform::SetDeviceId(device);
}
~DeviceGuard() { platform::SetDeviceId(original_device_); }
private:
int original_device_;
};
#endif #endif
/*! \brief device context pool singleton */ /*! \brief device context pool singleton */
...@@ -121,13 +122,13 @@ class DeviceContextPool { ...@@ -121,13 +122,13 @@ class DeviceContextPool {
public: public:
explicit DeviceContextPool(const std::vector<platform::Place>& places); explicit DeviceContextPool(const std::vector<platform::Place>& places);
static DeviceContextPool& Get() { static DeviceContextPool& Instance() {
PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!"); PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
return *pool; return *pool;
} }
/*! \brief Create should only called by Init function */ /*! \brief Create should only called by Init function */
static DeviceContextPool& Create(const std::vector<platform::Place>& places) { static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
if (pool == nullptr) { if (pool == nullptr) {
pool = new DeviceContextPool(places); pool = new DeviceContextPool(places);
} }
...@@ -135,13 +136,14 @@ class DeviceContextPool { ...@@ -135,13 +136,14 @@ class DeviceContextPool {
} }
/*! \brief Return handle of single device context. */ /*! \brief Return handle of single device context. */
const platform::DeviceContext* Borrow(const platform::Place& place); const platform::DeviceContext* Get(const platform::Place& place);
/*! \brief Return handle of multi-device context. */
std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places);
~DeviceContextPool() {} template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
private: private:
static DeviceContextPool* pool; static DeviceContextPool* pool;
......
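For context on the renames in this hunk (Get → Instance, Create → Init, Borrow → Get, plus the new typed GetByPlace), a minimal usage sketch follows; it assumes a CPU-only build and the paddle/platform header paths shown above, and is not itself part of the diff.

#include <vector>

#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"

void DeviceContextPoolExample() {
  namespace plat = paddle::platform;
  // Normally done exactly once at startup by the framework's init code.
  std::vector<plat::Place> places;
  places.emplace_back(plat::CPUPlace());
  plat::DeviceContextPool::Init(places);

  // Fetch the singleton and look up contexts by place.
  plat::DeviceContextPool& pool = plat::DeviceContextPool::Instance();
  const plat::DeviceContext* generic = pool.Get(plat::CPUPlace());
  // GetByPlace resolves the concrete type through DefaultDeviceContextType.
  const plat::CPUDeviceContext* typed = pool.GetByPlace(plat::CPUPlace());
  (void)generic;
  (void)typed;
}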
...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) { ...@@ -71,35 +71,20 @@ TEST(Device, DeviceContextPool) {
using paddle::platform::CPUPlace; using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace; using paddle::platform::CUDAPlace;
DeviceContextPool& pool = DeviceContextPool::Get(); DeviceContextPool& pool = DeviceContextPool::Instance();
auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx1 = pool.Get(CPUPlace());
auto cpu_dev_ctx2 = pool.Borrow(CPUPlace()); auto cpu_dev_ctx2 = pool.Get(CPUPlace());
EXPECT_TRUE(cpu_dev_ctx2 == cpu_dev_ctx1); ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
std::vector<Place> gpu_places; std::vector<Place> gpu_places;
int count = paddle::platform::GetCUDADeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
gpu_places.emplace_back(CUDAPlace(i)); auto dev_ctx = pool.Get(CUDAPlace(i));
} ASSERT_NE(dev_ctx, nullptr);
auto dev_ctxs = pool.Borrow(gpu_places);
for (size_t i = 0; i < dev_ctxs.size(); ++i) {
auto* dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctxs[i]);
// check same as CUDAPlace(i)
CUDAPlace place = boost::get<CUDAPlace>(dev_ctx->GetPlace());
EXPECT_EQ(place.GetDeviceId(), static_cast<int>(i));
} }
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
int dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu DeviceContextPool, because the CUDA "
"device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places; std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace()); places.emplace_back(paddle::platform::CPUPlace());
...@@ -109,7 +94,7 @@ int main(int argc, char** argv) { ...@@ -109,7 +94,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> { ...@@ -62,7 +62,7 @@ struct ForRange<CUDADeviceContext> {
template <typename Function> template <typename Function>
inline void operator()(Function func) const { inline void operator()(Function func) const {
constexpr size_t num_threads = 1024; constexpr int num_threads = 1024;
int block_size = limit_ <= num_threads ? limit_ : num_threads; int block_size = limit_ <= num_threads ? limit_ : num_threads;
int grid_size = (limit_ + num_threads - 1) / num_threads; int grid_size = (limit_ + num_threads - 1) / num_threads;
......
...@@ -144,7 +144,7 @@ int main(int argc, char** argv) { ...@@ -144,7 +144,7 @@ int main(int argc, char** argv) {
} }
VLOG(0) << " DeviceCount " << count; VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Create(places); paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <iostream> #include <iostream>
#include "paddle/platform/enforce.h"
#include "paddle/platform/variant.h" #include "paddle/platform/variant.h"
namespace paddle { namespace paddle {
...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &); ...@@ -64,5 +64,31 @@ bool places_are_same_class(const Place &, const Place &);
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
template <typename Visitor>
struct PlaceVisitorWrapper
: public boost::static_visitor<typename Visitor::result_type> {
const Visitor &visitor_;
explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
typename Visitor::result_type operator()(const CPUPlace &cpu) const {
return visitor_(cpu);
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda);
#else
PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
return typename Visitor::result_type();
#endif
}
};
template <typename Visitor>
typename Visitor::result_type VisitPlace(const Place &place,
const Visitor &visitor) {
return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
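A short illustration (not in the diff) of the new VisitPlace helper added above: a visitor only needs a result_type and one operator() per place kind, and PlaceVisitorWrapper dispatches through boost::apply_visitor. The visitor below is hypothetical; it assumes CUDAPlace keeps its GetDeviceId() accessor.

#include <string>

#include "paddle/platform/place.h"

// Hypothetical visitor, for illustration only.
struct PlaceName {
  using result_type = std::string;
  std::string operator()(const paddle::platform::CPUPlace&) const {
    return "CPU";
  }
  std::string operator()(const paddle::platform::CUDAPlace& cuda) const {
    return "CUDA:" + std::to_string(cuda.GetDeviceId());
  }
};

std::string NameOfPlace(const paddle::platform::Place& place) {
  return paddle::platform::VisitPlace(place, PlaceName());
}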
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
licensed under the Apache License, Version 2.0 (the "License"); licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
`
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
...@@ -18,34 +18,134 @@ limitations under the License. */ ...@@ -18,34 +18,134 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
ProfilerState kState = ProfilerState::kDisabled; // The profiler state, the initial value is ProfilerState::kDisabled
uint32_t kNextThreadId = 0; static ProfilerState g_state = ProfilerState::kDisabled;
std::mutex kAllEventListsMutex; // The thread local event list can only be accessed by the specific thread
std::list<std::shared_ptr<EventList>> kAllEventLists; // The thread index of each thread
thread_local std::shared_ptr<EventList> kEventList; static thread_local int32_t g_thread_id;
thread_local int32_t kThreadId; // g_next_thread_id is a global counter for threads; from g_thread_id and
// g_next_thread_id we can know how many threads have created an EventList.
static uint32_t g_next_thread_id = 0;
// The global mutex
static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists;
// The thread local event list can only be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list;
inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch())
.count();
}
Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx)
: kind_(kind),
name_(std::move(name)),
thread_id_(thread_id),
has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
}
#endif
cpu_ns_ = GetTimeInNsec();
}
std::string Event::kind() const {
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
double Event::CpuElapsedUs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000.0);
}
double Event::CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
}
#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
#endif
inline EventList& GetEventList() {
if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>();
g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list);
}
return *g_event_list;
}
void Mark(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id,
dev_ctx);
}
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
dev_ctx_);
}
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::move(name_), g_thread_id,
dev_ctx_);
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
PADDLE_ENFORCE(kState == ProfilerState::kDisabled, PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
kState = state; g_state = state;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto ForEachDevice = [](std::function<void(int)> op) { if (g_state == ProfilerState::kCUDA) {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
DeviceGuard dev_guard(i);
op(i);
}
};
if (kState == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) { ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx); Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait(); dev_ctx->Wait();
}); });
...@@ -53,35 +153,36 @@ void EnableProfiler(ProfilerState state) { ...@@ -53,35 +153,36 @@ void EnableProfiler(ProfilerState state) {
} }
#endif #endif
// Mark the profiling start. // Mark the profiling start.
Mark("_start_profiler_"); Mark("_start_profiler_", nullptr);
} }
std::vector<std::vector<Event>> DisableProfiler() { std::vector<std::vector<Event>> DisableProfiler() {
PADDLE_ENFORCE(kState != ProfilerState::kDisabled, PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting."); "Can't disable profiling, since it's not starting.");
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_", nullptr);
kState = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> result; std::vector<std::vector<Event>> result;
std::lock_guard<std::mutex> guard(kAllEventListsMutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) { for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
auto& list = *it; ++it) {
result.emplace_back(list->Reduce()); result.emplace_back((*it)->Reduce());
} }
return result; return result;
} }
void PushEvent(const std::string name, const platform::DeviceContext* dev_ctx) { void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id,
dev_ctx); dev_ctx);
} }
void PopEvent(const std::string name, const platform::DeviceContext* dev_ctx) { void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, std::move(name), kThreadId, GetEventList().Record(EventKind::kPopRange, std::move(name), g_thread_id,
dev_ctx); dev_ctx);
} }
void ParseEvents(std::vector<std::vector<Event>> events) { void ParseEvents(std::vector<std::vector<Event>>& events) {
// Event name :: counts :: ave :: min :: max :: total
std::map<std::string, std::tuple<int, double, double>> events_table; std::map<std::string, std::tuple<int, double, double>> events_table;
for (size_t i = 0; i < events.size(); i++) { for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events; std::list<Event> pushed_events;
......
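To show how the reworked profiler API above fits together, here is a minimal CPU-mode sketch (not part of the commit; it assumes nullptr is acceptable as the DeviceContext in CPU profiling mode, as the header below notes):

#include <vector>

#include "paddle/platform/profiler.h"

void ProfileLoop() {
  namespace plat = paddle::platform;
  plat::EnableProfiler(plat::ProfilerState::kCPU);
  for (int i = 0; i < 10; ++i) {
    // RecordEvent pushes a range on construction and pops it on destruction.
    plat::RecordEvent record("compute", nullptr);
    // ... code to be timed ...
  }
  std::vector<std::vector<plat::Event>> events = plat::DisableProfiler();
  plat::ParseEvents(events);  // aggregate per name: counts, ave, min, max, total
}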
...@@ -24,76 +24,24 @@ namespace platform { ...@@ -24,76 +24,24 @@ namespace platform {
enum EventKind { kMark, kPushRange, kPopRange }; enum EventKind { kMark, kPushRange, kPopRange };
inline uint64_t GetTimeInNsec() {
// using std::chrono;
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
std::chrono::high_resolution_clock,
std::chrono::steady_clock>::type;
return std::chrono::duration_cast<std::chrono::nanoseconds>(
clock::now().time_since_epoch())
.count();
}
class Event { class Event {
public: public:
// the DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventKind kind, std::string name, uint32_t thread_id,
const platform::DeviceContext* dev_ctx = nullptr) DeviceContext* dev_ctx);
: kind_(kind), name_(std::move(name)), thread_id_(thread_id) {
has_cuda_ = false;
#ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx =
static_cast<const platform::CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
}
#endif
cpu_ns_ = GetTimeInNsec();
}
std::string kind() const {
switch (kind_) {
case EventKind::kMark:
return "mark";
case EventKind::kPushRange:
return "push";
case EventKind::kPopRange:
return "pop";
}
PADDLE_THROW("Unknown EventKind.");
}
std::string kind() const;
std::string name() const { return name_; } std::string name() const { return name_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
cudaEvent_t event() const { return event_; } cudaEvent_t event() const { return event_; }
int device() const { return device_; } int device() const { return device_; }
#endif #endif
double CpuElapsedUs(const Event& e) const { double CpuElapsedUs(const Event& e) const;
return (e.cpu_ns_ - cpu_ns_) / (1000.0); double CudaElapsedUs(const Event& e) const;
}
double CudaElapsedUs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0;
#else
PADDLE_THROW("CUDA is not enabled");
#endif
}
private: private:
EventKind kind_; EventKind kind_;
...@@ -108,11 +56,11 @@ class Event { ...@@ -108,11 +56,11 @@ class Event {
}; };
struct EventList { struct EventList {
constexpr static std::size_t kMB = 1024 * 1024; constexpr static size_t kMB = 1024 * 1024;
constexpr static std::size_t kEventBlockSize = 16 * kMB; constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static std::size_t kEventSize = sizeof(Event); constexpr static size_t kEventSize = sizeof(Event);
constexpr static std::size_t kEventAlign = alignof(Event); constexpr static size_t kEventAlign = alignof(Event);
constexpr static std::size_t kNumBlock = constexpr static size_t kNumBlock =
kEventBlockSize / kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
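The kNumBlock expression above first rounds the event size up to its alignment and then divides the 16 MB block by that stride; a tiny standalone illustration of the rounding idiom (not from the commit):

#include <cstddef>

// Round `size` up to the next multiple of `align` (the idiom used for kNumBlock).
constexpr std::size_t RoundUp(std::size_t size, std::size_t align) {
  return (size + align - 1) / align * align;
}

static_assert(RoundUp(48, 8) == 48, "already a multiple of the alignment");
static_assert(RoundUp(50, 8) == 56, "rounded up to the next multiple of 8");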
...@@ -139,69 +87,36 @@ struct EventList { ...@@ -139,69 +87,36 @@ struct EventList {
}; };
enum ProfilerState { enum ProfilerState {
kDisabled, kDisabled, // disabled state
kCPU, kCPU, // CPU profiling state
kCUDA, kCUDA, // GPU profiling state
}; };
// The profiler state, the initial value is ProfilerState::kDisabled void Mark(const std::string& name, DeviceContext* dev_ctx);
extern ProfilerState kState;
// The global mutex
extern std::mutex kAllEventListsMutex;
// The total event lists of all threads
extern std::list<std::shared_ptr<EventList>> kAllEventLists;
// The thread local event list only can be accessed by the specific thread
extern thread_local std::shared_ptr<EventList> kEventList;
// The thread index of each thread
extern thread_local int32_t kThreadId;
// The kNextThreadId is a global counter for threads, by the kThreadId and
// kNextThreadId, we can know how many threads have created EventList.
extern uint32_t kNextThreadId;
inline EventList& GetEventList() {
if (!kEventList) {
std::lock_guard<std::mutex> guard(kAllEventListsMutex);
kEventList = std::make_shared<EventList>();
kThreadId = kNextThreadId++;
kAllEventLists.emplace_front(kEventList);
}
return *kEventList;
}
inline void Mark(const std::string name,
const platform::DeviceContext* dev_ctx = nullptr) {
GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx);
}
void PushEvent(const std::string name, void PushEvent(const std::string& name, DeviceContext* dev_ctx);
const platform::DeviceContext* dev_ctx = nullptr);
void PopEvent(const std::string name, void PopEvent(const std::string& name, DeviceContext* dev_ctx);
const platform::DeviceContext* dev_ctx = nullptr);
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string name, explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
platform::DeviceContext* dev_ctx = nullptr) {
if (kState == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx;
name_ = name;
GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId,
dev_ctx_);
}
~RecordEvent() { ~RecordEvent();
if (kState == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::move(name_), kThreadId, // The device context is used by Event to get the current cuda stream.
dev_ctx_); DeviceContext* dev_ctx_;
} // Event name
platform::DeviceContext* dev_ctx_;
std::string name_; std::string name_;
}; };
// Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
// Return the event list of all threads. Assuming the returned value is called
// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
std::vector<std::vector<Event>> DisableProfiler(); std::vector<std::vector<Event>> DisableProfiler();
void ParseEvents(std::vector<std::vector<Event>>); void ParseEvents(std::vector<std::vector<Event>>&);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) { ...@@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventKind;
Event start_event(EventKind::kPushRange, "test", 0); Event start_event(EventKind::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false); EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0); Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0);
} }
...@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) { ...@@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) {
TEST(Event, CudaElapsedTime) { TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventKind; using paddle::platform::EventKind;
DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true); EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0; int counter = 0;
...@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) { ...@@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) {
DeviceContext* dev_ctx = nullptr; DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA; state = ProfilerState::kCUDA;
dev_ctx = dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif #endif
EnableProfiler(state); EnableProfiler(state);
...@@ -98,7 +98,9 @@ TEST(RecordEvent, RecordEvent) { ...@@ -98,7 +98,9 @@ TEST(RecordEvent, RecordEvent) {
int cuda_startup_count = 0; int cuda_startup_count = 0;
int start_profiler_count = 0; int start_profiler_count = 0;
int stop_profiler_count = 0; int stop_profiler_count = 0;
ParseEvents(events); ParseEvents(events);
for (size_t i = 0; i < events.size(); ++i) { for (size_t i = 0; i < events.size(); ++i) {
for (size_t j = 0; j < events[i].size(); ++j) { for (size_t j = 0; j < events[i].size(); ++j) {
if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
......
...@@ -3,6 +3,9 @@ if(WITH_PYTHON) ...@@ -3,6 +3,9 @@ if(WITH_PYTHON)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init DEPS pybind python backward proto_desc paddle_memory executor prune init
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_PYTHON) endif(WITH_PYTHON)
if(WITH_DOC) if(WITH_DOC)
......
...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) { ...@@ -171,12 +171,23 @@ void BindBlockDesc(py::module &m) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVar(name); return self.HasVar(name);
}) })
.def("has_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.HasVarRecursive(name);
})
.def("find_var", .def("find_var",
[](BlockDesc &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVar(name); return self.FindVar(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("find_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name;
return self.FindVarRecursive(name);
},
py::return_value_policy::reference)
.def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
.def("op_size", &BlockDesc::OpSize) .def("op_size", &BlockDesc::OpSize)
.def("op", &BlockDesc::Op, py::return_value_policy::reference) .def("op", &BlockDesc::Op, py::return_value_policy::reference)
...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) { ...@@ -204,7 +215,7 @@ void BindVarDsec(py::module &m) {
.def("set_shape", &VarDesc::SetShape) .def("set_shape", &VarDesc::SetShape)
.def("set_dtype", &VarDesc::SetDataType) .def("set_dtype", &VarDesc::SetDataType)
.def("shape", &VarDesc::Shape, py::return_value_policy::reference) .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLodLevel) .def("lod_level", &VarDesc::GetLodLevel)
.def("set_lod_level", &VarDesc::SetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel)
.def("type", &VarDesc::GetType) .def("type", &VarDesc::GetType)
...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) { ...@@ -236,14 +247,22 @@ void BindOpDesc(py::module &m) {
.value("BLOCK", proto::AttrType::BLOCK); .value("BLOCK", proto::AttrType::BLOCK);
py::class_<OpDesc> op_desc(m, "OpDesc", ""); py::class_<OpDesc> op_desc(m, "OpDesc", "");
op_desc.def("type", &OpDesc::Type) op_desc
.def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
py::return_value_policy::reference)
.def("copy_from", &OpDesc::CopyFrom)
.def("type", &OpDesc::Type)
.def("set_type", &OpDesc::SetType) .def("set_type", &OpDesc::SetType)
.def("input", &OpDesc::Input) .def("input", &OpDesc::Input)
.def("input_names", &OpDesc::InputNames) .def("input_names", &OpDesc::InputNames)
.def("set_input", &OpDesc::SetInput)
.def("output", &OpDesc::Output) .def("output", &OpDesc::Output)
.def("output_names", &OpDesc::OutputNames) .def("output_names", &OpDesc::OutputNames)
.def("set_input", &OpDesc::SetInput)
.def("set_output", &OpDesc::SetOutput) .def("set_output", &OpDesc::SetOutput)
.def("input_arg_names", &OpDesc::InputArgumentNames)
.def("output_arg_names", &OpDesc::OutputArgumentNames)
.def("rename_input", &OpDesc::RenameInput)
.def("rename_output", &OpDesc::RenameOutput)
.def("has_attr", &OpDesc::HasAttr) .def("has_attr", &OpDesc::HasAttr)
.def("attr_type", &OpDesc::GetAttrType) .def("attr_type", &OpDesc::GetAttrType)
.def("attr_names", &OpDesc::AttrNames) .def("attr_names", &OpDesc::AttrNames)
......
...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -269,23 +269,22 @@ All parameter, weight, gradient are variables in Paddle.
} }
return ret_values; return ret_values;
}); });
m.def("get_grad_op_descs", m.def(
[](const OpDesc &op_desc, "get_grad_op_desc", [](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set, const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> &grad_to_var, const std::vector<BlockDesc *> &grad_sub_block) {
const std::vector<BlockDesc *> &grad_sub_block) { std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<OpDesc>> grad_op_descs = std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance() framework::OpInfoMap::Instance()
.Get(op_desc.Type()) .Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var, .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block); grad_sub_block);
std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size()); std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform( std::transform(grad_op_descs.begin(), grad_op_descs.end(),
grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(),
grad_op_desc_ptrs.begin(), [](std::unique_ptr<OpDesc> &p) { return p.release(); });
[](std::unique_ptr<OpDesc> &p) { return p.release(); }); return std::make_pair(grad_op_desc_ptrs, grad_to_var);
return grad_op_desc_ptrs; });
});
m.def("prune", [](const ProgramDesc &origin, m.def("prune", [](const ProgramDesc &origin,
const std::vector<std::array<size_t, 2>> &targets) { const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin); ProgramDesc prog_with_targets(origin);
...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -301,6 +300,8 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc); InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc); return new ProgramDesc(pruned_desc);
}); });
m.def("empty_var_name", []() { return framework::kEmptyVarName; });
m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
m.def_submodule( m.def_submodule(
"var_names", "var_names",
"The module will return special predefined variable name in Paddle") "The module will return special predefined variable name in Paddle")
......
...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -63,9 +63,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<const platform::CUDADeviceContext *>( auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
pool.Borrow(tensor.place())); pool.Get(tensor.place()));
paddle::platform::GpuMemcpyAsync( paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
...@@ -76,10 +77,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -76,10 +77,10 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
} else if (paddle::platform::is_cpu_place(tensor.place())) { } else if (paddle::platform::is_cpu_place(tensor.place())) {
dst_tensor = tensor; dst_tensor = tensor;
} }
return py::buffer_info( return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()), py::format_descriptor<CUR_TYPE>::format(),
sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(), (size_t)framework::arity(dst_tensor.dims()),
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); dims_outside, strides);
} else { } else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value; constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor); return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray( ...@@ -137,9 +138,9 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Get(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = auto dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place)); static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream()); cudaMemcpyHostToDevice, dev_ctx->stream());
} }
......
...@@ -178,7 +178,7 @@ EOF ...@@ -178,7 +178,7 @@ EOF
# run paddle version to install python packages first # run paddle version to install python packages first
RUN apt-get update &&\ RUN apt-get update &&\
${NCCL_DEPS}\ ${NCCL_DEPS}\
apt-get install -y wget python-pip dmidecode && pip install -U pip && \ apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
pip install /*.whl; apt-get install -f -y && \ pip install /*.whl; apt-get install -f -y && \
apt-get clean -y && \ apt-get clean -y && \
rm -f /*.whl && \ rm -f /*.whl && \
......
...@@ -71,9 +71,7 @@ function threads_config() { ...@@ -71,9 +71,7 @@ function threads_config() {
# auto set OMP_NUM_THREADS and MKL_NUM_THREADS # auto set OMP_NUM_THREADS and MKL_NUM_THREADS
# according to trainer_count and total processors # according to trainer_count and total processors
# only when MKL enabled # only when MKL enabled
if [ "@WITH_MKL@" == "OFF" ]; then # auto set OPENBLAS_NUM_THREADS when do not use MKL
return 0
fi
processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
if [ -z $trainers ]; then if [ -z $trainers ]; then
...@@ -83,12 +81,19 @@ function threads_config() { ...@@ -83,12 +81,19 @@ function threads_config() {
if [ $threads -eq 0 ]; then if [ $threads -eq 0 ]; then
threads=1 threads=1
fi fi
if [ -z "$OMP_NUM_THREADS" ]; then if [ "@WITH_MKL@" == "ON" ]; then
export OMP_NUM_THREADS=$threads if [ -z "$OMP_NUM_THREADS" ]; then
fi export OMP_NUM_THREADS=$threads
if [ -z "$MKL_NUM_THREADS" ]; then fi
export MKL_NUM_THREADS=$threads if [ -z "$MKL_NUM_THREADS" ]; then
export MKL_NUM_THREADS=$threads
fi
else
if [ -z "$OPENBLAS_NUM_THREADS" ]; then
export OPENBLAS_NUM_THREADS=$threads
fi
fi fi
} }
PADDLE_CONF_HOME="$HOME/.config/paddle" PADDLE_CONF_HOME="$HOME/.config/paddle"
...@@ -150,7 +155,7 @@ fi ...@@ -150,7 +155,7 @@ fi
case "$1" in case "$1" in
"train") "train")
threads_config $@ threads_config $@
# echo $OMP_NUM_THREADS $MKL_NUM_THREADS # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2} ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
;; ;;
"merge_model") "merge_model")
......
...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid'] ...@@ -44,7 +44,7 @@ __all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz' DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat' LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In official 'readme', tstid is the flag of test data # In official 'readme', tstid is the flag of test data
......
...@@ -36,7 +36,7 @@ def __read_gflags_from_env__(): ...@@ -36,7 +36,7 @@ def __read_gflags_from_env__():
""" """
import sys import sys
import core import core
read_env_flags = ['use_pinned_memory'] read_env_flags = ['use_pinned_memory', 'check_nan_inf']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
......
from paddle.v2.fluid import framework as framework from paddle.v2.fluid import framework as framework
from . import core
import collections
__all__ = ['append_backward_ops'] __all__ = ['append_backward']
def append_backward_ops(loss, parameter_list=None, no_grad_set=None): def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
""" """
Create and add gradient Operators in BlockDesc to compute Traverse all ops in op_descs[begin_idx : end_idx],
gradients of `loss` for parameters in parameter_list if any op has inputs/outputs named "old_name", rename it as 'new_name'
"""
if begin_idx is None:
begin_idx = 0
if end_idx is None:
end_idx = len(op_descs)
for i in range(begin_idx, end_idx):
op_desc = op_descs[i]
if isinstance(op_desc, tuple):
op_desc = op_desc[0]
op_desc.rename_input(old_name, new_name)
op_desc.rename_output(old_name, new_name)
def _create_op_desc_(op_type, inputs, outputs, attrs):
"""
Create a C++ OpDesc object with specified inputs, outputs and attributes.
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in inputs.iteritems():
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
for name, val in attrs.iteritems():
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
op_desc.set_attr(name, val)
return op_desc
def _infer_var_data_type_(grad_var_name, block):
"""
Infer the data type of given grad variable
"""
grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
if block.desc.has_var_recursive(fwd_name):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
grad_var.set_dtype(fwd_var.dtype())
else:
grad_var.set_dtype(core.DataType.FP32)
def _all_in_set_(cands, s):
"""
Test if all elements of 'cands' are in set 's'
"""
if len(cands) == 0:
return False
for c in cands:
if not c in s:
return False
return True
def _strip_grad_suffix_(name):
"""
Strip the grad suffix from the given variable name
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
pos = name.find(core.grad_var_suffix())
return name[:pos] if pos != -1 else name
def _append_grad_suffix_(name):
"""
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
return name + core.grad_var_suffix()
def _addup_repetitive_outputs_(op_descs):
"""
In the backward pass, a variable may be the output of more than one op.
In this case, the variable should be the accumulation of all those outputs.
`sum_op`s are added to implement the accumulation.
"""
pending_sum_ops = []
var_rename_count = collections.defaultdict(int)
renamed_vars = collections.defaultdict(list)
for idx, op_desc in enumerate(op_descs):
for var_name in op_desc.input_arg_names():
if len(renamed_vars[var_name]) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": renamed_vars[var_name]},
{"Out": [var_name]}, {}), idx))
renamed_vars[var_name] = [var_name]
for var_name in op_desc.output_arg_names():
if var_name == core.empty_var_name(
) or var_name in op_desc.input_arg_names():
# empty variable or inplace op
continue
if len(renamed_vars[var_name]) == 0:
# it's the first time we get the variable
renamed_vars[var_name] = [var_name]
else:
if len(renamed_vars[var_name]) == 1:
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
# rename original var_name
renamed_vars[var_name][0] = new_name
_rename_arg_(op_descs, var_name, new_name, 0, idx)
_rename_arg_(pending_sum_ops, var_name, new_name)
new_name = var_name + "@RENAME@" + \
str(var_rename_count[var_name])
var_rename_count[var_name] += 1
op_desc.rename_output(var_name, new_name)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
if len(inputs) > 1:
pending_sum_ops.append((_create_op_desc_(
"sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
# sum_op descs are sorted according to their insert position
for p in reversed(pending_sum_ops):
op_descs.insert(p[1], p[0])
return op_descs
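For readers following the new backward pass, the snippet below illustrates the transformation that `_addup_repetitive_outputs_` performs when two grad ops write the same gradient variable. Plain dicts stand in for `core.OpDesc`; this is a sketch of the before/after op lists, not the real data structure.

```python
# Illustration only: two grad ops both write x@GRAD, so the pass renames each
# write with an @RENAME@<n> suffix and appends a sum_op that accumulates them
# back into x@GRAD. Dicts here are a hypothetical stand-in for core.OpDesc.
before = [
    {"type": "mul_grad",  "outputs": ["x@GRAD"]},
    {"type": "relu_grad", "outputs": ["x@GRAD"]},
]
after = [
    {"type": "mul_grad",  "outputs": ["x@GRAD@RENAME@0"]},
    {"type": "relu_grad", "outputs": ["x@GRAD@RENAME@1"]},
    # inserted accumulator: x@GRAD = x@GRAD@RENAME@0 + x@GRAD@RENAME@1
    {"type": "sum", "inputs": ["x@GRAD@RENAME@0", "x@GRAD@RENAME@1"],
     "outputs": ["x@GRAD"]},
]
print(before)
print(after)
```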
:param loss: an variable generated by cost function. def _remove_no_grad_branch_(op_descs, no_grad_set):
:type loss: Variable """
:param no_grad_set: variable that should not create gradient Remove unnecessary grad ops
:type no_grad_set: set A grad op can be removed in two cases:
:param parameter_list: parameters that need to compute gradient and 1. all outputs of the grad op are in 'no_grad_set'
update to optimize the lost. 2. all grad inputs of the grad op are in 'no_grad_set'
:type: list """
:return: list of (parameters, gradients) pair.
:rtype: list[Variable] def _op_can_be_removed_(op_desc, no_grad_set):
out_arg_names = op_desc.output_arg_names()
if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
return True
if _all_in_set_(
filter(lambda name: name.find(core.grad_var_suffix()) != -1,
op_desc.input_arg_names()), no_grad_set):
no_grad_set.union(out_arg_names)
return True
return False
# Remove ops whose outputs are all in no_grad_set
op_descs = filter(
lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
# Insert fill_zeros_like_op
to_insert = []
for idx, op_desc in enumerate(op_descs):
for arg in op_desc.input_arg_names():
if core.grad_var_suffix() in arg and arg in no_grad_set:
to_insert.append((_create_op_desc_("fill_zeros_like", {
"X": [_strip_grad_suffix_(arg)]
}, {"Y": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
return op_descs
def _append_backward_ops_(target,
block,
target_block,
no_grad_dict,
grad_to_var,
callback=None):
"""
Create all grad ops, and insert them into given block
Args:
target(Variable): the target variable of forward pass
block(Block): the block where forward ops are
target_block(Block): the block which is going to hold new generated grad ops
no_grad_dict(dict):
key(int) block index
val(set) a set of variable names. These variables have no gradient
grad_to_var(dict)(output argument):
key(str): grad variable name
val(str): corresponding forward variable name
"""
# grad_op_descs holds created grad_op, and will be appended to target_block
grad_op_descs = []
program = block.program
for op in reversed(block.ops):
grad_sub_block_list = []
# If the op has its own sub-block, deal with the sub-block first
if op.has_attr("sub_block"):
sub_block = program.block(op.block_attr("sub_block"))
grad_sub_block = program.create_block(parent_idx=sub_block.idx)
_append_backward_ops_(target, sub_block, grad_sub_block,
no_grad_dict, grad_to_var, callback)
grad_sub_block_list.append(grad_sub_block.desc)
# Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
no_grad_dict[block.idx])
if target_block.idx == 0:
grad_op_descs.insert(
0,
_create_op_desc_("fill_constant", {}, {
"Out": [_append_grad_suffix_(target.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": target.dtype}))
# append op_desc in grad_op_descs to target_block
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc)
def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
"""
Create new variables required by backward pass.
Args:
block(Block): the block where new variables will be created
start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
grad_to_var(dict):
key(str): grad variable name
val(str): corresponding forward variable name
In most cases, this dict is generated by _append_backward_ops_()
grad_info_map(dict)(output argument):
key(str): forward variable name
val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
"""
for op_idx in range(start_op_idx, block.desc.op_size()):
op_desc = block.desc.op(op_idx)
if op_desc.has_attr("sub_block"):
sub_block = block.program.block(op_desc.block_attr("sub_block"))
_append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
new_vars = set()
# create new gradient variables
for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii")
if block.desc.has_var_recursive(
grad_var_name) or grad_var_name == core.empty_var_name():
continue
block.desc.var(grad_var_name)
new_vars.add(grad_var_name)
if not grad_to_var.has_key(grad_var_name):
continue
grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
# infer_shape and infer_type
op_desc.infer_var_type(block.desc)
op_desc.infer_shape(block.desc)
for arg in op_desc.output_arg_names():
if arg in new_vars:
_infer_var_data_type_(arg, block)
def append_backward(loss, parameter_list=None, no_grad_set=None):
"""
Append backward part to main_program
Args:
loss(Variable): The variable generated by cost function.
parameter_list(list): Parameters that need to be updated by optimizer.
If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0.
If None, the set will be generated inside the function and
contains all variables with `step_gradient=True` from all blocks.
Return:
(list[Variable]): list of (parameters, gradients) pair.
""" """
assert isinstance(loss, framework.Variable) assert isinstance(loss, framework.Variable)
program = loss.block.program
no_grad_dict = dict()
if no_grad_set is None: if no_grad_set is None:
program = loss.block.program
assert isinstance(program, framework.Program) assert isinstance(program, framework.Program)
no_grad_set = list()
for block in program.blocks: for block in program.blocks:
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues(): for var in block.vars.itervalues():
assert isinstance(var, framework.Variable) assert isinstance(var, framework.Variable)
if var.stop_gradient: if var.stop_gradient:
no_grad_set.append(var.name) block_no_grad_set.add(_append_grad_suffix_(var.name))
no_grad_set = set(no_grad_set) no_grad_dict[block.idx] = block_no_grad_set
elif isinstance(no_grad_set, set):
no_grad_dict = {
0: set([_append_grad_suffix_(name) for name in no_grad_set])
}
else:
raise ValueError("'no_grad_set' should be a set or None.")
grad_info_map = dict()
root_block = program.block(0)
fwd_op_num = root_block.desc.op_size()
current_block_idx = program.current_block_idx
grad_to_var = dict()
_append_backward_ops_(loss, root_block, root_block, no_grad_dict,
grad_to_var)
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx
program.sync_with_cpp()
param_grad_map = loss.block.program.append_backward(loss, no_grad_set)
if parameter_list is not None: if parameter_list is not None:
parameters = parameter_list parameters = parameter_list
else: else:
params = loss.block.program.global_block().all_parameters() params = program.global_block().all_parameters()
parameters = [param.name for param in params] parameters = [param.name for param in params]
params_and_grads = [] params_and_grads = []
for param in parameters: for param in parameters:
if param not in param_grad_map: if param not in grad_info_map:
raise ValueError("param %s is not in map" % param) raise ValueError("param %s is not in map" % param)
grad_info = param_grad_map[param] grad_info = grad_info_map[param]
grad_block = loss.block.program.block(grad_info[1]) grad_block = grad_info[1]
if not grad_block.has_var(grad_info[0]): if not grad_block.has_var(grad_info[0]):
raise ValueError("grad block[{0}] did not have grad var {1}".format( raise ValueError("grad block[{0}] did not have grad var {1}".format(
grad_info[1], grad_info[0])) grad_info[1], grad_info[0]))
# Get the param var from the global block # Get the param var from the global block
param_var = loss.block.program.global_block().var(param) param_var = program.global_block().var(param)
grad_var = grad_block.var(grad_info[0]) grad_var = grad_block.var(grad_info[0])
if loss.block.has_var(grad_info[0]): if loss.block.has_var(grad_info[0]):
params_and_grads.append((param_var, grad_var)) params_and_grads.append((param_var, grad_var))
......
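With `append_backward_ops` renamed to `append_backward`, a typical call now looks like the sketch below. Only the call pattern is taken from this change and the updated tests; the small network itself (layer names, sizes) is illustrative.

```python
# Minimal sketch: build a forward net, then let append_backward add the grad
# ops to the loss's program and return (parameter, gradient) pairs.
import paddle.v2.fluid as fluid
from paddle.v2.fluid.backward import append_backward

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)

# each entry pairs a parameter with its newly created @GRAD variable
params_and_grads = append_backward(loss=avg_cost)
for param, grad in params_and_grads:
    print(param.name + " -> " + grad.name)
```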
...@@ -3,7 +3,7 @@ import core ...@@ -3,7 +3,7 @@ import core
import numpy import numpy
import six.moves as six import six.moves as six
from framework import Variable from framework import Variable, default_main_program
__all__ = ['DataFeeder'] __all__ = ['DataFeeder']
...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object): ...@@ -53,12 +53,16 @@ class DataToLoDTensorConverter(object):
class DataFeeder(object): class DataFeeder(object):
def __init__(self, feed_list, place): def __init__(self, feed_list, place, program=None):
self.feed_dtypes = [] self.feed_dtypes = []
self.feed_names = [] self.feed_names = []
self.feed_shapes = [] self.feed_shapes = []
self.feed_lod_level = [] self.feed_lod_level = []
if program is None:
program = default_main_program()
for each_var in feed_list: for each_var in feed_list:
if isinstance(each_var, basestring):
each_var = program.block(0).var(each_var)
if not isinstance(each_var, Variable): if not isinstance(each_var, Variable):
raise TypeError("Feed list should contain a list of variable") raise TypeError("Feed list should contain a list of variable")
self.feed_dtypes.append(each_var.dtype) self.feed_dtypes.append(each_var.dtype)
......
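The `DataFeeder` change above lets feed-list entries be plain variable names; they are resolved against block 0 of `program`, which defaults to `default_main_program()`. A minimal sketch, assuming the layer names used here:

```python
# Sketch: a string entry ('pixel') and a Variable entry (label) may now be
# mixed in feed_list; the string is looked up in block 0 of the default
# main program.
import paddle.v2.fluid as fluid

images = fluid.layers.data(name='pixel', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

place = fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=['pixel', label], place=place)
```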
...@@ -95,7 +95,9 @@ class DistributeTranspiler: ...@@ -95,7 +95,9 @@ class DistributeTranspiler:
""" """
if program is None: if program is None:
program = default_main_program() program = default_main_program()
self.program = program
self.trainers = trainers self.trainers = trainers
self.optimize_ops = optimize_ops
self._optimize_distributed( self._optimize_distributed(
optimize_ops, optimize_ops,
program, program,
...@@ -156,9 +158,10 @@ class DistributeTranspiler: ...@@ -156,9 +158,10 @@ class DistributeTranspiler:
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": epmap}) "epmap": epmap})
def get_trainer_program(optimize_ops, program): def get_trainer_program(self):
# remove optimize ops and add a send op to main_program # remove optimize ops and add a send op to main_program
program.global_block().delete_ops(optimize_ops) self.program.global_block().delete_ops(self.optimize_ops)
return self.program
def _create_var_for_trainers(self, block, var, trainers): def _create_var_for_trainers(self, block, var, trainers):
var_list = [] var_list = []
...@@ -210,7 +213,6 @@ class DistributeTranspiler: ...@@ -210,7 +213,6 @@ class DistributeTranspiler:
if opt_op.inputs.has_key("Grad"): if opt_op.inputs.has_key("Grad"):
if opt_op.inputs["Grad"].name in grad_var_names: if opt_op.inputs["Grad"].name in grad_var_names:
print "appending ", opt_op.type, opt_op.inputs
optimize_sub_program.global_block().append_op( optimize_sub_program.global_block().append_op(
type=opt_op.type, type=opt_op.type,
inputs=opt_op.inputs, inputs=opt_op.inputs,
......
import numpy as np import numpy as np
import contextlib
from framework import Program, default_main_program
from . import core from . import core
from framework import Program, default_main_program, Parameter, Variable
__all__ = ['Executor', 'g_scope'] __all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
g_scope = core.Scope() g_scope = core.Scope()
def global_scope():
return g_scope
def switch_scope(scope):
global g_scope
ex = g_scope
g_scope = scope
return ex
@contextlib.contextmanager
def scope_guard(scope):
ex = switch_scope(scope)
yield
switch_scope(ex)
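A minimal usage sketch of the new scope helpers; it relies only on names visible in this change (`core.Scope`, `scope_guard`, `global_scope`, `Executor`).

```python
# Run the startup program inside a fresh scope, then fall back to the
# process-wide scope once the guard exits.
import paddle.v2.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

scope = fluid.core.Scope()
with fluid.scope_guard(scope):
    # inside the guard, global_scope() returns `scope`, so exe.run() creates
    # its variables there
    exe.run(fluid.default_startup_program())

# outside the guard the original global scope is active again
assert fluid.global_scope() is not scope
```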
def as_numpy(tensor): def as_numpy(tensor):
if isinstance(tensor, list): if isinstance(tensor, list):
return [as_numpy(t) for t in tensor] return [as_numpy(t) for t in tensor]
...@@ -117,7 +136,7 @@ class Executor(object): ...@@ -117,7 +136,7 @@ class Executor(object):
raise TypeError() raise TypeError()
if scope is None: if scope is None:
scope = g_scope scope = global_scope()
program = program.clone() program = program.clone()
global_block = program.global_block() global_block = program.global_block()
......
...@@ -663,7 +663,7 @@ class Block(object): ...@@ -663,7 +663,7 @@ class Block(object):
end = list(self.ops).index(ops[-1]) end = list(self.ops).index(ops[-1])
except Exception, e: except Exception, e:
raise e raise e
self.desc.remove_op(start, end) self.desc.remove_op(start, end + 1)
def prepend_op(self, *args, **kwargs): def prepend_op(self, *args, **kwargs):
op_desc = self.desc.prepend_op() op_desc = self.desc.prepend_op()
...@@ -846,9 +846,11 @@ class Program(object): ...@@ -846,9 +846,11 @@ class Program(object):
self.sync_with_cpp() self.sync_with_cpp()
return param_to_grad_info return param_to_grad_info
def create_block(self): def create_block(self, parent_idx=None):
new_block_idx = len(self.blocks) new_block_idx = len(self.blocks)
self.desc.append_block(self.current_block().desc) parent = self.current_block() if parent_idx is None else self.block(
parent_idx)
self.desc.append_block(parent.desc)
self.current_block_idx = new_block_idx self.current_block_idx = new_block_idx
self.blocks.append(Block(self, self.current_block_idx)) self.blocks.append(Block(self, self.current_block_idx))
return self.current_block() return self.current_block()
......
...@@ -188,7 +188,7 @@ def save_inference_model(dirname, ...@@ -188,7 +188,7 @@ def save_inference_model(dirname,
raise ValueError("'feed_var_names' should be a list of str.") raise ValueError("'feed_var_names' should be a list of str.")
if isinstance(target_vars, Variable): if isinstance(target_vars, Variable):
feeded_var_names = [feeded_var_names] target_vars = [target_vars]
else: else:
if not (bool(target_vars) and all( if not (bool(target_vars) and all(
isinstance(var, Variable) for var in target_vars)): isinstance(var, Variable) for var in target_vars)):
......
...@@ -16,6 +16,36 @@ __all__ = [ ...@@ -16,6 +16,36 @@ __all__ = [
def split_lod_tensor(input, mask, level=0): def split_lod_tensor(input, mask, level=0):
"""
**split_lod_tensor**
This function takes in an input that contains the complete lod information,
and takes in a mask which is used to mask certain parts of the input.
The output is the true branch and the false branch with the mask applied to
the input at a certain level in the tensor.
Args:
input(tuple|list|None): The input tensor that contains complete
lod information needed to construct the output.
mask(list): A bool column vector which masks the input.
level(int): The specific lod level to rank.
Returns:
Variable: The true branch of tensor as per the mask applied to input.
Variable: The false branch of tensor as per the mask applied to input.
Examples:
.. code-block:: python
x = layers.data(name='x', shape=[1])
x.persistable = True
y = layers.data(name='y', shape=[1])
y.persistable = True
out_true, out_false = layers.split_lod_tensor(
input=x, mask=y, level=level)
"""
helper = LayerHelper('split_lod_tensor', **locals()) helper = LayerHelper('split_lod_tensor', **locals())
out_true = helper.create_tmp_variable(dtype=input.dtype) out_true = helper.create_tmp_variable(dtype=input.dtype)
out_false = helper.create_tmp_variable(dtype=input.dtype) out_false = helper.create_tmp_variable(dtype=input.dtype)
...@@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0): ...@@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0):
def merge_lod_tensor(in_true, in_false, x, mask, level=0): def merge_lod_tensor(in_true, in_false, x, mask, level=0):
"""
**merge_lod_tensor**
This function takes in an input :math:`x`, the True branch, the False
branch and a binary :math:`mask`. Using this information, this function
merges the True and False branches of the tensor into a single Output
at a certain lod level indicated by :math:`level`.
Args:
in_true(tuple|list|None): The True branch to be merged.
in_false(tuple|list|None): The False branch to be merged.
x(tuple|list|None): The input tensor that contains complete
lod information needed to construct the output.
mask(list): A bool column vector which masks the input.
level(int): The specific lod level to rank.
Returns:
Variable: The merged output tensor.
Examples:
.. code-block:: python
x = layers.data(
name='x', shape=[1], dtype='float32', stop_gradient=False)
y = layers.data(
name='y', shape=[1], dtype='bool', stop_gradient=False)
level = 0
out_true, out_false = layers.split_lod_tensor(
input=x, mask=y, level=level)
out = layers.merge_lod_tensor(
in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
"""
helper = LayerHelper('merge_lod_tensor', **locals()) helper = LayerHelper('merge_lod_tensor', **locals())
out = helper.create_tmp_variable(dtype=in_true.dtype) out = helper.create_tmp_variable(dtype=in_true.dtype)
helper.append_op( helper.append_op(
...@@ -397,9 +461,50 @@ class While(object): ...@@ -397,9 +461,50 @@ class While(object):
def lod_rank_table(x, level=0): def lod_rank_table(x, level=0):
""" """LoD Rank Table Operator. Given an input variable **x** and a level number
This function creates an operator for creating a LOD_RANK_TABLE of LoD, this layer creates a LodRankTable object. A LoDRankTable object
using the input x. contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Referring to the specified level of LoD,
the index is the sequence index number and the length represents the
sequence length. Please note that the list is ranked in descending order by
the length. The following is an example:
.. code-block:: text
x is a LoDTensor:
x.lod = [[0, 2, 3],
[0, 5, 6, 7]]
x.data = [a, b, c, d, e, f, g]
1. set level to 0:
Create lod rank table:
lod_rank_table_obj = lod_rank_table(x, level=0)
Get:
lod_rank_table_obj.items() = [(0, 2), (1, 1)]
2. set level to 1:
Create lod rank table:
lod_rank_table_obj = lod_rank_table(x, level=1)
Get:
lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
Args:
x (Variable): Input variable, a LoDTensor based which to create the lod
rank table.
level (int): Specify the LoD level, on which to create the lod rank
table.
Returns:
Variable: The created LoDRankTable object.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10],
dtype='float32', lod_level=1)
out = layers.lod_rank_table(x=x, level=0)
""" """
helper = LayerHelper("lod_rank_table", **locals()) helper = LayerHelper("lod_rank_table", **locals())
table = helper.create_variable( table = helper.create_variable(
...@@ -414,9 +519,25 @@ def lod_rank_table(x, level=0): ...@@ -414,9 +519,25 @@ def lod_rank_table(x, level=0):
def max_sequence_len(rank_table): def max_sequence_len(rank_table):
""" """Max Sequence Len Operator. Given a LoDRankTable object, this layer
This function creates an operator to calculate the length of returns the max length of a batch of sequences. In fact, a LoDRankTable
max seqence through input rank_table(should be a lod_rank_table) object contains a list of tuples(<sequence index, sequence length>) and
the list is already sorted by sequence length in descending order, so the
operator just returns the sequence length of the first tuple element.
Args:
rank_table (Variable): Input variable which is a LoDRankTable object.
Returns:
Variable: The max length of sequence.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10],
dtype='float32', lod_level=1)
rank_table = layers.lod_rank_table(x=x, level=0)
max_seq_len = layers.max_sequence_len(rank_table)
""" """
helper = LayerHelper("max_seqence_len", **locals()) helper = LayerHelper("max_seqence_len", **locals())
res = helper.create_tmp_variable(dtype="int64") res = helper.create_tmp_variable(dtype="int64")
...@@ -428,6 +549,30 @@ def max_sequence_len(rank_table): ...@@ -428,6 +549,30 @@ def max_sequence_len(rank_table):
def topk(input, k): def topk(input, k):
"""
**topk**
This function selects the k largest entries in the input
vector and outputs their values and indices as vectors. Thus topk_out[j] is
the j-th largest entry in input, and its index is topk_indices[j].
Args:
input (Variable|list): The input tensor that has all the data.
k (int): The number of top elements that the function will pick.
Returns:
Variable: The variable of type array that contains the k largest entries
from input.
Variable: The variable of type array that contains the indices of k
largest entries from input.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
k = 5
array = fluid.layers.topk(x, k)
"""
helper = LayerHelper('topk', **locals()) helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.data_type) topk_out = helper.create_tmp_variable(dtype=input.data_type)
topk_indices = helper.create_tmp_variable(dtype='int64') topk_indices = helper.create_tmp_variable(dtype='int64')
......
...@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): ...@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
Args: Args:
input(Variable): Input to the function input(Variable): Input to the function
size(tuple|list|None): Shape of the look up table parameter size(tuple|list|None): Shape of the look up table parameter
is_sparse(bool): Boolean flag that specifying whether the input is sparse is_sparse(bool): Boolean flag that specifying whether the input is sparse
param_attr(ParamAttr): Parameters for this layer param_attr(ParamAttr): Parameters for this layer
dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
...@@ -270,6 +270,7 @@ def gru_unit(input, ...@@ -270,6 +270,7 @@ def gru_unit(input,
attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
# create bias # create bias
if bias is None: if bias is None:
bias_size = [1, 3 * size] bias_size = [1, 3 * size]
bias = helper.create_parameter( bias = helper.create_parameter(
...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs): ...@@ -358,7 +359,59 @@ def cos_sim(X, Y, **kwargs):
def cross_entropy(input, label, **kwargs): def cross_entropy(input, label, **kwargs):
""" """
This function computes cross_entropy using the input and label. **Cross Entropy Layer**
This layer computes the cross entropy between `input` and `label`. It supports
both standard cross-entropy and soft-label cross-entropy loss computation.
1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
.. math::
Y[i] = -\log(X[i, Label[i]])
2) Soft-label cross-entropy:
`soft_label = True`, `Label[i, j]` indicates the soft label of class j
for sample i:
.. math::
Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
Please make sure that in this case the summation of each row of `label`
equals one.
3) One-hot cross-entropy with vectorized `label`:
As a special case of 2), when each row of 'label' has only one
non-zero element which is equal to 1, soft-label cross-entropy degenerates
to a one-hot cross-entropy with one-hot label representation.
Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability
computed by the previous operator, which is almost always the result
of a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape
[N x 1]. When `soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
the given labels as soft labels, default `False`.
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
`soft_label == True`, and the 2nd dimension of `input` and `label` are not \
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
Examples:
.. code-block:: python
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
""" """
helper = LayerHelper('cross_entropy', **kwargs) helper = LayerHelper('cross_entropy', **kwargs)
out = helper.create_tmp_variable(dtype=input.dtype) out = helper.create_tmp_variable(dtype=input.dtype)
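As a quick sanity check of the two formulas documented above, here is a small NumPy computation that is independent of fluid:

```python
# one-hot:     Y[i] = -log(X[i, Label[i]])
# soft label:  Y[i] = -sum_j Label[i, j] * log(X[i, j])
import numpy as np

X = np.array([[0.7, 0.2, 0.1],
              [0.1, 0.8, 0.1]])        # N=2 samples, D=3 classes
hard = np.array([0, 1])                # class indices, shape [N]
soft = np.array([[1.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0]])     # one-hot rows, each summing to one

loss_hard = -np.log(X[np.arange(2), hard])
loss_soft = -(soft * np.log(X)).sum(axis=1)

# the vectorized one-hot case degenerates to the hard-label case
assert np.allclose(loss_hard, loss_soft)
print(loss_hard)                       # [0.35667494 0.22314355]
```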
...@@ -373,8 +426,36 @@ def cross_entropy(input, label, **kwargs): ...@@ -373,8 +426,36 @@ def cross_entropy(input, label, **kwargs):
def square_error_cost(input, label, **kwargs): def square_error_cost(input, label, **kwargs):
""" """
This functions returns the squared error cost using the input and label. **Square error cost layer**
The output is appending the op to do the above.
This layer accepts input predictions and target label and returns the squared error cost.
For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
.. math::
Out = (X - Y)^2
In the above equation:
* :math:`X`: Input predictions, a tensor.
* :math:`Y`: Input labels, a tensor.
* :math:`Out`: Output value, same shape with :math:`X`.
Args:
input(Variable): Input tensor, has predictions.
label(Variable): Label tensor, has target labels.
Returns:
Variable: The tensor variable storing the element-wise squared error difference \
of input and label.
Examples:
.. code-block:: python
y = layers.data(name='y', shape=[1], dtype='float32')
y_predict = layers.data(name='y_predict', shape=[1], dtype='float32')
cost = layers.square_error_cost(input=y_predict, label=y)
""" """
helper = LayerHelper('square_error_cost', **kwargs) helper = LayerHelper('square_error_cost', **kwargs)
minus_out = helper.create_tmp_variable(dtype=input.dtype) minus_out = helper.create_tmp_variable(dtype=input.dtype)
...@@ -514,14 +595,83 @@ def conv2d(input, ...@@ -514,14 +595,83 @@ def conv2d(input,
groups=None, groups=None,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
act=None, act=None):
name=None):
""" """
This function creates the op for a 2-dimensional Convolution. **Convolution2D Layer**
This is performed using the parameters of filters(size, dimensionality etc)
, stride and other configurations for a Convolution operation. and the strides, paddings, dilations and groups parameters. Input(Input) and Output(Output)
This function can also append an activation on top of the are in NCHW format, where N is the batch size, C is the number of channels, H is the height
conv-2d output, if mentioned in the input parameters. are in NCHW format. Where N is batch size, C is the number of channels, H is the height
of the feature, and W is the width of the feature.
For details of the convolution layer, please refer to UFLDL's `convolution
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
If a bias attribute and an activation type are provided, the bias is added to the output of the convolution,
and the corresponding activation function is applied to the final result.
For each input :math:`X`, the equation is:
.. math::
Out = \sigma (W \\ast X + b)
In the above equation:
* :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
Example:
Input:
Input shape: $(N, C_{in}, H_{in}, W_{in})$
Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
Output:
Output shape: $(N, C_{out}, H_{out}, W_{out})$
Where
.. math::
H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filters. It is the same as the number of
output image channels.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding. Default: padding = 0.
groups(int): The groups number of the Conv2d Layer. According to grouped
convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
the first half of the filters is only connected to the first half
of the input channels, while the second half of the filters is only
connected to the second half of the input channels. Default: groups=1
param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
act(str): Activation type. Default: None
Returns:
Variable: The tensor variable storing the convolution and \
non-linearity activation result.
Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
""" """
if stride is None: if stride is None:
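Plugging the numbers from the docstring's example (input `[3, 32, 32]`, `num_filters=2`, `filter_size=3`, default stride 1, padding 0, dilation 1) into the output-shape formula above gives a 30 x 30 feature map:

```python
# Worked instance of H_out = (H_in + 2*pad - (dilation*(H_f - 1) + 1)) // stride + 1
def conv2d_out_size(in_size, filter_size, padding=0, stride=1, dilation=1):
    return (in_size + 2 * padding - (dilation * (filter_size - 1) + 1)) // stride + 1

h_out = conv2d_out_size(32, 3)   # (32 + 0 - 3) // 1 + 1 = 30
w_out = conv2d_out_size(32, 3)
assert (h_out, w_out) == (30, 30)
# so the example conv2d returns a tensor of shape [N, 2, 30, 30]
```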
...@@ -577,9 +727,9 @@ def conv2d(input, ...@@ -577,9 +727,9 @@ def conv2d(input,
def sequence_pool(input, pool_type, **kwargs): def sequence_pool(input, pool_type, **kwargs):
""" """
This function adds the operator for sequence pooling. This function adds the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied It pools features of all time-steps of each instance, and is applied
on top of the input using pool_type mentioned in the parameters. on top of the input using pool_type mentioned in the parameters.
It supports four pool_type: It supports four pool_type:
...@@ -608,7 +758,7 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -608,7 +758,7 @@ def sequence_pool(input, pool_type, **kwargs):
Args: Args:
input(variable): The input variable which is a LoDTensor. input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool. pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max. It supports average, sum, sqrt and max.
Returns: Returns:
...@@ -618,7 +768,7 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -618,7 +768,7 @@ def sequence_pool(input, pool_type, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
...@@ -666,7 +816,7 @@ def sequence_first_step(input, **kwargs): ...@@ -666,7 +816,7 @@ def sequence_first_step(input, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
x_first_step = fluid.layers.sequence_first_step(input=x) x_first_step = fluid.layers.sequence_first_step(input=x)
""" """
...@@ -699,7 +849,7 @@ def sequence_last_step(input, **kwargs): ...@@ -699,7 +849,7 @@ def sequence_last_step(input, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
x_last_step = fluid.layers.sequence_last_step(input=x) x_last_step = fluid.layers.sequence_last_step(input=x)
""" """
...@@ -1018,25 +1168,26 @@ def lstm_unit(x_t, ...@@ -1018,25 +1168,26 @@ def lstm_unit(x_t,
.. math:: .. math::
i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i) i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f) f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c) c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o) o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
h_t & = o_t tanh(c_t) h_t & = o_t tanh(c_t)
The inputs of lstm unit includes :math:`x_t`, :math:`h_{t-1}` and The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
:math:`c_{t-1}`. The implementation separates the linear transformation :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
and non-linear transformation apart. Here, we take :math:`i_t` as an should be same. The implementation separates the linear transformation and
example. The linear transformation is applied by calling a `fc` layer and non-linear transformation apart. Here, we take :math:`i_t` as an example.
the equation is: The linear transformation is applied by calling a `fc` layer and the
equation is:
.. math:: .. math::
L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
The non-linear transformation is applied by calling `lstm_unit_op` and the The non-linear transformation is applied by calling `lstm_unit_op` and the
equation is: equation is:
...@@ -1048,9 +1199,12 @@ def lstm_unit(x_t, ...@@ -1048,9 +1199,12 @@ def lstm_unit(x_t,
This layer has two outputs including :math:`h_t` and :math:`o_t`. This layer has two outputs including :math:`h_t` and :math:`o_t`.
Args: Args:
x_t (Variable): The input value of current step. x_t (Variable): The input value of current step, a 2-D tensor with shape
hidden_t_prev (Variable): The hidden value of lstm unit. M x N, M for batch size and N for input size.
cell_t_prev (Variable): The cell value of lstm unit. hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
with shape M x S, M for batch size and S for size of lstm unit.
cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
shape M x S, M for batch size and S for size of lstm unit.
forget_bias (float): The forget bias of lstm unit. forget_bias (float): The forget bias of lstm unit.
param_attr (ParamAttr): The attributes of parameter weights, used to set param_attr (ParamAttr): The attributes of parameter weights, used to set
initializer, name etc. initializer, name etc.
...@@ -1063,14 +1217,15 @@ def lstm_unit(x_t, ...@@ -1063,14 +1217,15 @@ def lstm_unit(x_t,
Raises: Raises:
ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
and **cell_t_prev** not be the same. and **cell_t_prev** not be the same or the 2nd dimensions of \
**hidden_t_prev** and **cell_t_prev** not be the same.
Examples: Examples:
.. code-block:: python .. code-block:: python
x_t = fluid.layers.fc(input=x_t_data, size=10) x_t = fluid.layers.fc(input=x_t_data, size=10)
prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20) prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30)
prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
hidden_t_prev=prev_hidden, hidden_t_prev=prev_hidden,
...@@ -1089,7 +1244,11 @@ def lstm_unit(x_t, ...@@ -1089,7 +1244,11 @@ def lstm_unit(x_t,
if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
0] != cell_t_prev.shape[0]: 0] != cell_t_prev.shape[0]:
raise ValueError("The 1s dimension of x_t, hidden_t_prev and " raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
"cell_t_prev must be the same.")
if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
raise ValueError("The 2nd dimensions of hidden_t_prev and "
"cell_t_prev must be the same.") "cell_t_prev must be the same.")
if bias_attr is None: if bias_attr is None:
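A purely illustrative NumPy transcription of the revised equations (no peephole terms). Treating `forget_bias` as an offset on the forget-gate pre-activation is an assumption about `lstm_unit_op`, not something stated in this change.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_unit_step(x_t, h_prev, c_prev, W_x, W_h, b, forget_bias=0.0):
    # one fc produces the pre-activations of [i, f, c, o];
    # W_x: N x 4S, W_h: S x 4S, b: 4S
    gates = x_t.dot(W_x) + h_prev.dot(W_h) + b
    i, f, c_hat, o = np.split(gates, 4, axis=1)
    c_t = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(c_hat)
    h_t = sigmoid(o) * np.tanh(c_t)
    return h_t, c_t

# shapes follow the Args section: x_t is M x N, h_prev and c_prev are M x S
M, N, S = 4, 10, 30
rng = np.random.RandomState(0)
h_t, c_t = lstm_unit_step(rng.randn(M, N), rng.randn(M, S), rng.randn(M, S),
                          rng.randn(N, 4 * S), rng.randn(S, 4 * S),
                          rng.randn(4 * S))
assert h_t.shape == (M, S) and c_t.shape == (M, S)
```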
...@@ -1118,17 +1277,17 @@ def lstm_unit(x_t, ...@@ -1118,17 +1277,17 @@ def lstm_unit(x_t,
def reduce_sum(input, dim=None, keep_dim=False): def reduce_sum(input, dim=None, keep_dim=False):
""" """
Computes the sum of tensor elements over the given dimension. Computes the sum of tensor elements over the given dimension.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the sum is performed. If dim (int|None): The dimension along which the sum is performed. If
:attr:`None`, sum all elements of :attr:`input` and return a :attr:`None`, sum all elements of :attr:`input` and return a
Tensor variable with a single element, otherwise must be in the Tensor variable with a single element, otherwise must be in the
range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
the dimension to reduce is :math:`rank + dim`. the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. than the :attr:`input` unless :attr:`keep_dim` is true.
Returns: Returns:
...@@ -1162,17 +1321,17 @@ def reduce_sum(input, dim=None, keep_dim=False): ...@@ -1162,17 +1321,17 @@ def reduce_sum(input, dim=None, keep_dim=False):
def reduce_mean(input, dim=None, keep_dim=False): def reduce_mean(input, dim=None, keep_dim=False):
""" """
Computes the mean of tensor elements over the given dimension. Computes the mean of tensor elements over the given dimension.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the mean is computed. If dim (int|None): The dimension along which the mean is computed. If
:attr:`None`, compute the mean over all elements of :attr:`input` :attr:`None`, compute the mean over all elements of :attr:`input`
and return a Tensor variable with a single element, otherwise and return a Tensor variable with a single element, otherwise
must be in the range :math:`[-rank(input), rank(input))`. If must be in the range :math:`[-rank(input), rank(input))`. If
:math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. than the :attr:`input` unless :attr:`keep_dim` is true.
Returns: Returns:
...@@ -1206,22 +1365,22 @@ def reduce_mean(input, dim=None, keep_dim=False): ...@@ -1206,22 +1365,22 @@ def reduce_mean(input, dim=None, keep_dim=False):
def reduce_max(input, dim=None, keep_dim=False): def reduce_max(input, dim=None, keep_dim=False):
""" """
Computes the maximum of tensor elements over the given dimension. Computes the maximum of tensor elements over the given dimension.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the maximum is computed. dim (int|None): The dimension along which the maximum is computed.
If :attr:`None`, compute the maximum over all elements of If :attr:`None`, compute the maximum over all elements of
:attr:`input` and return a Tensor variable with a single element, :attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`. otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. than the :attr:`input` unless :attr:`keep_dim` is true.
Returns: Returns:
Variable: The reduced Tensor variable. Variable: The reduced Tensor variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1250,22 +1409,22 @@ def reduce_max(input, dim=None, keep_dim=False): ...@@ -1250,22 +1409,22 @@ def reduce_max(input, dim=None, keep_dim=False):
def reduce_min(input, dim=None, keep_dim=False): def reduce_min(input, dim=None, keep_dim=False):
""" """
Computes the minimum of tensor elements over the given dimension. Computes the minimum of tensor elements over the given dimension.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the minimum is computed. dim (int|None): The dimension along which the minimum is computed.
If :attr:`None`, compute the minimum over all elements of If :attr:`None`, compute the minimum over all elements of
:attr:`input` and return a Tensor variable with a single element, :attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`. otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. than the :attr:`input` unless :attr:`keep_dim` is true.
Returns: Returns:
Variable: The reduced Tensor variable. Variable: The reduced Tensor variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
...@@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input, ...@@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input,
def ones(shape, dtype): def ones(shape, dtype):
""" """
This function performs the same function as fill_constant() declared above **ones**
with the constant value being 1.0.
This function creates a tensor of specified *shape* and
*dtype*, and initializes it with 1.
It also sets *stop_gradient* to True.
Args:
shape(tuple|list|None): Shape of output tensor
dtype(np.dtype|core.DataType|str): Data type of output tensor
Returns:
Variable: The tensor variable storing the output
Examples:
.. code-block:: python
data = fluid.layers.ones(shape=[1], dtype='int64')
""" """
return fill_constant(value=1.0, **locals()) return fill_constant(value=1.0, **locals())
def zeros(shape, dtype): def zeros(shape, dtype):
""" """
This function performs the same function as fill_constant() declared above **zeros**
with the constant value being 0.0.
This function creates a tensor of specified *shape* and
*dtype*, and initializes it with 0.
It also sets *stop_gradient* to True.
Args:
shape(tuple|list|None): Shape of output tensor
dtype(np.dtype|core.DataType|str): Data type of output tensor
Returns:
Variable: The tensor variable storing the output
Examples:
.. code-block:: python
data = fluid.layers.zeros(shape=[1], dtype='int64')
""" """
return fill_constant(value=0.0, **locals()) return fill_constant(value=0.0, **locals())
from collections import defaultdict from collections import defaultdict
import framework import framework
from backward import append_backward_ops from backward import append_backward
from framework import unique_name, program_guard from framework import unique_name, program_guard
from initializer import Constant from initializer import Constant
from layer_helper import LayerHelper from layer_helper import LayerHelper
...@@ -194,10 +194,10 @@ class Optimizer(object): ...@@ -194,10 +194,10 @@ class Optimizer(object):
no_grad_set=None): no_grad_set=None):
"""Add operations to minimize `loss` by updating `parameter_list`. """Add operations to minimize `loss` by updating `parameter_list`.
This method combines interface `append_backward_ops()` and This method combines interface `append_backward()` and
`create_optimization_pass()` into one. `create_optimization_pass()` into one.
""" """
params_grads = append_backward_ops(loss, parameter_list, no_grad_set) params_grads = append_backward(loss, parameter_list, no_grad_set)
params_grads = append_gradient_clip_ops(params_grads) params_grads = append_gradient_clip_ops(params_grads)
......
...@@ -170,7 +170,7 @@ def main(): ...@@ -170,7 +170,7 @@ def main():
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor() embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set( embedding_param.set(
load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
......
...@@ -38,35 +38,43 @@ train_reader = paddle.batch( ...@@ -38,35 +38,43 @@ train_reader = paddle.batch(
place = fluid.CPUPlace() place = fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
# all parameter server endpoints list for spliting parameters
pserver_endpoints = os.getenv("PSERVERS") pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv("TRAINING_ROLE", training_role = os.getenv("TRAINING_ROLE",
"TRAINER") # get the training role: trainer/pserver "TRAINER") # get the training role: trainer/pserver
t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1) t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
if training_role == "PSERVER": if training_role == "PSERVER":
pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops) if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
exe.run(pserver_prog) exe.run(pserver_prog)
elif training_role == "TRAINER": elif training_role == "TRAINER":
trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(feed_list=[images, label], place=place) feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
batch_id = 0
for data in train_reader(): for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(), loss, acc = exe.run(trainer_prog,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics) fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
# print loss, acc if batch_id % 100 == 0:
if loss < 10.0 and pass_acc > 0.9: print("batch_id %d, loss: %f, acc: %f" %
# if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. (batch_id, loss, pass_acc))
exit(0) batch_id += 1
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
else: else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER") print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
exit(1)
import paddle.v2.fluid as fluid
__all__ = ['many_times', 'prog_scope']
def many_times(times):
def __impl__(fn):
def __fn__(*args, **kwargs):
for _ in range(times):
fn(*args, **kwargs)
return __fn__
return __impl__
def prog_scope():
def __impl__(fn):
def __fn__(*args, **kwargs):
prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
with fluid.program_guard(prog, startup_prog):
fn(*args, **kwargs)
return __fn__
return __impl__
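A hedged usage sketch of the two helpers above; the import path of `decorators` and the layer names are assumptions made for illustration.

```python
import paddle.v2.fluid as fluid
from decorators import many_times, prog_scope   # module path assumed

@many_times(3)
@prog_scope()
def build_tiny_net():
    # each of the 3 runs gets its own Program, startup Program and Scope
    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    y = fluid.layers.fc(input=x, size=2)

build_tiny_net()
```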
...@@ -4,7 +4,7 @@ import random ...@@ -4,7 +4,7 @@ import random
import itertools import itertools
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import collections import collections
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.framework import Program, OpProtoHolder from paddle.v2.fluid.framework import Program, OpProtoHolder
...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase): ...@@ -491,7 +491,7 @@ class OpTest(unittest.TestCase):
op_loss.desc.infer_var_type(block.desc) op_loss.desc.infer_var_type(block.desc)
op_loss.desc.infer_shape(block.desc) op_loss.desc.infer_shape(block.desc)
param_grad_list = append_backward_ops( param_grad_list = append_backward(
loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set) loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
feed_dict = { feed_dict = {
......
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
from paddle.v2.fluid import core
from paddle.v2.fluid.op import Operator
class TestAdamOp1(OpTest): class TestAdamOp1(OpTest):
...@@ -176,5 +178,124 @@ def adam_step(inputs, attributes): ...@@ -176,5 +178,124 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
:param attributes: dict of attributes
:return tuple: tuple of output param, moment1 and moment2
'''
param = inputs['Param']
# grad = inputs['Grad']
moment1 = inputs['Moment1']
moment2 = inputs['Moment2']
lr = inputs['LearningRate']
beta1_pow = inputs['Beta1Pow']
beta2_pow = inputs['Beta2Pow']
beta1 = attributes['beta1']
beta2 = attributes['beta2']
epsilon = attributes['epsilon']
moment1_out = np.zeros(shape=[height, row_numel])
moment2_out = np.zeros(shape=[height, row_numel])
param_out = np.zeros(shape=[height, row_numel])
for idx, row_id in enumerate(rows):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
) * np_grad[idx]
moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(np_grad[idx])
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
np.sqrt(moment2_out[row_id]) + epsilon))
return param_out, moment1_out, moment2_out
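For reference, `adam_step_sparse` realizes the standard Adam update, applied only to the rows present in the sparse gradient; `Beta1Pow` and `Beta2Pow` are written beta_1^t and beta_2^t below:

```latex
\begin{aligned}
m_t          &= \beta_1 m_{t-1} + (1-\beta_1)\, g_t \\
v_t          &= \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2 \\
\hat{\eta}_t &= \eta \, \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t} \\
\theta_t     &= \theta_{t-1} - \hat{\eta}_t \, \frac{m_t}{\sqrt{v_t}+\epsilon}
\end{aligned}
```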
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
height = 10
rows = [0, 4, 7]
self.rows = rows
row_numel = 12
self.row_numel = row_numel
self.dense_inputs = {
"Param": np.full((height, row_numel), 5.0).astype("float32"),
"Moment1": np.full((height, row_numel), 5.0).astype("float32"),
"Moment2": np.full((height, row_numel), 5.0).astype("float32"),
'Beta1Pow': np.array([beta1**10]).astype("float32"),
'Beta2Pow': np.array([beta2**10]).astype("float32"),
"LearningRate": np.full((1), 2.0).astype("float32")
}
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
np_array = np.ones((len(rows), row_numel)).astype("float32")
np_array[0, 0] = 2.0
np_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(np_array, place)
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2
}
def check_with_place(self, place):
scope = core.Scope()
self.setup(scope, place)
op_args = dict()
for key, np_array in self.dense_inputs.iteritems():
var = scope.var(key).get_tensor()
var.set(np_array, place)
op_args[key] = key
for s in self.sparse_inputs:
op_args[s] = s
for s in self.outputs:
var = scope.var(s).get_tensor()
var.set(self.outputs[s], place)
op_args[s] = s
for k in self.attrs:
op_args[k] = self.attrs[k]
# create and run the adam operator
adam_op = Operator("adam", **op_args)
adam_op.run(scope, place)
for key, np_array in self.outputs.iteritems():
out_var = scope.var(key).get_tensor()
actual = np.array(out_var)
actual = actual.reshape([actual.size])
np_array = np_array.reshape([np_array.size])
for idx, row_id in enumerate(self.rows):
j = 0
while j < self.row_numel:
pos = row_id * self.row_numel + j
self.assertLess(abs(actual[pos] - np_array[pos]) / actual[pos], 0.00001)
j += 1
def test_sparse_adam(self):
places = [core.CPUPlace()]
if core.is_compile_gpu():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase): ...@@ -64,7 +64,7 @@ class TestArrayReadWrite(unittest.TestCase):
total_sum = layers.sums(input=[a_sum, x_sum]) total_sum = layers.sums(input=[a_sum, x_sum])
total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0) total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
append_backward_ops(total_sum_scaled) append_backward(total_sum_scaled)
g_vars = map(default_main_program().global_block().var, g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]) [each_x.name + "@GRAD" for each_x in x])
......
...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers ...@@ -3,7 +3,7 @@ import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.framework import default_startup_program, default_main_program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase): ...@@ -26,7 +26,7 @@ class ConditionalBlock(unittest.TestCase):
outs = exe.run(feed={'X': x}, fetch_list=[out])[0] outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
print outs print outs
loss = layers.mean(x=out) loss = layers.mean(x=out)
append_backward_ops(loss=loss) append_backward(loss=loss)
outs = exe.run( outs = exe.run(
feed={'X': x}, feed={'X': x},
fetch_list=[ fetch_list=[
......
import unittest
import numpy as np
from op_test import OpTest
class TestDetectionOutputOp(OpTest):
def setUp(self):
self.op_type = "detection_output"
self.init_test_case()
# loc.shape: (1, 4, 4, 1, 1)
# conf.shape: (1, 4, 2, 1, 1)
loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
[[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]],
[[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
priorbox = np.array([
0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
0.8, 0.8, 0.1, 0.1, 0.2, 0.2
])
output = np.array([
0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
])
self.inputs = {
'Loc': loc.astype('float32'),
'Conf': conf.astype('float32'),
'PriorBox': priorbox.astype('float32')
}
self.attrs = {
'num_classes': self.num_classes,
'top_k': self.top_k,
'nms_top_k': self.nms_top_k,
'background_label_id': self.background_label_id,
'nms_threshold': self.nms_threshold,
'confidence_threshold': self.confidence_threshold,
}
self.outputs = {'Out': output.astype('float32')}
def test_check_output(self):
self.check_output()
def init_test_case(self):
self.num_classes = 2
self.top_k = 10
self.nms_top_k = 20
self.background_label_id = 0
self.nms_threshold = 0.01
self.confidence_threshold = 0.01
if __name__ == '__main__':
unittest.main()
import numpy
import random
import collections
import paddle.v2.fluid as fluid
import unittest
from decorators import *
class Memory(object):
def __init__(self, shape, dtype='float32'):
self.ex = numpy.zeros(shape=shape, dtype=dtype)
self.cur = None
def update(self, val):
assert val.shape == self.ex.shape
assert val.dtype == self.ex.dtype
self.cur = val
def next(self):
self.ex = self.cur
self.cur = None
def __next__(self):
self.next()
def reset(self):
self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
self.cur = None
class Output(object):
def __init__(self):
self.outs = []
def next_sequence(self):
self.outs.append([])
def out(self, val):
self.outs[-1].append(val)
def last(self):
return self.outs[-1][-1]
class BaseRNN(object):
def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
self.num_seq = num_seq
self.inputs = collections.defaultdict(list)
for _ in xrange(num_seq):
seq_len = random.randint(1, max_seq_len - 1)
for iname in ins:
ishape = ins[iname].get('shape', None)
idtype = ins[iname].get('dtype', 'float32')
lst = []
for _ in xrange(seq_len):
lst.append(numpy.random.random(size=ishape).astype(idtype))
self.inputs[iname].append(lst)
self.mems = dict()
for mname in mems:
mshape = mems[mname].get('shape', None)
mdtype = mems[mname].get('dtype', 'float32')
self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
self.params = dict()
for pname in params:
pshape = params[pname].get('shape', None)
pdtype = params[pname].get('dtype', 'float32')
self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
self.outputs = dict()
for oname in outs:
self.outputs[oname] = Output()
def step(self, **kwargs):
raise NotImplementedError()
def exe(self):
retv = dict()
for out in self.outputs:
retv[out] = []
for seq_id in xrange(self.num_seq):
for mname in self.mems:
self.mems[mname].reset()
for out in self.outputs:
self.outputs[out].next_sequence()
iname0 = self.inputs.keys()[0]
seq_len = len(self.inputs[iname0][seq_id])
for step_id in xrange(seq_len):
xargs = dict()
for iname in self.inputs:
xargs[iname] = self.inputs[iname][seq_id][step_id]
for mname in self.mems:
xargs[mname] = self.mems[mname]
for pname in self.params:
xargs[pname] = self.params[pname]
for out in self.outputs:
xargs[out] = self.outputs[out]
self.step(**xargs)
for mname in self.mems:
next(self.mems[mname])
for out in self.outputs:
retv[out].append(self.outputs[out].last())
for out in retv:
retv[out] = numpy.array(retv[out])
return retv
def to_feed(self, place):
feed_dict = dict()
for iname in self.inputs:
lod = [0]
np_flatten = []
for seq_id in xrange(len(self.inputs[iname])):
seq_len = len(self.inputs[iname][seq_id])
lod.append(lod[-1] + seq_len)
np_flatten.extend(self.inputs[iname][seq_id])
t = fluid.Tensor()
t.set(numpy.array(np_flatten), place)
t.set_lod([lod])
feed_dict[iname] = t
for pname in self.params:
feed_dict[pname] = self.params[pname]
return feed_dict
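# to_feed flattens every variable-length sequence into one tensor and records
# the cumulative offsets in a level-of-detail (LoD) list, e.g. three sequences
# of lengths 3, 1 and 2 give lod = [0, 3, 4, 6]; this matches the lod_level=1
# data layers used in the tests below.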
def get_numeric_gradient_of_param(self, param_name, delta=0.001):
p = self.params[param_name]
if len(p.shape) != 2:
raise ValueError("Not support get numeric gradient of an parameter,"
" which is not matrix")
g = numpy.zeros(shape=p.shape, dtype=p.dtype)
for i in xrange(p.shape[0]):
for j in xrange(p.shape[1]):
o = p[i][j]
p[i][j] += delta
pos = self._exe_mean_out_()
p[i][j] -= 2 * delta
neg = self._exe_mean_out_()
p[i][j] = o
g[i][j] = (pos - neg) / (delta * 2)
return g
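# The loop above is a central finite difference: each element is perturbed by
# +delta and -delta, the mean network output is re-evaluated, and the gradient
# is estimated as (pos - neg) / (2 * delta). The analytic gradients produced
# by append_backward are later compared against this estimate with a loose
# rtol, since the estimate is only approximate.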
def get_numeric_gradient_of_input(self,
input_name,
delta=0.001,
return_one_tensor=True):
ipt = self.inputs[input_name]
grad = []
for seq in ipt:
seq_grad = []
for item in seq:
item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype)
if len(item.shape) != 1:
raise ValueError("Not support")
for i in xrange(len(item)):
o = item[i]
item[i] += delta
pos = self._exe_mean_out_()
item[i] -= 2 * delta
neg = self._exe_mean_out_()
item[i] = o
item_grad[i] = (pos - neg) / (delta * 2)
seq_grad.append(item_grad)
grad.append(seq_grad)
if not return_one_tensor:
return grad
for i in xrange(len(grad)):
grad[i] = numpy.concatenate(grad[i])
grad = numpy.concatenate(grad)
return grad
def _exe_mean_out_(self):
outs = self.exe()
return numpy.array([o.mean() for o in outs.itervalues()]).mean()
class TestSimpleMul(unittest.TestCase):
DATA_NAME = 'X'
DATA_WIDTH = 32
PARAM_NAME = 'W'
HIDDEN_WIDTH = 10
OUT_NAME = 'Out'
class SimpleMul(BaseRNN):
def __init__(self):
base = TestSimpleMul
super(base.SimpleMul, self).__init__({
base.DATA_NAME: {
'shape': [base.DATA_WIDTH]
}
}, {}, {
base.PARAM_NAME: {
'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
}
}, [base.OUT_NAME])
def step(self, X, W, Out):
Out.out(numpy.matmul(X, W))
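# SimpleMul is the numpy reference: every RNN step is just X * W with no
# memory, so the fluid DynamicRNN built below (an fc layer inside rnn.block())
# should reproduce its outputs, and its analytic gradients are checked against
# the numeric ones computed by BaseRNN.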
# Test many times locally to ensure the random seed does not break CI
# @many_times(10)
@prog_scope()
def test_forward_backward(self):
py_rnn = TestSimpleMul.SimpleMul()
dat = fluid.layers.data(
name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
dat.stop_gradient = False
rnn = fluid.layers.DynamicRNN()
with rnn.block():
d = rnn.step_input(dat)
o = fluid.layers.fc(input=d,
param_attr=self.PARAM_NAME,
bias_attr=False,
size=self.HIDDEN_WIDTH,
act=None)
rnn.output(o)
out = rnn()
out = fluid.layers.sequence_pool(out, pool_type='last')
loss = fluid.layers.mean(x=out)
fluid.backward.append_backward(loss)
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
out, w_g, i_g = map(numpy.array,
exe.run(feed=py_rnn.to_feed(cpu),
fetch_list=[
out, self.PARAM_NAME + "@GRAD",
self.DATA_NAME + "@GRAD"
],
return_numpy=False))
out_by_python = py_rnn.exe()[self.OUT_NAME]
self.assertTrue(numpy.allclose(out, out_by_python))
w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05))
i_g_num = py_rnn.get_numeric_gradient_of_input(
input_name=self.DATA_NAME)
i_g_num = i_g_num.reshape(i_g.shape)
self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05))
class TestSimpleMulWithMemory(unittest.TestCase):
DATA_WIDTH = 32
HIDDEN_WIDTH = 20
DATA_NAME = 'X'
PARAM_NAME = 'W'
class SimpleMulWithMemory(BaseRNN):
def __init__(self):
super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
TestSimpleMulWithMemory.DATA_NAME: {
'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
}
}, {'Mem': {
'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
}}, {
TestSimpleMulWithMemory.PARAM_NAME: {
'shape': [
TestSimpleMulWithMemory.DATA_WIDTH,
TestSimpleMulWithMemory.HIDDEN_WIDTH
]
}
}, ['Out'])
def step(self, X, Mem, W, Out):
o = numpy.matmul(X, W)
assert isinstance(Mem, Memory)
o += Mem.ex
Mem.update(o)
assert isinstance(Out, Output)
Out.out(o)
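# Here the memory holds the previous step's output, so the reference
# recurrence is o_t = X_t * W + o_{t-1} (with o_0 = 0). The fluid graph below
# mirrors this with rnn.memory(...), elementwise_add and update_memory.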
# many_times is used locally for debugging, to make sure the calculation is stable.
# @many_times(10)
@prog_scope()
def test_forward_backward(self):
py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
data = fluid.layers.data(
name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
data.stop_gradient = False
rnn = fluid.layers.DynamicRNN()
with rnn.block():
d = rnn.step_input(data)
mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH])
hidden = fluid.layers.fc(input=d,
size=self.HIDDEN_WIDTH,
param_attr=self.PARAM_NAME,
bias_attr=False,
act=None)
o = fluid.layers.elementwise_add(x=hidden, y=mem)
rnn.update_memory(mem, o)
rnn.output(o)
out = rnn()
last = fluid.layers.sequence_pool(input=out, pool_type='last')
loss = fluid.layers.mean(x=last)
fluid.backward.append_backward(loss)
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
feed = py_rnn.to_feed(cpu)
last_np, w_g, i_g = map(numpy.array,
exe.run(feed=feed,
fetch_list=[
last, self.PARAM_NAME + "@GRAD",
self.DATA_NAME + "@GRAD"
],
return_numpy=False))
last_by_py, = py_rnn.exe().values()
w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
self.assertTrue(numpy.allclose(last_np, last_by_py))
self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1))
i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME)
i_g_num = i_g_num.reshape(i_g.shape)
# Since this RNN performs many floating-point additions, the result is not
# perfectly stable, so a loose tolerance (rtol = 0.1) is used below.
self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1))
if __name__ == '__main__':
unittest.main()
...@@ -177,8 +177,8 @@ class TestBook(unittest.TestCase): ...@@ -177,8 +177,8 @@ class TestBook(unittest.TestCase):
name='x_t_data', shape=[10, 10], dtype='float32') name='x_t_data', shape=[10, 10], dtype='float32')
x_t = layers.fc(input=x_t_data, size=10) x_t = layers.fc(input=x_t_data, size=10)
prev_hidden_data = layers.data( prev_hidden_data = layers.data(
name='prev_hidden_data', shape=[10, 20], dtype='float32') name='prev_hidden_data', shape=[10, 30], dtype='float32')
prev_hidden = layers.fc(input=prev_hidden_data, size=20) prev_hidden = layers.fc(input=prev_hidden_data, size=30)
prev_cell_data = layers.data( prev_cell_data = layers.data(
name='prev_cell', shape=[10, 30], dtype='float32') name='prev_cell', shape=[10, 30], dtype='float32')
prev_cell = layers.fc(input=prev_cell_data, size=30) prev_cell = layers.fc(input=prev_cell_data, size=30)
......
...@@ -4,7 +4,7 @@ import numpy ...@@ -4,7 +4,7 @@ import numpy
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): ...@@ -170,7 +170,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
mean = layers.mean(x=result) mean = layers.mean(x=result)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
......
import unittest
import numpy as np
from op_test import OpTest
def norm(input, scale, epsilon):
s0, s1, s2, s3 = input.shape
x_square = input * input
for i in xrange(s0):
input_batch = input[i:i + 1, :, :, :]
input_batch = input_batch.reshape(s1, s2 * s3)
x_square_batch = x_square[i:i + 1, :, :, :]
x_square_batch = x_square_batch.reshape(s1, s2 * s3)
square_colsum = x_square_batch.sum(axis=0) + epsilon
tmp = pow(square_colsum, 0.5)
tmp = np.reciprocal(tmp)
tmp_tile = np.tile(tmp, s1)
tmp_tile = tmp_tile.reshape(s1, s2 * s3)
scale_tile = np.tile(scale, (1, s2 * s3))
scale_tile = scale_tile.reshape(s1, s2 * s3)
out_batch = input_batch * tmp_tile * scale_tile
out_batch = out_batch.reshape(1, s1, s2, s3)
if i == 0:
out = out_batch
else:
out = np.concatenate((out, out_batch), 0)
out = out.reshape(s0, s1, s2, s3)
return out
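# The reference above implements a cross-channel L2 normalization: for every
# spatial position, out[c] = x[c] * scale[c] / sqrt(sum_over_channels(x^2) + epsilon),
# which is what the "norm" operator under test is expected to compute.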
class TestNormOp(OpTest):
def setUp(self):
self.op_type = "norm"
self.init_test_case()
input = np.random.random(self.shape).astype("float32")
scale = np.array([10, 10, 10])
self.inputs = {
'X': input.astype('float32'),
'Scale': scale.astype('float32')
}
self.attrs = {'epsilon': self.epsilon}
output = norm(input, scale, self.epsilon)
self.outputs = {'Out': output.astype('float32')}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
def init_test_case(self):
self.shape = [2, 3, 2, 2]
self.epsilon = 1e-6
if __name__ == '__main__':
unittest.main()
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestOptimizer(unittest.TestCase): class TestOptimizer(unittest.TestCase):
...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -102,7 +102,7 @@ class TestMomentumOptimizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase): ...@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
momentum_optimizer = self.MockMomentum( momentum_optimizer = self.MockMomentum(
learning_rate=learning_rate, momentum=0.2, use_nesterov=True) learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(momentum_optimizer.get_accumulators()), 0) self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
opts = momentum_optimizer.create_optimization_pass( opts = momentum_optimizer.create_optimization_pass(
...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase): ...@@ -209,7 +209,7 @@ class TestAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adagrad_optimizer = self.MockAdagrad( adagrad_optimizer = self.MockAdagrad(
learning_rate=learning_rate, epsilon=1.0e-6) learning_rate=learning_rate, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out, opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase): ...@@ -269,7 +269,7 @@ class TestAdamOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adam_optimizer = self.MockAdam( adam_optimizer = self.MockAdam(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adam_optimizer.get_accumulators()), 0) self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
opts = adam_optimizer.create_optimization_pass(params_grads, mul_out, opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase): ...@@ -331,7 +331,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
adamax_optimizer = self.MockAdamax( adamax_optimizer = self.MockAdamax(
learning_rate=learning_rate, beta1=0.9, beta2=0.999) learning_rate=learning_rate, beta1=0.9, beta2=0.999)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(adamax_optimizer.get_accumulators()), 0) self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out, opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase): ...@@ -390,7 +390,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
learning_rate = 0.01 learning_rate = 0.01
decayed_adagrad_optimizer = self.MockDecayedAdagrad( decayed_adagrad_optimizer = self.MockDecayedAdagrad(
learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6) learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
opts = decayed_adagrad_optimizer.create_optimization_pass( opts = decayed_adagrad_optimizer.create_optimization_pass(
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, grad_var_name from paddle.v2.fluid.framework import Program, grad_var_name
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase): ...@@ -177,7 +177,7 @@ class RecurrentOpTest1(unittest.TestCase):
def test_backward(self): def test_backward(self):
self.check_forward() self.check_forward()
append_backward_ops(self.output) append_backward(self.output)
ana_grad = [np.array(x) for x in self.backward()] ana_grad = [np.array(x) for x in self.backward()]
......
...@@ -3,7 +3,7 @@ import unittest ...@@ -3,7 +3,7 @@ import unittest
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.regularizer as regularizer import paddle.v2.fluid.regularizer as regularizer
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestL2DecayRegularizer(unittest.TestCase): class TestL2DecayRegularizer(unittest.TestCase):
...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase): ...@@ -33,7 +33,7 @@ class TestL2DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase): ...@@ -70,7 +70,7 @@ class TestL1DecayRegularizer(unittest.TestCase):
dtype="float32", shape=[1], lod_level=0, name="mean.out") dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op( block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
params_grads = append_backward_ops(mean_out) params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1) self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops) count_ops = len(block.ops)
params_grads = optimizer.append_regularization_ops(params_grads) params_grads = optimizer.append_regularization_ops(params_grads)
......
...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -12,7 +12,7 @@ class TestReorderLoDTensor(unittest.TestCase):
new_dat = fluid.layers.reorder_lod_tensor_by_rank( new_dat = fluid.layers.reorder_lod_tensor_by_rank(
x=dat, rank_table=table) x=dat, rank_table=table)
loss = fluid.layers.mean(x=new_dat) loss = fluid.layers.mean(x=new_dat)
fluid.backward.append_backward_ops(loss=loss) fluid.backward.append_backward(loss=loss)
cpu = fluid.CPUPlace() cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu) exe = fluid.Executor(cpu)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
from paddle.v2.fluid.framework import Program from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy as np import numpy as np
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
from paddle.v2.fluid.framework import default_main_program from paddle.v2.fluid.framework import default_main_program
import numpy import numpy
...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase): ...@@ -35,7 +35,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2]))
mem3_mean = layers.mean(x=mem3) mem3_mean = layers.mean(x=mem3)
append_backward_ops(loss=mem3_mean) append_backward(loss=mem3_mean)
x_grad = exe.run( x_grad = exe.run(
feed={'x': tensor}, feed={'x': tensor},
fetch_list=[main_program.global_block().var('x@GRAD')])[0] fetch_list=[main_program.global_block().var('x@GRAD')])[0]
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.framework import Program, program_guard from paddle.v2.fluid.framework import Program, program_guard
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
class TestCPULoDTensorArrayOps(unittest.TestCase): class TestCPULoDTensorArrayOps(unittest.TestCase):
...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): ...@@ -133,7 +133,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
in_true=out_true, in_false=out_false, mask=y, x=x, level=level) in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
mean = layers.mean(x=out) mean = layers.mean(x=out)
append_backward_ops(mean) append_backward(mean)
tensor = core.LoDTensor() tensor = core.LoDTensor()
tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place) tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
......
...@@ -2,7 +2,7 @@ import unittest ...@@ -2,7 +2,7 @@ import unittest
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward
import numpy import numpy
...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase): ...@@ -46,7 +46,7 @@ class TestWhileOp(unittest.TestCase):
sum_result = layers.array_read(array=mem_array, i=i) sum_result = layers.array_read(array=mem_array, i=i)
loss = layers.mean(x=sum_result) loss = layers.mean(x=sum_result)
append_backward_ops(loss) append_backward(loss)
cpu = core.CPUPlace() cpu = core.CPUPlace()
exe = Executor(cpu) exe = Executor(cpu)
......