Commit dcfbbd3f authored by caoying03

Merge branch 'develop' into crf

@@ -49,11 +49,12 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
     FIND_PACKAGE(CUDA REQUIRED)
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
......
@@ -345,6 +345,11 @@ clip
 .. autoclass:: paddle.v2.layer.clip
     :noindex:

+resize
+------
+.. autoclass:: paddle.v2.layer.resize
+    :noindex:
+
 slope_intercept
 ---------------
 .. autoclass:: paddle.v2.layer.slope_intercept
......
@@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.

 The following C++ program shows how blocks are used with the `if-else` structure:

 ```c++
+namespace pd = paddle;
+
 int x = 10;
-int y = 20;
-int out;
+int y = 1;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
   int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
-  out = z;
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
 }
 ```

 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:

@@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator

 ```python
 import paddle as pd

-x = var(10)
-y = var(20)
-cond = var(false)
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+x = minibatch([10, 20, 30])  # shape=[None, 1]
+y = var(1)  # shape=[1], value=1
+z = minibatch([10, 20, 30])  # shape=[None, 1]
+cond = larger_than(x, 15)  # [false, true, true]
+
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
-    z = operator.add(x, y)
-    ie.set_output(true, 0, operator.softmax(z))
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
 with ie.false_block():
-    x = ie.inputs(false, 0)
-    z = layer.fc(x)
-    ie.set_output(true, 0, operator.softmax(z))
-out = b(cond)
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
 ```

-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the true branch computes `x+y` and `softmax(x+y)`, and the false branch computes `fc(z)` and `fc(z)+1`.

 A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances. The `ie.input(true, 0)` invocation returns the instances in the 0-th input, `x`, that correspond to true values in `cond` as the local variable `x`, while `ie.input(false, 0)` returns the instances corresponding to false values.
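The instance-routing semantics can be made concrete with a small sketch. The following is plain numpy rather than the Paddle API, and the branch bodies are arbitrary stand-ins:

```python
import numpy as np

x = np.array([10., 20., 30.])
cond = x > 15                  # [False, True, True]

true_in = x[cond]              # what ie.input(true, 0) would return
false_in = x[~cond]            # what ie.input(false, 0) would return

true_out = true_in + 1.0       # stand-in for the true-branch subnet
false_out = false_in * 2.0     # stand-in for the false-branch subnet

out = np.empty_like(x)         # merge the branch outputs back, in order
out[cond] = true_out
out[~cond] = false_out
print(out)                     # [20. 21. 31.]
```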
 ### Blocks with `for` and `RNNOp`

 The following RNN model from the [RNN design doc](./rnn.md)

 ```python
-x = sequence([10, 20, 30])
-m = var(0)
-W = tensor()
-U = tensor()
+x = sequence([10, 20, 30])  # shape=[None, 1]
+m = var(0)  # shape=[1]
+W = var(0.314, param=true)  # shape=[1]
+U = var(0.375, param=true)  # shape=[1]

-rnn = create_rnn(inputs=[input])
-with rnn.stepnet() as net:
-    x = net.set_inputs(0)
-    h = net.add_memory(init=m)
-    fc_out = pd.matmul(W, x)
-    hidden_out = pd.matmul(U, h.pre(n=1))
-    sum = pd.add_two(fc_out, hidden_out)
-    act = pd.sigmoid(sum)
-    h.update(act)                        # update memory with act
-    net.set_outputs(0, act, hidden_out)  # two outputs
+rnn = pd.rnn()
+with rnn.step():
+    h = rnn.memory(init=m)
+    hh = rnn.previous_memory(h)
+    a = layer.fc(W, x)
+    b = layer.fc(U, hh)
+    s = pd.add(a, b)
+    act = pd.sigmoid(s)
+    rnn.update_memory(h, act)
+    rnn.output(a, b)

 o1, o2 = rnn()
-print o1, o2
 ```

 has its equivalent C++ program as follows

 ```c++
 int* x = {10, 20, 30};
-int m = 0;
-int W = some_value();
-int U = some_other_value();
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};

 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@@ -131,20 +135,16 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
   int x = x[i-1];
   if (i == 1) mem[0] = m;
-  int fc_out = W * x;
-  int hidden_out = U * mem[i-1];
-  int sum = fc_out + hidden_out;
-  int act = sigmoid(sum);
+  int a = W * x;
+  int b = U * mem[i-1];
+  int s = a + b;
+  int act = sigmoid(s);
   mem[i] = act;
   o1[i] = act;
-  o2[i] = hidden_out;
+  o2[i] = b;
 }
-print_array(o1);
-print_array(o2);
 ```
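To make the memory indexing concrete, here is a runnable plain-Python rendering of the unrolled loop above (ordinary floats, not the Paddle API):

```python
import math

x = [10., 20., 30.]
m, W, U = 0., 0.314, 0.375

mem = [m]                  # mem[0] holds the initial memory
o1, o2 = [], []
for step_input in x:
    a = W * step_input     # "fc" over the current input
    b = U * mem[-1]        # "fc" over the previous memory
    act = 1. / (1. + math.exp(-(a + b)))  # sigmoid
    mem.append(act)        # update the memory
    o1.append(act)
    o2.append(b)
print(o1, o2)
```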
## Compilation and Execution

Like TensorFlow programs, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the second part executes the message for training or inference.

@@ -210,11 +210,11 @@ a = pd.Varaible(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])

 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
-    x = net.set_inputs(a)
+with rnn.stepnet():
+    x = a.as_step_input()
     # reuse fc's parameter
     fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)

 out = rnn()
 ```
......
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_block().
-
-```python
-import paddle as pd
-
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
-```
-
-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
-
-```python
-import paddle as pd
-
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-out = b(cond)
-```
-
-where default_value is a list of vars for `cond` == False.
+# The `IfElse` Operator
+
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
+
+- the TensorFlow version takes a scalar boolean value as the condition, so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition; instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30])  # shape=[None, 1]
+y = var(1)  # shape=[1], value=1
+z = minibatch([10, 20, 30])  # shape=[None, 1]
+cond = larger_than(x, 15)  # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+A challenge in implementing the `IfElse` operator is to infer which variables are to be split, or, say, to identify the variable of the mini-batch and those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+```
-# Design Doc: ProgramDesc
-
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
-
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+# Design Doc: PaddlePaddle Programs
+
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):

 ```python
 x = layer.data("images")
@@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```

-generates, or compiles, a PaddlePaddle program, which is represented by the following protobuf message:
-
-```protobuf
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
-}
-
-message BlockDesc {
-  required int32 parent = 1;
-  repeated VarDesc vars = 2;
-  repeated OpDesc ops = 3;
-}
-
-message OpDesc {
-  AttrDesc attrs = 1;
-  ...
-}
-
-message AttrDesc {
-  required AttrType type = 1;
-
-  // index into ProgramDesc::blocks when type==BLOCK
-  optional int32 block = 2;
-  ...
-}
-```
-
-When each of the first five lines runs, the related Python function, e.g., `layer.fc`, calls C++ InferShape functions. This InferShape function needs to access the properties of the VarDescs used by the current OpDesc. These VarDescs might not be defined in the current block, but in some ancestor block. This requires that we can trace the parent of a block.
-
-A nested block is often an attribute of an operator, most likely an IfElseOp or a WhileOp. In the above solution, all blocks are in `ProgramDesc::blocks`; this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks` -- so that `AttrDesc::block` can be an integer block ID.
+The first five lines of this program generate, or compile, the `ProgramDesc` message. The last line runs it.
+
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, as in a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs. For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd // block 0
+
+x = minibatch([10, 20, 30])  # shape=[None, 1]
+y = var(1)  # shape=[1], value=1
+z = minibatch([10, 20, 30])  # shape=[None, 1]
+cond = larger_than(x, 15)  # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():  // block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  // block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straightforward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
+message BlockDesc {
+  required int32 parent = 1;
+  repeated VarDesc vars = 2;
+  repeated OpDesc ops = 3;
+}
+```
+
+The parent ID indicates the parent block, so that operators in a block can refer to variables defined locally and also to those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+### Global Block
+
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
+message OpDesc {
+  AttrDesc attrs = 1;
+  ...
+}
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+
+```
+message AttrDesc {
+  required string name = 1;
+
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;

+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```
+
+## InferShape

 With this design, the InferShape function should take the following parameters:
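The flattened-block arrangement that InferShape relies on can be sketched in a few lines of Python; this is a toy structure, not the real `ProgramDesc`:

```python
# Blocks live in one flat array; block-typed attributes store indices
# into that array, and InferShape-style lookups walk the parent links.
program = {
    "blocks": [
        {"parent": -1},   # block 0: the global block
        {"parent": 0},    # block 1: e.g., an IfElseOp's true branch
        {"parent": 1},    # block 2: nested inside block 1
    ]
}

def ancestors(program, block_id):
    """Return the chain of blocks searched when resolving a variable."""
    chain = []
    while block_id != -1:
        chain.append(block_id)
        block_id = program["blocks"][block_id]["parent"]
    return chain

assert ancestors(program, 2) == [2, 1, 0]
```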
......
# Design Doc: Python API
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
| Python classes | Protobuf messages |
| --- | --- |
| Program | ProgramDesc |
| Block | BlockDesc |
| Operator | OpDesc |
| Variable | VarDesc |
Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
## Core Concepts
### Program
A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array, so that, for example, operators in the step block of an RNN operator can access variables in their ancestor blocks.
Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
```python
class Program(object):
    def __init__(self):
        self.proto = core.NewProgram()   # a C++ ProgramDesc pointer.
        self.blocks = [Block(self, -1)]  # the global block
        self.current_block_idx = 0       # initialized to the global block

    def global_block(self):
        return self.blocks[0]

    def current_block(self):
        return self.blocks[self.current_block_idx]

    def rollback(self):
        self.current_block_idx = self.current_block().parent_idx

    def create_block(self):
        new_block_idx = len(self.blocks)
        self.blocks.append(Block(self, self.current_block_idx))
        self.current_block_idx = new_block_idx
        return self.current_block()
```
`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space because the InferShape function, implemented in C++, manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, itself a member of `ProgramDesc`.
`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block.
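For example, the block bookkeeping above can be exercised in isolation. The following self-contained sketch stubs out `core` and the full `Block` class and keeps only the index logic:

```python
class Block(object):
    def __init__(self, program, parent_idx):
        self.parent_idx = parent_idx

class Program(object):
    def __init__(self):
        self.blocks = [Block(self, -1)]      # the global block
        self.current_block_idx = 0

    def current_block(self):
        return self.blocks[self.current_block_idx]

    def create_block(self):
        self.blocks.append(Block(self, self.current_block_idx))
        self.current_block_idx = len(self.blocks) - 1
        return self.current_block()

    def rollback(self):
        self.current_block_idx = self.current_block().parent_idx

prog = Program()
prog.create_block()   # enter a sub-block, e.g. an RNN step block
prog.rollback()       # leave it again
assert prog.current_block_idx == 0       # back in the global block
assert prog.blocks[1].parent_idx == 0    # sub-block points to its parent
```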
### Block
A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
1. a map from variable names to an instance of the Python `Variable` class, and
1. a list of `Operator` instances.
```python
class Block(object):
    def __init__(self, program, parent_idx):
        self.proto = core.NewBlock(program.proto)
        self.program = program
        self.vars = {}   # map from variable name to Variable
        self.ops = []    # list of Operator instances
        self.parent_idx = parent_idx

    def create_var(self, ...):
        return Variable(self, ...)

    def _create_global_var(self, ...):
        return self.program.global_block().create_var(...)

    def create_parameter(self, name, ...):
        # Parameter is a subclass of variable. See Parameter section for details.
        self.vars[name] = Parameter(self._create_global_var(...), ...)
        return self.vars[name]

    def append_operator(self, ...):
        self.ops.append(Operator(self, ...))
        return self.ops[-1]

    def prepend_operator(self, ...):  # Parameter's ctor prepends initialize operators.
        self.ops.insert(0, Operator(self, ...))
```
`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks, for example, by an FC layer in the step block of an RNN operator.
`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
### Operator
The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
```python
class Operator(object):
    def __init__(self,
                 block,    # Block
                 type,     # string
                 inputs,   # dict<string, Variable>
                 outputs,  # dict<string, Variable>
                 attrs     # dict<string, Any>
                ):
        self.proto = core.NewOpDesc(block.proto, type, inputs, outputs, attrs)
        core.infer_shape(self.proto, inputs, outputs)

    def type(self):
        return self.proto.type()
```
`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
### Variable
Operators take `Variable`s as their inputs and outputs.
```python
class Variable(object):
    def __init__(self,
                 block,            # Block
                 name=None,        # string
                 shape=None,       # tuple
                 dtype="float32",  # string
                 lod_level=None    # int
                ):
        if name is None:
            name = unique_name_generator()
        self.name = name
        self.block = block
        self.proto = core.NewVarDesc(block.proto, name, shape, lod_level)
        self.writer = None  # the operator that writes this variable
```
Please be aware of `self.writer`, which tracks the operator that creates the variable. It is possible that more than one operator writes a variable, but in Python space each write to a variable is represented by a distinct `Variable` instance. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
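The get-or-create behavior stated above can be sketched as follows; `new_var_desc` is a hypothetical stand-in for `core.NewVarDesc`, with a plain dict standing in for a block's variable map:

```python
block_vars = {}

def new_var_desc(name):
    # return the existing VarDesc message rather than creating a second one
    return block_vars.setdefault(name, {"name": name})

v1 = new_var_desc("h0")   # first write creates the message
v2 = new_var_desc("h0")   # second write reuses it
assert v1 is v2           # one VarDesc per name within a block
```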
### Parameter
A parameter is a global variable with an initializer (or load) operator.
```python
class Parameter(Variable):
    def __init__(self,
                 block,                     # Block
                 name=None,                 # string
                 shape=None,                # tuple
                 dtype="float32",           # string
                 lod_level=None,            # int
                 trainable=True,            # bool
                 initialize_op_attrs=None,
                 optimize_op_attrs=None):
        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
        self.trainable = trainable
        self.optimize_op_attrs = optimize_op_attrs
        # prepend the initialize (or load) operator to the block
        block.prepend_operator(initialize_op_attrs['type'],  # op type, string
                               None,                         # no inputs
                               self,                         # output is the parameter
                               initialize_op_attrs)
```
When users create a parameter, they can call
```python
program.create_parameter(
    ...,
    init_attr={
        "type": "uniform_random",
        "min": -1.0,
        "max": 1.0,
    })
```
In the above example, `init_attr.type` names an initialize operator. It can also name the load operator:
```python
init_attr={
    "type": "load",
    "filename": "something.numpy",
}
```
`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
## Layer Functions
A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
### Data Layer
```python
def data_layer(name, type, column_name):
    block = the_current_program.global_block()
    var = block.create_global_var(
        name=name,
        shape=[None] + type.dims(),
        dtype=type.dtype)
    block.prepend_operator(type="Feed",
                           inputs=None,
                           outputs=[var],
                           attrs={"column_name": column_name})
    return var
```
The input to the feed operator is a special variable in the global scope, which is the output of [Python readers](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md).
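For context, a reader in that design is a creator function returning a generator function that yields data items. A minimal fake reader, illustrative only and independent of the Paddle API, might look like:

```python
import random

def fake_mnist_reader():
    def reader():
        for _ in range(4):  # four fake examples
            image = [random.random() for _ in range(784)]
            label = random.randint(0, 9)
            yield image, label
    return reader

for image, label in fake_mnist_reader()():
    print(label)  # the Feed operator would consume these items
```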
### FC Layer
```python
def fc_layer(input, size, ...):
    block = the_current_program.current_block()
w = block.create_parameter(...)
b = block.create_parameter(...)
out = block.create_var()
op = block.append_operator("FC", X=input, W=w, b=b, out=out)
out.writer = op
return out
```
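Putting the layer functions together, model construction reduces to chained calls, as in the following sketch. It assumes the two layer functions above are wired to a global `the_current_program`, and the `dense_vector(784)` type object is hypothetical:

```python
# Build a two-layer classifier with the layer functions sketched above.
images = data_layer(name="images", type=dense_vector(784), column_name="pixels")
hidden = fc_layer(images, size=256)
predict = fc_layer(hidden, size=10)
```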
# Design Doc: Session
## Abstract
The *session* object encapsulates the environment in which the
computation graph is executed.
We will have the *local* session and the *remote* session; they offer
the same [interface](#interface). The local session encapsulates the
local runtime environment, and the remote session encapsulates the
cluster runtime environment.
The local runtime environment contains:
1. computation device (e.g., CPU, GPU) handles, and
1. the [scope](../scope.md) which holds all variables.
The remote runtime environment contains:
1. computation devices (e.g., CPU and GPU on nodes 0 and 1) in a cluster,
   and
1. the distributed [scope](../scope.md) in a cluster which holds all
variables.
The user can create a remote session on Paddle Cloud and evaluate the
computation graph with it. In this way, the user can control the
remote computation resources in a cluster from a local computer.
## Background
The current design has an implicit global session in which
`paddle.eval()` is executed. The pain point is:
Since the user is not able to explicitly switch between runtime
environments, the user cannot run a topology in two independent
environments.
For example, in reinforcement learning, the user may want to have a
stale model for inference and a fresh model for training, and only
replace the stale model with the fresh model periodically.
Furthermore, we have no concept that encapsulates a remote environment
that executes a computation graph.
We need the session object to address the above issues.
## Session
A session is an object that owns the runtime environment. All
computations are executed through `session.eval()`.
### Interface
```python
eval(
targets,
feed_dict=None,
)
```
Evaluates the target Operations or Variables in `targets`.
- *targets*: the evaluation targets. Can be a single Operation or
  Variable, or a list with the Operations or Variables as
  elements. The value returned by `eval()` has the same shape as the
  `targets` argument. (A toy sketch of target-based pruning follows
  this list.)
  The PaddlePaddle program is represented by
  the [ProgramDesc](../design/program.md); `eval()` will infer the
  ProgramDesc from the given targets and run the PaddlePaddle
  program. Please see
  [this graph](./distributed_architecture.md#local-training-architecture) for
  a detailed illustration of the local session and
  [this graph](./distributed_architecture.md#distributed-training-architecture) for
  a detailed illustration of the remote session.
- *feed_dict*: a dictionary that contains the tensors that override
  the edges of the computation graph.

  Not only can `feed_dict` provide input data, it can also override any
  OP's input:
```python
a = pd.constant(2.0, name="a")
b = pd.variable(name="b")
c = pd.mul(a,b)
sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
```
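As promised in the *targets* bullet above, here is a toy sketch of how `eval()` could infer the minimal program from the targets by walking dependencies backward (plain Python, not the actual implementation):

```python
# Dependencies of each variable on the variables it is computed from.
deps = {"a": [], "b": [], "c": ["a", "b"], "unused": ["a"]}

def prune(targets):
    keep, stack = set(), list(targets)
    while stack:
        v = stack.pop()
        if v not in keep:
            keep.add(v)
            stack.extend(deps[v])
    return keep

assert prune(["c"]) == {"a", "b", "c"}   # "unused" is never evaluated
```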
```python
close()
```
Closes the session and releases the scope that the session owns.
### Create a Local Session
```python
session(
devices=None
)
```
Creates a new session. One session owns one global scope, so creating
multiple sessions will create different scopes.
- *devices*: a single `string` or a list of `string`s of device names;
  the corresponding devices will be the computation devices for
  `eval()`. If not specified, all available devices (e.g., all GPUs)
  will be used. The user doesn't need to specify the CPU device since
  it will always be used. Multiple sessions can use the same device.
#### Example
```Python
a = paddle.constant(1.0)
b = paddle.constant(2.0)
c = a + b
sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
sess.eval(c)
sess.close()
```
### Create a Remote Session
```python
create_cloud_job(
name,
num_trainer,
mem_per_trainer,
gpu_per_trainer,
cpu_per_trainer,
num_ps,
mem_per_ps,
cpu_per_ps,
)
```
Creates a Paddle Cloud job. Fails if the job name exists.
```python
get_cloud_job(
name
)
```
Gets a Paddle Cloud job.
```python
remote_session(
job
)
```
- *job*: the Paddle Cloud job.
#### Example
```Python
reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
image = reader.column(0)
label = reader.column(1)
fc1 = paddle.op.fc(image, size=256, act="sigmoid")
fc2 = paddle.op.fc(fc1, size=10, act="softmax")
cost = paddle.op.cross_entropy(fc2, label)
opt = paddle.optimizer.sgd(cost)
job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
sess = paddle.remote_session(job)
for i in range(1000):
sess.eval(opt)
sess.close()
```
@@ -17,7 +17,7 @@ The goals of refactoring include:

 1. A graph is composed of *variables* and *operators*.

-1. The description of graphs must be capable of being serialized/deserialized, so that
+1. The description of graphs must be capable of being serialized/deserialized, so that:

    1. It can be sent to the cloud for distributed execution, and
    1. It can be sent to clients for mobile or enterprise deployment.

@@ -137,19 +137,18 @@ Compile Time -> IR -> Runtime

 * `Eigen::Tensor` contains basic math and element-wise functions.
   * Note that `Eigen::Tensor` has broadcast implementation.
   * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
-  * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized elementwise kernels.
+* `thrust::transform` and `std::transform`.
+  * `thrust` has the same API as the C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
   * `thrust` also has more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
   * Do not write in header (`.h`) files. CPU Kernels should be in cpp source (`.cc`) files and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)

 ---
 # Operator Registration

-## Why registration is necessary?
+## Why is registration necessary?

 We need a method to build mappings between Op type names and Op classes.

 ## How is registration implemented?

 Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.

 ---
@@ -170,7 +169,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding

 # Related Concepts

 ### Op_Maker

-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))

 ### Register Macros

 ```cpp
@@ -200,7 +199,7 @@ Make sure the registration process is executed and linked.

 ---
 # Backward Module (2/2)

 ### Build Backward Network

-- **Input**: graph of forwarding operators
+- **Input**: graph of forward operators
 - **Output**: graph of backward operators
 - **Corner cases in construction**
   - Shared Variables => insert an `Add` operator to combine gradients

@@ -224,7 +223,7 @@ Make sure the registration process is executed and linked.

 ---
 # Block (in design)

-## the difference with original RNNOp
+## the difference between original RNNOp and Block

 - As an operator is more intuitive than `RNNOp`,
 - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
 - Fits the compile-time/ runtime separation design paradigm.
......
# Design Doc: Gradient Operators Registration
## The Problem Posed
In our current operator registration mechanism, for each operator, the programmer should register a *gradient operator creator* function, which takes a C++ operator instance, and returns the corresponding gradient instance.
However, as we decided to separate the *compilation* and the *execution* of DL models, we need to reshape the creator to take a protobuf `OpDesc` message and return a corresponding message.
More than that, the new registration mechanism needs to support the fact that an operator's gradient computation might be a composition of operators.
## Current Implementation
`OpInfo`s are stored in an association map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator by invoking the `OpInfo::creator_` of the gradient operator. The pseudo code is
```cpp
struct OpInfo {
std::function<OperatorBase*(...)> creator_;
std::string grad_op_type_;
...
};
map<string, OpInfo> OpInfoMap;
OperatorBase* CreateGradientOperator(const OperatorBase& op) {
return OpInfoMap.at(op.Type()).creator_(...);
}
```
## Proposed Solution
The mapping relationship between an operator and its gradient operators is a function. The interface of that function is:
```cpp
// (OpDesc) --> vector<OpDesc>
std::function<std::vector<OpDescBind>(const OpDescBind&)>;
```
The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc`, used to manipulate `OpDesc` efficiently.
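The registry idea is language-agnostic; the following Python sketch (illustrative names, not the Paddle API) shows a map from op type to a maker function that returns one or many gradient op descriptions:

```python
grad_op_makers = {}

def register_grad_maker(op_type, maker):
    grad_op_makers[op_type] = maker

def minus_grad_maker(fwd_op):
    # d(x - y)/dx = 1 and d(x - y)/dy = -1, so the gradient is a
    # composition of two ops: an identity and a scale by -1.
    x, y = fwd_op["inputs"]
    og = "grad@" + fwd_op["output"]
    return [
        {"type": "identity", "inputs": [og], "output": "grad@" + x},
        {"type": "scale", "attrs": {"scale": -1.0}, "inputs": [og], "output": "grad@" + y},
    ]

register_grad_maker("minus", minus_grad_maker)
print(grad_op_makers["minus"]({"inputs": ["x", "y"], "output": "out"}))
```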
The `GradOpDescMaker` will be registered in `OpInfo` to replace the `grad_op_type_` field. The `OpInfo` should be
```cpp
struct OpInfo {
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
...
};
```
The `grad_op_maker_` is `nullptr` if the operator does not have any associated gradient operators.
We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
```cpp
class GradOpDescMakerBase {
public:
  GradOpDescMakerBase(const OpDescBind&);
  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
};
```
We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
```cpp
using GradOpMaker = ...;
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
func = [] (const OpDescBind& fwd_op) {
GradOpMaker maker(fwd_op);
return maker();
};
```
We can write many helper functions since `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forward operator.
We should change the register macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. It can be done by a macro containing `__VA_ARGS__`.
The user interface should be
```cpp
vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
// Developers can still manually implement gradient operator.
REGISTER_OPERATOR(minus_grad, MinusGradOp);
```
The interface of the current `REGISTER_OP` macro should not be changed. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate a `GradOpDescMaker`.
```cpp
REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
```
# Design for TensorArray
This design doc presents the necessity of a new C++ class `TensorArray`.
In addition to the very simple C++ implementation
```c++
class TensorArray {
 public:
  explicit TensorArray(const LoDTensor&);
  explicit TensorArray(size_t size);

 private:
  std::vector<LoDTensor> values_;  // one tensor per time step
};
```
We also need to expose it to PaddlePaddle's Python API,
because users would want to use it with our very flexible operators, such as `WhileLoop`.
An example of an RNN based on dynamic operators:
```python
input = pd.data(...)
num_steps = Var(12)

states = TensorArray(size=num_steps)
step_inputs = TensorArray(unstack_from=input)
step_outputs = TensorArray(size=num_steps)

W = Tensor(...)
U = Tensor(...)
default_state = some_op()

step = Var(1)

wloop = paddle.create_whileloop(loop_vars=[step])
with wloop.frame():
    wloop.break_if(pd.equal(step, num_steps))
    pre_state = states.read(step-1, default_state)
    step_input = step_inputs.read(step)
    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
    states.write(step, state)
    step_outputs.write(step, state)  # output the state
    step.update(step+1)

output = step_outputs.stack()
```
## Background
Steps are one of the core concepts of RNN. In each time step of an RNN, there should be several input segments, states, and output segments; all these components act like arrays. For example, calling `states[step_id]` gets the state of the `step_id`-th time step.
An RNN can be implemented with the following pseudocode
```c++
Array states;
Array input_segments;
Array output_segments;
Parameter W, U;
step = 1
seq_len = 12
while_loop {
if (step == seq_len) break;
states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
output_segments[step] = states[step] // take state as output
step++;
}
```
According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with levels of detail (`LoDTensor` for short).
Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
The implementation is similar to `recurrent_op`.
The key difference is the way **the original input `LoDTensor`s and outputs are split to get the `input_segments` and the `output_segments`.**
Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
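To see what that splitting means, here is a toy sketch of LoD-style segmentation, with offsets marking sequence boundaries in a flat batch (plain Python, not the `LoDTensor` API):

```python
data = [1, 2, 3, 4, 5, 6]   # six words stored in one flat tensor
lod = [0, 2, 6]             # offsets: two sequences, [1,2] and [3,4,5,6]

input_segments = [data[lod[i]:lod[i + 1]] for i in range(len(lod) - 1)]
assert input_segments == [[1, 2], [3, 4, 5, 6]]
```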
## Why `TensorArray`
The logic behind splitting the inputs into segments, states, and outputs is similar and can be shared in a separate module.
The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudocode.
So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor**.
This is where the notion of `TensorArray` comes from.
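A minimal pure-Python sketch of these semantics, with numpy arrays standing in for tensors, may help fix ideas (this is not the proposed C++ class):

```python
import numpy as np

class TensorArraySketch(object):
    def __init__(self):
        self.values = []

    def write(self, index, value):
        if index == len(self.values):
            self.values.append(value)   # grow by one step
        else:
            self.values[index] = value

    def read(self, index):
        return self.values[index]

    def unstack(self, tensor):
        # split along axis 0, one entry per time step
        self.values = [np.asarray(v) for v in tensor]

    def stack(self):
        return np.stack(self.values)

ta = TensorArraySketch()
ta.unstack(np.array([[1., 2.], [3., 4.], [5., 6.]]))   # three steps
assert np.allclose(ta.stack(), [[1., 2.], [3., 4.], [5., 6.]])
```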
## Introducing TensorArray to unify all three RNNs
 TensorArray as a new concept is borrowed from TensorFlow;
 it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.

 This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers,
-such as `RecurrentGradientMachine`.
+such as `recurrent_op` and `RecurrentGradientMachine`.

 In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401),
 `TensorArray` is used to segment inputs and store states in all time steps.
 By providing some methods similar to a C++ array,
-the definition of some state-based dynamic models such as RNN could be more natural and highly flexible.
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.

-## Dynamic-Related Methods
-
-Some basic methods should be proposed as follows:
-
-### stack()
-Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`.
-
-### unstack(axis=0)
-Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
-
-### concat()
-Return the values in the `TensorArray` as a concatenated Tensor.
-
-### write(index, value, data_shared=true)
-Write value into index of the TensorArray.
-
-### read(index)
-Read the value at location `index` in the `TensorArray`.
-
-### size()
-Return the number of values.
+## Dynamic Operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so the operators listed below should be implemented:
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into the
+    index-th value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or
+    reference the tensors.
+    '''
+    pass
+
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta` and a scalar tensor `index`; read the index-th
+    value of `ta` and return it as `tensor`.
+    '''
+    pass
+
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`; return its size as the scalar `tensor`.
+    '''
+    pass
+```
+
+It is trivial for users to use so many low-level operators, so some helper methods should be proposed in the Python wrapper to make `TensorArray` easier to use, for example:
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to concatenate all the time steps for RNN or whileloop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = NewVar(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split a tensor into time steps for RNN or whileloop.
+
+        @input: str
+            the name of the input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in the
+        TensorArray will be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of an output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```

 ## LoDTensor-related Supports

-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input,
-because each step of RNN could only take a tensor-represented batch of data as input,
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+Since each step of RNN can only take a tensor-represented batch of data as input,
 some preprocess should be taken on the inputs, such as sorting the sentences by their length in descending order and cutting each word and packing to new batches.

-Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`;
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+
+Some definitions are like:
+
+```python
+def unpack(level):
+    '''
+    Split LodTensor in some `level` and generate batches; if set
+    `sort_by_length`, will sort by length.
+
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represent
+          batches of data.
+        - an int32 Tensor, which stores the map from the new batch's
+          indices to the original LoDTensor
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a
+    `TensorArray`, the `level`, and the `indices_map`.
+    '''
+    pass
+```

-With these two methods, a variant-sentence-RNN can be implemented like
+With these two methods, an RNN that supports variable-length sentences can be implemented like

 ```c++
 // input is the variable-length data
@@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map);
 ```

 the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
 the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together and is hard to read and extend.
-
-some details are as follows.
-
-### unpack(level, sort_by_length)
-Split LodTensor in some `level` and generate batches; if set `sort_by_length`, will sort by length.
-
-Returns:
-
-- a new `TensorArray`, whose values are LodTensors and represent batches of data.
-- an int32 Tensor, which stores the map from the new batch's indices to the original LoDTensor
-
-### pack(level, indices_map)
-Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`.
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 - `REGISTER_OP` : registers the `ops::MulOp` class with type name `mul` and `ProtoMaker` `ops::MulOpMaker`, and registers `ops::MulOpGrad` with type name `mul_grad`.
 - `REGISTER_OP_WITHOUT_GRADIENT` : registers an Op that has no backward operator.
-- `REGISTER_OP_CPU_KERNEL` : registers the `ops::MulKernel` class specialized for `paddle::platform::CPUPlace` and the `float` type; likewise, registers the `ops::MulKernel` class.
+- `REGISTER_OP_CPU_KERNEL` : registers the `ops::MulKernel` class specialized for `paddle::platform::CPUPlace` and the `float` type; likewise, registers the `ops::MulGradKernel` class.
 - Register the GPU Kernel in `.cu` files.
......
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
 - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-- `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+- `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 - Registering GPU Kernel in `.cu` files
......
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }

 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
......
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
......
@@ -22,14 +22,12 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
-cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope proto_desc)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator proto_desc)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)

 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -43,3 +41,6 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+
+cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
+cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
@@ -21,20 +21,12 @@ limitations under the License. */
 #include <vector>

 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"

 namespace paddle {
 namespace framework {

-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
-
 ProgramDesc& GetProgramDesc();

 template <typename T>
......
@@ -13,10 +13,13 @@
 limitations under the License. */

 #include "paddle/framework/backward.h"
-#include "paddle/operators/net_op.h"

+#include <deque>
 #include <list>
 #include <memory>

+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
@@ -24,6 +27,35 @@
 namespace paddle {
 namespace framework {

+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op) {
+  OpDescBind op_desc;
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc);
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  std::transform(grad_descs.begin(), grad_descs.end(),
+                 std::back_inserter(grad_ops),
+                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                   return OpRegistry::CreateOp(*grad_desc);
+                 });
+  PADDLE_ENFORCE(!grad_ops.empty());
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);
+  } else {
+    auto net_op = new operators::NetOp();
+    for (auto& grad_op : grad_ops) {
+      net_op->AppendOp(std::move(grad_op));
+    }
+    net_op->CompleteAddOp();
+    return std::unique_ptr<OperatorBase>(net_op);
+  }
+}
+
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
   for (auto& name : names) {
@@ -141,9 +173,26 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       net->ops_[op_offset]->Rename(name, dup_outputs.back());
     }
     // collect all the offset to append `add` op for each alias
-    insert_position.push_back(
-        {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
-                                             {{"Out", {name}}}, {})});
+    //
+    // one variable is shared between multiple operators.
+    // insert add operator one by one, then add it to output
+    for (size_t output_idx = 0; output_idx < dup_outputs.size() - 1;
+         ++output_idx) {
+      auto insert_add_x = dup_outputs[output_idx];
+      auto insert_add_y = dup_outputs[output_idx + 1];
+      auto insert_add_out = name + "@SHARED@" + std::to_string(output_idx);
+      // first add op inserted
+      if (output_idx == dup_outputs.size() - 2) {
+        insert_add_out = name;
+      }
+      if (output_idx != 0) {
+        insert_add_y = name + "@SHARED@" + std::to_string(output_idx - 1);
+      }
+      insert_position.push_back(
+          {dup_op.back(),
+           OpRegistry::CreateOp("sum", {{"X", {insert_add_x, insert_add_y}}},
+                                {{"Out", {insert_add_out}}}, {})});
+    }
   }

   // make sure the inserted `add` ops follow the BFS order.
@@ -154,7 +203,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       net->InsertOp(pos.first + 1, std::move(pos.second));
     }
   } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(CreateGradOp(forwardOp));

     ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                        const std::string& grad_input) {
@@ -182,7 +231,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
   // process recurrent gradient op as a special operator.
   if (forwardOp.Type() == "recurrent") {
-    // NOTE clean up cycle call somewhere (RNN's stepnet contains itself), or
+    // NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
+    // or
     // this will result in infinite loop.
     const auto& rnnop =
         *static_cast<const operators::RecurrentOp*>(&forwardOp);
@@ -222,5 +272,145 @@ std::unique_ptr<OperatorBase> Backward(
   return BackwardRecursive(forwardOp, no_grad_names, uid);
 }
// ==================================== //
static bool AllGradInSet(const std::vector<std::string>& names,
const std::unordered_set<std::string>& set) {
for (const std::string& name : names) {
if (!set.count(GradVarName(name))) {
return false;
}
}
return true;
}
std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
const std::unique_ptr<OpDescBind>& op_desc,
std::unordered_set<std::string>& no_grad_vars) {
std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
// All input gradients of forwarding operator do not need to calculat.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, no_grad_vars)) {
return grad_op_descs; // empty vector
}
// All output gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
if (AllGradInSet(outputs, no_grad_vars)) {
for (const std::string& name : inputs) {
no_grad_vars.insert(GradVarName(name));
}
return grad_op_descs; // empty vector
}
grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
for (auto& desc : grad_op_descs) {
for (const std::string& in_name : desc->InputArgumentNames()) {
if (no_grad_vars.count(in_name)) {
std::string prefix = in_name.substr(
0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
std::string new_name = prefix + kZeroVarSuffix;
desc->Rename(in_name, new_name);
std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
"fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
}
}
for (const std::string& out_name : desc->OutputArgumentNames()) {
if (no_grad_vars.count(out_name)) {
desc->Rename(out_name, kEmptyVarName);
}
}
}
for (auto& p : pending_fill_zeros_ops) {
grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
}
return grad_op_descs;
}
std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>& no_grad_vars) {
BlockDescBind* cur_block = program_desc.Block(block_idx);
std::deque<std::unique_ptr<OpDescBind>>& op_descs = cur_block->ops_;
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
size_t grad_desc_idx = 0;
std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
std::vector<std::unique_ptr<OpDescBind>> op_grads =
MakeOpGrad(*it, no_grad_vars);
if ((*it)->Type() == "recurrent") {
PADDLE_ENFORCE_EQ(
op_grads.size(), size_t(1),
"rnn_op's gradient process should contain only one op.");
      int step_block_idx = (*it)->GetBlockAttr("step_block");
auto backward_block_op_descs =
MakeBlockBackward(program_desc, step_block_idx, no_grad_vars);
BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
for (auto& ptr : backward_block_op_descs) {
backward_block->ops_.push_back(std::move(ptr));
}
op_grads[0]->SetBlockAttr("step_block", *backward_block);
}
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
dup_out_ops[out_name].emplace_back(grad_desc_idx);
}
++grad_desc_idx;
}
std::transform(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
}
// Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) {
const std::string& out_name = dup.first;
const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs;
for (size_t i = 0; i < dup_op.size(); ++i) {
std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->Rename(out_name, new_name);
sum_op_inputs.emplace_back(new_name);
}
std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
}
}
pending_sum_ops.sort(
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
return a.first > b.first;
});
for (auto& p : pending_sum_ops) {
backward_descs.insert(backward_descs.begin() + p.first + 1,
std::move(p.second));
}
return backward_descs;
}
void AppendBackward(ProgramDescBind& program_desc,
const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_var_names;
no_grad_var_names.reserve(no_grad_vars.size() + 1);
no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
for (auto& name : no_grad_vars) {
no_grad_var_names.insert(GradVarName(name));
}
const int root_block_idx = 0;
auto backward_op_descs =
MakeBlockBackward(program_desc, root_block_idx, no_grad_var_names);
auto& forw_op_descs = program_desc.Block(root_block_idx)->ops_;
for (auto& ptr : backward_op_descs) {
forw_op_descs.push_back(std::move(ptr));
}
}
} // namespace framework
} // namespace paddle
...@@ -13,8 +13,11 @@
   limitations under the License. */
#pragma once
#include <unordered_set>
#include "paddle/framework/operator.h"
#include "paddle/framework/program_desc.h"
namespace paddle {
namespace framework {
...@@ -23,5 +26,9 @@ namespace framework {
extern std::unique_ptr<OperatorBase> Backward(
    const OperatorBase& forwardOp,
    const std::unordered_set<std::string>& no_grad_vars);
void AppendBackward(ProgramDescBind& program_desc,
const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework
} // namespace paddle
...@@ -15,30 +15,42 @@
#include "paddle/framework/backward.h"
#include <gtest/gtest.h>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/op_desc.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h" #include "paddle/operators/net_op.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
using OperatorBase = framework::OperatorBase;
using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
using OpProto = framework::OpProto;
using OpAttrChecker = framework::OpAttrChecker;
using Scope = framework::Scope;
using DeviceContext = platform::DeviceContext;
class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input X of Add");
    AddInput("b", "Bias of Add");
    AddOutput("Out", "Out of Add");
    AddComment("Add Op");
  }
};
class RowWiseAddGradMaker : public SingleGradOpDescMaker {
public:
using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<OpDescBind> Apply() const override {
auto grad_op = new OpDescBind();
grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
grad_op->SetType("rowwise_add_grad");
return std::unique_ptr<OpDescBind>(grad_op);
}
};
class MulOpMaker : public OpProtoAndCheckerMaker {
 public:
  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
...@@ -133,42 +145,46 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
  }
};
class SumOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "the input tensors of sum operator.").AsDuplicable();
AddOutput("Out", "the output tensor of sum operator.");
AddComment("");
}
};
class MultInOutOpMaker : public OpProtoAndCheckerMaker {
 public:
  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "x");
    AddInput("H", "h");
    AddOutput("Y", "y");
    AddOutput("Z", "z");
    AddComment("");
  }
};
} // namespace framework
} // namespace paddle
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
REGISTER_OPERATOR(rowwise_add, f::NOP, f::RowWiseAddOpMaker,
                  f::RowWiseAddGradMaker);
REGISTER_OPERATOR(rowwise_add_grad, f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
REGISTER_OP(sum, f::NOP, f::SumOpMaker, sum_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
            f::NOP);
REGISTER_OP(mult_in_out, f::NOP, f::MultInOutOpMaker, mult_in_out_grad, f::NOP);
TEST(Backward, simple_op_grad) {
auto fwd = f::OpRegistry::CreateOp(
"rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
ASSERT_NE(fwd, nullptr);
auto gop = f::OpRegistry::CreateGradOp(*fwd);
ASSERT_EQ(1UL, gop->Inputs().size());
ASSERT_EQ("rowwise_add_grad", gop->Type());
ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X")));
ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b")));
}
TEST(Backward, simple_op_not_need_grad) {
  auto fwd = f::OpRegistry::CreateOp(
...@@ -283,18 +299,7 @@ TEST(Backward, net_shared_weight) {
  ASSERT_TRUE(bwd->IsNetOp());
  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
  ASSERT_EQ(3UL, bwd_net->ops_.size());
  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
}
TEST(Backward, op_register_grad_not_for_network) {
auto fwd =
f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
{{"mul_result", {"mul_out"}},
{"add_result", {"add_out"}},
{"Out", {"out1"}}},
{{"temporary_index", std::vector<int>{0, 1}}});
ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
} }
TEST(Backward, op_all_input_are_not_need) {
...@@ -399,3 +404,293 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
}
// =================================== //
f::ProgramDesc *GetNewProgramDesc() {
auto *program_desc = new f::ProgramDesc();
auto *root_block = program_desc->add_blocks();
root_block->set_idx(0);
root_block->set_parent_idx(-1);
return program_desc;
}
TEST(Backward, simple_single_op) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
f::OpDescBind *op = block->AppendOp();
op->SetType("rowwise_add");
op->SetInput("X", {"x"});
op->SetInput("b", {"b"});
op->SetOutput("Out", {"out"});
AppendBackward(program, {});
ASSERT_EQ(block->AllOps().size(), 2UL);
f::OpDescBind *grad_op = block->AllOps()[1];
EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op->InputNames().size(), 1UL);
ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b")}));
}
TEST(Backward, simple_mult_op) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
f::OpDescBind *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
AppendBackward(program, {});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *grad_op1 = block->AllOps()[5];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op2 = block->AllOps()[4];
EXPECT_EQ(grad_op2->Type(), "mul_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *grad_op3 = block->AllOps()[3];
EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out3")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
}
TEST(Backward, intermedia_var_no_grad) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
f::OpDescBind *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"x2"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
f::OpDescBind *op4 = block->AppendOp();
op4->SetType("mul");
op4->SetInput("X", {"out1"});
op4->SetInput("Y", {"out3"});
op4->SetOutput("Out", {"out4"});
AppendBackward(program, {"out3"});
ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *grad_op1 = block->AllOps()[5];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op4 = block->AllOps()[4];
EXPECT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out4")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::kEmptyVarName}));
}
TEST(Backward, var_no_grad) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
f::OpDescBind *op1 = block->AppendOp();
op1->SetType("mult_in_out");
op1->SetInput("X", {"x1"});
op1->SetInput("H", {"h1"});
op1->SetOutput("Y", {"y1"});
op1->SetOutput("Z", {"z1"});
f::OpDescBind *op2 = block->AppendOp();
op2->SetType("mult_in_out");
op2->SetInput("X", {"y1"});
op2->SetInput("H", {"z1"});
op2->SetOutput("Y", {"y2"});
op2->SetOutput("Z", {"z2"});
AppendBackward(program, {"z1"});
ASSERT_EQ(block->AllOps().size(), 5UL);
f::OpDescBind *grad_op2 = block->AllOps()[2];
ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
std::vector<std::string>({f::GradVarName("z2")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("H")),
std::vector<std::string>({f::kEmptyVarName}));
f::OpDescBind *fill_zero_op = block->AllOps()[3];
ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
EXPECT_EQ(fill_zero_op->Output("Y"),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
f::OpDescBind *grad_op1 = block->AllOps()[4];
ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
std::vector<std::string>({f::GradVarName("h1")}));
}
TEST(Backward, shared_var) {
f::ProgramDesc *program_desc = GetNewProgramDesc();
f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
f::BlockDescBind *block = program.Block(0);
f::OpDescBind *op1 = block->AppendOp();
op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp();
op2->SetType("mul");
op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp();
op3->SetType("rowwise_add");
op3->SetInput("X", {"out1"});
op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"});
AppendBackward(program, {});
ASSERT_EQ(block->AllOps().size(), 7UL);
f::OpDescBind *grad_op3 = block->AllOps()[3];
ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out3")}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")}));
f::OpDescBind *grad_op4 = block->AllOps()[4];
ASSERT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out2")}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *sum_op = block->AllOps()[5];
ASSERT_EQ(sum_op->Type(), "sum");
ASSERT_EQ(sum_op->InputNames().size(), 1UL);
ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
EXPECT_EQ(sum_op->Input("X"),
std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
f::GradVarName("out1") + "@RENAME@1"}));
EXPECT_EQ(sum_op->Output("Out"),
std::vector<std::string>({f::GradVarName("out1")}));
f::OpDescBind *grad_op1 = block->AllOps()[6];
ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
std::vector<std::string>({f::GradVarName("x1")}));
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")}));
}
\ No newline at end of file
...@@ -34,6 +34,10 @@ VarDescBind *BlockDescBind::Var(const std::string &name) const {
  return it->second.get();
}
bool BlockDescBind::HasVar(const std::string &name) const {
return vars_.find(name) != vars_.end();
}
std::vector<VarDescBind *> BlockDescBind::AllVars() const { std::vector<VarDescBind *> BlockDescBind::AllVars() const {
std::vector<VarDescBind *> res; std::vector<VarDescBind *> res;
for (const auto &p : vars_) { for (const auto &p : vars_) {
......
...@@ -19,6 +19,7 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/op_desc.h"
#include "paddle/framework/var_desc.h"
#include "paddle/platform/macros.h"
namespace paddle {
namespace framework {
...@@ -31,12 +32,17 @@ class ProgramDescBind;
class BlockDescBind {
 public:
friend std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind &program_desc, int block_idx,
std::unordered_set<std::string> &no_grad_vars);
friend void AppendBackward(
ProgramDescBind &program_desc,
const std::unordered_set<std::string> &no_grad_vars);
  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
      : prog_(prog), desc_(desc), need_update_(false) {}
BlockDescBind(const BlockDescBind &o) = delete;
BlockDescBind &operator=(const BlockDescBind &o) = delete;
  int32_t ID() const { return desc_->idx(); }
  int32_t Parent() const { return desc_->parent_idx(); }
...@@ -45,6 +51,8 @@ class BlockDescBind {
  VarDescBind *Var(const std::string &name_bytes) const;
bool HasVar(const std::string &var_name) const;
  std::vector<VarDescBind *> AllVars() const;
  BlockDescBind *ParentBlock() const;
...@@ -66,6 +74,8 @@ class BlockDescBind {
  std::deque<std::unique_ptr<OpDescBind>> ops_;
  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
DISABLE_COPY_AND_ASSIGN(BlockDescBind);
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/grad_op_desc_maker.h"
#include "paddle/framework/op_info.h"
#include "paddle/framework/op_proto_maker.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
namespace details {
enum OpInfoFillType {
kOperator = 0,
kOpProtoAndCheckerMaker = 1,
kGradOpDescMaker = 2
};
template <typename T>
struct OpInfoFillTypeID {
static constexpr OpInfoFillType ID() {
return std::is_base_of<OperatorBase, T>::value
? kOperator
: (std::is_base_of<OpProtoAndCheckerMaker, T>::value
? kOpProtoAndCheckerMaker
: (std::is_base_of<GradOpDescMakerBase, T>::value
? kGradOpDescMaker
: static_cast<OpInfoFillType>(-1)));
}
};
template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
struct OpInfoFiller;
template <size_t I, bool at_end, typename... ARGS>
class OperatorRegistrarRecursive;
template <size_t I, typename... ARGS>
class OperatorRegistrarRecursive<I, false, ARGS...> {
public:
using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
OpInfoFiller<T> fill;
fill(op_type, info);
constexpr auto size = sizeof...(ARGS);
OperatorRegistrarRecursive<I + 1, I + 1 == size, ARGS...> reg(op_type,
info);
(void)(reg);
}
};
template <size_t I, typename... ARGS>
class OperatorRegistrarRecursive<I, true, ARGS...> {
public:
OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {}
};
template <typename T>
struct OpInfoFiller<T, kOperator> {
void operator()(const char* op_type, OpInfo* info) const {
info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs) {
return new T(type, inputs, outputs, attrs);
};
}
};
template <typename T>
struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const {
info->proto_ = new OpProto;
info->checker_ = new OpAttrChecker();
auto maker = T(info->proto_, info->checker_);
maker.Validate();
info->proto_->set_type(op_type);
PADDLE_ENFORCE(
info->proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, info->proto_->InitializationErrorString());
}
};
template <typename T>
struct OpInfoFiller<T, kGradOpDescMaker> {
void operator()(const char* op_type, OpInfo* info) const {
info->grad_op_maker_ = [](const OpDescBind& fwd_op) {
T maker(fwd_op);
return maker();
};
}
};
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -66,7 +66,6 @@ message OpProto {
  optional bool duplicable = 3 [ default = false ];
  optional bool intermediate = 4 [ default = false ];
optional bool not_in_gradient = 5 [ default = false ];
}
// AttrProto describes the C++ type Attribute.
...@@ -106,6 +105,7 @@ message LoDTensorDesc {
message VarDesc {
  required string name = 1;
  optional LoDTensorDesc lod_tensor = 2;
optional bool persistable = 3 [ default = false ];
}
message BlockDesc {
...@@ -115,4 +115,7 @@ message BlockDesc {
  repeated OpDesc ops = 4;
}
// Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// for more details.
message ProgramDesc { repeated BlockDesc blocks = 1; }
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied. See the License for the specific language governing
permissions and limitations under the License. */
#include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace framework {
enum class OpArgType { IN, OUT };
static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
bool is_grad, VariableNameMap* vars) {
const auto& src_inout =
src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
auto& dst_inout = *vars;
auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
const auto& src_arg_list =
src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
for (const auto& arg : src_arg_list) {
if (arg.not_in_gradient() && !is_grad) continue;
const std::string src_name = arg.name();
std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
dst_inout[dst_name].reserve(src_inout.at(src_name).size());
for (auto& var_name : src_inout.at(src_name)) {
std::string s = is_grad ? GradVarName(var_name) : var_name;
dst_inout[dst_name].emplace_back(s);
}
}
}
OperatorBase* BuildGradOp(const OperatorBase* op) {
auto& info = OpInfoMap::Instance().Get(op->Type());
PADDLE_ENFORCE(info.HasGradientOp());
VariableNameMap inputs;
VariableNameMap outputs;
TransOpArg(op, OpArgType::IN, false, &inputs); // I
TransOpArg(op, OpArgType::OUT, false, &inputs); // O
TransOpArg(op, OpArgType::OUT, true, &inputs); // OG
TransOpArg(op, OpArgType::IN, true, &outputs); // IG
auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
}
static void TransOpDescArg(const OpDescBind* src_op, const OpArgType& src_type,
bool is_grad, OpDescBind* dst_op,
const OpArgType& dst_type) {
PADDLE_ENFORCE(dst_op != nullptr,
"Protobuf desc of gradient op must be initialized first.");
const auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
const auto& src_arg_list =
src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
for (const auto& arg : src_arg_list) {
if (arg.not_in_gradient() && !is_grad) continue;
const std::string src_name = arg.name();
std::vector<std::string> vars = src_type == OpArgType::IN
? src_op->Input(src_name)
: src_op->Output(src_name);
if (is_grad) {
for (std::string& var : vars) {
var = GradVarName(var);
}
}
std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
dst_type == OpArgType::IN ? dst_op->SetInput(dst_name, vars)
: dst_op->SetOutput(dst_name, vars);
}
}
void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op) {
auto& info = OpInfoMap::Instance().Get(forw_op->Type());
PADDLE_ENFORCE(info.HasGradientOp());
grad_op->SetType(info.grad_op_type_);
TransOpDescArg(forw_op, OpArgType::IN, false, grad_op, OpArgType::IN);
TransOpDescArg(forw_op, OpArgType::OUT, false, grad_op, OpArgType::IN);
TransOpDescArg(forw_op, OpArgType::OUT, true, grad_op, OpArgType::IN);
TransOpDescArg(forw_op, OpArgType::IN, true, grad_op, OpArgType::OUT);
grad_op->SetAttrMap(forw_op->GetAttrMap());
}
} // namespace framework
} // namespace paddle
#include "paddle/framework/grad_op_builder.h"
#include <gtest/gtest.h>
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
USE_OP(add);
namespace paddle {
namespace framework {
class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
public:
MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("In1", "a single input");
AddInput("In2_mult", "a multiple input").AsDuplicable();
AddInput("In3", "another single input");
AddOutput("Out1", "a single output");
AddOutput("Out2_mult", "a multiple output").AsDuplicable();
AddComment("test op with multiple inputs and outputs");
}
};
class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
public:
IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("In1", "a single input");
AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
AddInput("In3_mult", "another multiple input").AsDuplicable();
AddOutput("Out1_mult", "a multiple output").AsDuplicable();
AddOutput("Out2", "a single output").NotInGradient();
AddComment("op with inputs and outputs ignored in gradient calculating");
}
};
} // namespace framework
} // namespace paddle
namespace f = paddle::framework;
TEST(GradOpBuilder, AddTwo) {
std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
"add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_add_op =
f::OpRegistry::CreateGradOp(*add_op);
EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
EXPECT_EQ(grad_add_op->Outputs().size(), 2UL);
EXPECT_EQ(grad_add_op->Input("X"), "x");
EXPECT_EQ(grad_add_op->Input("Y"), "y");
EXPECT_EQ(grad_add_op->Input("Out"), "out");
EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
}
REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
TEST(GradOpBuilder, MutiInOut) {
std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
"mult_io", {{"In1", {"in1"}},
{"In2_mult", {"in2_1", "in2_2", "in2_3"}},
{"In3", {"in3"}}},
{{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_test_op =
f::OpRegistry::CreateGradOp(*test_op);
ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL);
EXPECT_EQ(grad_test_op->Input("In1"), "in1");
EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
EXPECT_EQ(grad_test_op->Input("In3"), "in3");
EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
std::vector<std::string>({"out2_1", "out2_2"}));
EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
f::GradVarName("out1"));
EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
std::vector<std::string>(
{f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
std::vector<std::string>({f::GradVarName("in2_1"),
f::GradVarName("in2_2"),
f::GradVarName("in2_3")}));
EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
}
TEST(GradOpBuilder, IOIgnoredInGradient) {
std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
"io_ignored", {{"In1", {"in1"}},
{"In2_mult", {"in2_1", "in2_2"}},
{"In3_mult", {"in3_1", "in3_2"}}},
{{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_test_op =
f::OpRegistry::CreateGradOp(*test_op);
// 'In2' and 'Out2' are ignored in gradient calculating
ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL);
EXPECT_EQ(grad_test_op->Input("In1"), "in1");
EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
std::vector<std::string>({"in3_1", "in3_2"}));
EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
std::vector<std::string>({"out1_1", "out1_2"}));
EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
std::vector<std::string>(
{f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
f::GradVarName("out2"));
ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
std::vector<std::string>(
{f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
std::vector<std::string>(
{f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
}
TEST(GradOpDescBuilder, MutiInOut) {
f::OpDescBind *forw_op = new f::OpDescBind();
forw_op->SetType("mult_io");
forw_op->SetInput("In1", {"in1"});
forw_op->SetInput("In2_mult", {"in2_1", "in2_2", "in2_3"});
forw_op->SetInput("In3", {"in3"});
forw_op->SetOutput("Out1", {"out1"});
forw_op->SetOutput("Out2_mult", {"out2_1", "out2_2"});
f::OpDescBind *grad_op = new f::OpDescBind();
f::CompleteGradOpDesc(forw_op, grad_op);
EXPECT_EQ(grad_op->Type(), "mult_io_grad");
ASSERT_EQ(grad_op->InputNames().size(), 3UL + 2UL + 2UL);
EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
EXPECT_EQ(grad_op->Input("In2_mult"),
std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
EXPECT_EQ(grad_op->Input("In3"), std::vector<std::string>({"in3"}));
EXPECT_EQ(grad_op->Input("Out1"), std::vector<std::string>({"out1"}));
EXPECT_EQ(grad_op->Input("Out2_mult"),
std::vector<std::string>({"out2_1", "out2_2"}));
EXPECT_EQ(grad_op->Input(f::GradVarName("Out1")),
std::vector<std::string>({f::GradVarName("out1")}));
EXPECT_EQ(grad_op->Input(f::GradVarName("Out2_mult")),
std::vector<std::string>(
{f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
std::vector<std::string>({f::GradVarName("in1")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
std::vector<std::string>({f::GradVarName("in2_1"),
f::GradVarName("in2_2"),
f::GradVarName("in2_3")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("In3")),
std::vector<std::string>({f::GradVarName("in3")}));
delete forw_op;
delete grad_op;
}
TEST(GradOpDescBuilder, IOIgnoredInGradient) {
f::OpDescBind *forw_op = new f::OpDescBind();
forw_op->SetType("io_ignored");
forw_op->SetInput("In1", {"in1"});
forw_op->SetInput("In2_mult", {"in2_1", "in2_2"});
forw_op->SetInput("In3_mult", {"in3_1", "in3_2"});
forw_op->SetOutput("Out1_mult", {"out1_1", "out1_2"});
forw_op->SetOutput("Out2", {"out2"});
f::OpDescBind *grad_op = new f::OpDescBind();
f::CompleteGradOpDesc(forw_op, grad_op);
EXPECT_EQ(grad_op->Type(), "io_ignored_grad");
// 'In2' and 'Out2' are ignored in gradient calculating
ASSERT_EQ(grad_op->InputNames().size(), 2UL + 1UL + 2UL);
EXPECT_EQ(grad_op->Input("In1"), std::vector<std::string>({"in1"}));
EXPECT_EQ(grad_op->Input("In3_mult"),
std::vector<std::string>({"in3_1", "in3_2"}));
EXPECT_EQ(grad_op->Input("Out1_mult"),
std::vector<std::string>({"out1_1", "out1_2"}));
EXPECT_EQ(grad_op->Input(f::GradVarName("Out1_mult")),
std::vector<std::string>(
{f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
EXPECT_EQ(grad_op->Input(f::GradVarName("Out2")),
std::vector<std::string>({f::GradVarName("out2")}));
ASSERT_EQ(grad_op->OutputNames().size(), 3UL);
EXPECT_EQ(grad_op->Output(f::GradVarName("In1")),
std::vector<std::string>({f::GradVarName("in1")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("In2_mult")),
std::vector<std::string>(
{f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
EXPECT_EQ(grad_op->Output(f::GradVarName("In3_mult")),
std::vector<std::string>(
{f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
delete forw_op;
delete grad_op;
}
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_desc.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace framework {
class GradOpDescMakerBase {
public:
explicit GradOpDescMakerBase(const OpDescBind& fwd_op) : fwd_op_(fwd_op) {}
virtual ~GradOpDescMakerBase() = default;
virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
protected:
static std::vector<std::string> ToGradNames(
const std::vector<std::string>& var_names) {
std::vector<std::string> ret_val;
ret_val.reserve(var_names.size());
std::transform(var_names.begin(), var_names.end(),
std::back_inserter(ret_val), GradVarName);
return ret_val;
}
std::vector<std::string> InputGrad(const std::string& name) const {
return ToGradNames(fwd_op_.Input(name));
}
std::vector<std::string> OutputGrad(const std::string& name) const {
return ToGradNames(fwd_op_.Output(name));
}
std::vector<std::string> InputNames() const {
return this->fwd_op_.InputNames();
}
std::vector<std::string> OutputNames() const {
return this->fwd_op_.OutputNames();
}
std::vector<std::string> Input(const std::string& name) const {
return fwd_op_.Input(name);
}
std::vector<std::string> Output(const std::string& name) const {
return fwd_op_.Output(name);
}
const std::unordered_map<std::string, Attribute>& Attrs() const {
return fwd_op_.GetAttrMap();
}
const Attribute& GetAttr(const std::string& name) const {
auto& map = fwd_op_.GetAttrMap();
auto it = map.find(name);
PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
return it->second;
}
std::string ForwardOpType() const { return this->fwd_op_.Type(); }
private:
const OpDescBind& fwd_op_;
};
class SingleGradOpDescMaker : public GradOpDescMakerBase {
public:
using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDescBind>> operator()() const {
std::vector<std::unique_ptr<OpDescBind>> retv;
retv.emplace_back(this->Apply());
return retv;
}
protected:
virtual std::unique_ptr<OpDescBind> Apply() const = 0;
};
class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
public:
using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
virtual std::unique_ptr<OpDescBind> Apply() const {
auto* grad = new OpDescBind();
grad->SetType(this->GradOpType());
for (auto& input_param : this->InputNames()) {
grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(GradVarName(input_param), this->InputGrad(input_param));
}
for (auto& output_param : this->OutputNames()) {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
}
grad->SetAttrMap(this->Attrs());
return std::unique_ptr<OpDescBind>(grad);
}
virtual std::string GradOpType() const {
return this->ForwardOpType() + "_grad";
}
};
} // namespace framework
} // namespace paddle
...@@ -15,7 +15,7 @@
#pragma once
#include <memory>
#ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
...@@ -29,7 +29,7 @@
namespace paddle {
namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
......
# Design Doc: LoD (Level-of-Detail) Tensor

Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus there is no need for padding zeros.

|                | TensorFlow | PaddlePaddle |
|----------------|------------|--------------|
| RNN            | Support    | Support      |
| recursive RNN  | Support    | Support      |
| padding zeros  | Must       | No need      |
| blob data type | Tensor     | LoDTensor    |

PaddlePaddle achieves this flexibility by passing a new data type, *LoD Tensor*, which is a Tensor attached with a segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor; it also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.

## The Challenge: Variable-length Sequences

Most deep learning systems represent a mini-batch as a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector. Suppose that all sentences have the same length L, we can represent this mini-batch by an NxLxD tensor.

Both examples show that the elements of sequences are usually of the same size. In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors. It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.

The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences. Also, sequences might consist of sub-sequences.

## A Solution: The LoD Index

To understand our solution, it is best to look at some examples.

### A Mini-Batch of Sentences

Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively. We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:

```
3   1 2
||| | ||
```

where each `|` represents a D-dimensional word vector. The numbers 3, 1, and 2 form a 1-level LoD.
### Recursive Sequences

Let's check another example, a 2-level LoD Tensor. Consider a mini-batch of three articles with 3, 1, and 2 sentences, respectively, where each sentence consists of a variable number of words:

```
3           1  2
3   2  4    1  2  3
||| || |||| |  || |||
```

### A Mini-Batch of Videos

LoD tensors generalize to the case where elements are higher-dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.

```
3     1  2
口口口 口 口口
```

The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.

### A Mini-Batch of Images

In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as follows:

```
1 1 1 1     1
口口口口 ... 口
```

In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:

```
口口口口 ... 口
```

### Model Parameters

A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
## The LoD Tensor

Let us revisit the above example of the 2-level LoD Tensor:

```
3           1  2
3   2  4    1  2  3
||| || |||| |  || |||
```

It is indeed a tree, where leaves are elementary sequences identified by **branches**.

For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.

### The LoD Index

We can save the LoD index of the above example

```
3 1 2
3 2 4 1 2 3
```

in a not-full 2D matrix:

```c++
typedef std::vector<std::vector<int> > LoD;
```

where

- `LoD.size()` is the number of levels, or the maximum length of branches,
- `LoD[i][j]` is the length of the j-th segment at the i-th level.
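For concreteness, the length-based LoD index of the running example can be written down directly with this data structure. Below is a minimal sketch in plain standard C++; only the `LoD` typedef above is assumed:

```c++
#include <iostream>
#include <vector>

typedef std::vector<std::vector<int> > LoD;

int main() {
  // 2-level LoD of the running example: 3 articles with 3, 1, and 2
  // sentences, and 6 sentences with 3, 2, 4, 1, 2, and 3 words.
  LoD lod = {{3, 1, 2}, {3, 2, 4, 1, 2, 3}};

  std::cout << "levels: " << lod.size() << "\n";    // prints 2
  std::cout << "LoD[1][2]: " << lod[1][2] << "\n";  // prints 4, the length
                                                    // of the <0,2>-branch
  return 0;
}
```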
## The Offset Representation

To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.

In the above example, we accumulate the lengths of the elementary sequences

```
3 2 4 1 2 3
```

into offsets

```
0  3  5   9   10  12   15
   =  =   =   =   =    =
   3  2+3 4+5 1+9 2+10 3+12
```

so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.

Similarly, the lengths in the top-level LoD

```
3 1 2
```

are transformed into offsets of elements/words as follows:

```
0 9 10 15
  =  =  =
  3+2+4 1+9 2+3+10
```

so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.

The complete offset representation is as follows:

```
0 9 10 15
0 3 5 9 10 12 15
||| || |||| | || |||
```
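The conversion from the length-based index to the offset-based index is a per-level prefix sum, with every level expressed in element (word) offsets. Here is a minimal sketch under that reading of the example; the helper name `LengthsToOffsets` is an illustrative choice, not an API from this patch:

```c++
#include <iostream>
#include <vector>

typedef std::vector<std::vector<int> > LoD;

// Convert a length-based LoD into the offset representation shown above.
// Level i's lengths count segments of level i+1; the last level's lengths
// count underlying elements (words). All resulting offsets are in words.
LoD LengthsToOffsets(const LoD& lengths) {
  LoD offsets(lengths.size());
  // The last level is a plain prefix sum over element counts.
  std::vector<int>& leaf = offsets.back();
  leaf.push_back(0);
  for (size_t j = 0; j < lengths.back().size(); ++j) {
    leaf.push_back(leaf.back() + lengths.back()[j]);
  }
  // Each upper-level segment covers a run of next-level segments; its end
  // offset is the word offset where that run ends.
  for (int i = static_cast<int>(lengths.size()) - 2; i >= 0; --i) {
    offsets[i].push_back(0);
    size_t pos = 0;  // current position in level i+1
    for (size_t j = 0; j < lengths[i].size(); ++j) {
      pos += lengths[i][j];
      offsets[i].push_back(offsets[i + 1][pos]);
    }
  }
  return offsets;
}

int main() {
  LoD lengths = {{3, 1, 2}, {3, 2, 4, 1, 2, 3}};
  LoD offsets = LengthsToOffsets(lengths);
  for (size_t i = 0; i < offsets.size(); ++i) {
    for (size_t j = 0; j < offsets[i].size(); ++j) {
      std::cout << offsets[i][j] << " ";
    }
    std::cout << "\n";  // prints "0 9 10 15" and "0 3 5 9 10 12 15"
  }
  return 0;
}
```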
## Slicing of LoD Tensors

When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences. Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.

For example, the <2>-slice of the above example is

```
10 15
10 12 15
|| |||
```

and the <2,0>-slice of the above slice is

```
10 12
||
```
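With offsets, taking a top-level slice is a lookup plus a copy of the enclosed part of the index -- no tree traversal is needed. The sketch below operates on the offset-based `LoD` built above; the helper `SliceTopLevel` is illustrative, not this patch's API:

```c++
#include <iostream>
#include <vector>

typedef std::vector<std::vector<int> > LoD;

// Take the <seq>-slice at the top level of an offset-based LoD.
// *begin and *end receive the word range covered by the slice.
LoD SliceTopLevel(const LoD& lod, int seq, int* begin, int* end) {
  *begin = lod[0][seq];
  *end = lod[0][seq + 1];
  LoD res(1);
  res[0].push_back(*begin);
  res[0].push_back(*end);
  // Keep, for every deeper level, the offsets falling inside [begin, end].
  for (size_t i = 1; i < lod.size(); ++i) {
    std::vector<int> level;
    for (size_t j = 0; j < lod[i].size(); ++j) {
      if (lod[i][j] >= *begin && lod[i][j] <= *end) {
        level.push_back(lod[i][j]);
      }
    }
    res.push_back(level);
  }
  return res;
}

int main() {
  LoD lod = {{0, 9, 10, 15}, {0, 3, 5, 9, 10, 12, 15}};
  int b, e;
  LoD slice = SliceTopLevel(lod, 2, &b, &e);  // the <2>-slice
  std::cout << "words [" << b << ", " << e << ")\n";  // words [10, 15)
  for (size_t i = 0; i < slice.size(); ++i) {
    for (size_t j = 0; j < slice[i].size(); ++j) {
      std::cout << slice[i][j] << " ";
    }
    std::cout << "\n";  // prints "10 15" and "10 12 15"
  }
  return 0;
}
```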
...@@ -18,6 +18,15 @@ limitations under the License. */
namespace paddle {
namespace framework {
OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs) {
op_desc_.set_type(type);
inputs_ = inputs;
outputs_ = outputs;
attrs_ = attrs;
}
OpDesc *OpDescBind::Proto() {
  Sync();
  return &op_desc_;
...@@ -31,11 +40,10 @@ const std::vector<std::string> &OpDescBind::Input(
  return it->second;
}
std::vector<std::string> OpDescBind::InputArgumentNames() const {
  std::vector<std::string> retv;
  for (auto &ipt : this->inputs_) {
    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
  }
  return retv;
}
...@@ -54,11 +62,10 @@ const std::vector<std::string> &OpDescBind::Output(
  return it->second;
}
std::vector<std::string> OpDescBind::OutputArgumentNames() const {
  std::vector<std::string> retv;
  for (auto &ipt : this->outputs_) {
    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
  }
  return retv;
}
...@@ -112,6 +119,42 @@ const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
  return attrs_;
}
void OpDescBind::Rename(const std::string &old_name,
const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
need_update_ = true;
}
struct SetAttrDescVisitor : public boost::static_visitor<void> {
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_;
void operator()(int v) const { attr_->set_i(v); }
void operator()(float v) const { attr_->set_f(v); }
void operator()(const std::string &v) const { attr_->set_s(v); }
void operator()(bool b) const { attr_->set_b(b); }
void operator()(const std::vector<int> &v) const {
VectorToRepeated(v, attr_->mutable_ints());
}
void operator()(const std::vector<float> &v) const {
VectorToRepeated(v, attr_->mutable_floats());
}
void operator()(const std::vector<std::string> &v) const {
VectorToRepeated(v, attr_->mutable_strings());
}
void operator()(const std::vector<bool> &v) const {
VectorToRepeated(v, attr_->mutable_bools());
}
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
};
void OpDescBind::Sync() { void OpDescBind::Sync() {
if (need_update_) { if (need_update_) {
this->op_desc_.mutable_inputs()->Clear(); this->op_desc_.mutable_inputs()->Clear();
...@@ -134,7 +177,8 @@ void OpDescBind::Sync() { ...@@ -134,7 +177,8 @@ void OpDescBind::Sync() {
attr_desc->set_name(attr.first); attr_desc->set_name(attr.first);
attr_desc->set_type( attr_desc->set_type(
static_cast<framework::AttrType>(attr.second.which() - 1)); static_cast<framework::AttrType>(attr.second.which() - 1));
boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second); SetAttrDescVisitor visitor(attr_desc);
boost::apply_visitor(visitor, attr.second);
} }
need_update_ = false; need_update_ = false;
......
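`SetAttrDescVisitor` above is a classic `boost::static_visitor`: one `operator()` overload per alternative of the `Attribute` variant, with `apply_visitor` selecting the overload that matches the runtime type. A minimal sketch of the same dispatch pattern, using C++17 `std::variant` instead of Boost so it compiles standalone (the alternatives are a reduced, illustrative subset):

```c++
#include <iostream>
#include <string>
#include <variant>
#include <vector>

// A reduced Attribute variant; the real one has more alternatives.
using Attribute = std::variant<int, float, std::string, std::vector<int>>;

// One operator() overload per alternative, as in SetAttrDescVisitor.
struct PrintVisitor {
  void operator()(int v) const { std::cout << "int: " << v << "\n"; }
  void operator()(float v) const { std::cout << "float: " << v << "\n"; }
  void operator()(const std::string& v) const {
    std::cout << "string: " << v << "\n";
  }
  void operator()(const std::vector<int>& v) const {
    std::cout << "ints of size " << v.size() << "\n";
  }
};

int main() {
  std::vector<Attribute> attrs = {42, 1.5f, std::string("relu"),
                                  std::vector<int>{1, 2, 3}};
  // std::visit picks the overload matching the runtime alternative,
  // just like boost::apply_visitor does in Sync() above.
  for (const auto& a : attrs) std::visit(PrintVisitor{}, a);
}
```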
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/type_defs.h"
#include "paddle/framework/var_desc.h" #include "paddle/framework/var_desc.h"
namespace paddle { namespace paddle {
...@@ -26,6 +27,11 @@ class BlockDescBind; ...@@ -26,6 +27,11 @@ class BlockDescBind;
class OpDescBind { class OpDescBind {
public: public:
OpDescBind() {}
OpDescBind(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs);
OpDesc *Proto(); OpDesc *Proto();
std::string Type() const { return op_desc_.type(); } std::string Type() const { return op_desc_.type(); }
...@@ -34,14 +40,14 @@ class OpDescBind { ...@@ -34,14 +40,14 @@ class OpDescBind {
const std::vector<std::string> &Input(const std::string &name) const; const std::vector<std::string> &Input(const std::string &name) const;
std::vector<std::string> InputNames() const; std::vector<std::string> InputArgumentNames() const;
void SetInput(const std::string &param_name, void SetInput(const std::string &param_name,
const std::vector<std::string> &args); const std::vector<std::string> &args);
const std::vector<std::string> &Output(const std::string &name) const; const std::vector<std::string> &Output(const std::string &name) const;
std::vector<std::string> OutputNames() const; std::vector<std::string> OutputArgumentNames() const;
void SetOutput(const std::string &param_name, void SetOutput(const std::string &param_name,
const std::vector<std::string> &args); const std::vector<std::string> &args);
...@@ -60,49 +66,52 @@ class OpDescBind { ...@@ -60,49 +66,52 @@ class OpDescBind {
void SetBlockAttr(const std::string &name, BlockDescBind &block); void SetBlockAttr(const std::string &name, BlockDescBind &block);
// Only be used in C++
void SetAttrMap(const std::unordered_map<std::string, Attribute> &attr_map);
Attribute GetAttr(const std::string &name) const; Attribute GetAttr(const std::string &name) const;
int GetBlockAttr(const std::string &name) const; int GetBlockAttr(const std::string &name) const;
void Rename(const std::string &old_name, const std::string &new_name);
// Only be used in C++ // Only be used in C++
const std::unordered_map<std::string, Attribute> &GetAttrMap() const; const AttributeMap &GetAttrMap() const;
private: // Only be used in C++
struct SetAttrDescVisitor : public boost::static_visitor<void> { void SetAttrMap(const AttributeMap &attr_map);
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_; std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
void operator()(int v) const { attr_->set_i(v); } std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
void operator()(float v) const { attr_->set_f(v); }
void operator()(const std::string &v) const { attr_->set_s(v); } void SetInputMap(const VariableNameMap &input) {
void operator()(bool b) const { attr_->set_b(b); } this->inputs_ = input;
this->need_update_ = true;
void operator()(const std::vector<int> &v) const {
VectorToRepeated(v, attr_->mutable_ints());
}
void operator()(const std::vector<float> &v) const {
VectorToRepeated(v, attr_->mutable_floats());
}
void operator()(const std::vector<std::string> &v) const {
VectorToRepeated(v, attr_->mutable_strings());
}
void operator()(const std::vector<bool> &v) const {
VectorToRepeated(v, attr_->mutable_bools());
} }
void operator()(BlockDesc *desc) const {
attr_->set_block_idx(desc->idx()); void SetOutputMap(const VariableNameMap &output) {
this->outputs_ = output;
this->need_update_ = true;
} }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
};
void Sync(); void Sync();
const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; }
private:
template <typename MapType>
static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
std::vector<typename MapType::key_type> ret_val;
ret_val.reserve(map.size());
std::transform(
map.begin(), map.end(), std::back_inserter(ret_val),
[](const typename MapType::value_type &pair) { return pair.first; });
return ret_val;
}
OpDesc op_desc_; OpDesc op_desc_;
std::unordered_map<std::string, std::vector<std::string>> inputs_; VariableNameMap inputs_;
std::unordered_map<std::string, std::vector<std::string>> outputs_; VariableNameMap outputs_;
std::unordered_map<std::string, Attribute> attrs_; AttributeMap attrs_;
// need_update_ indicates that there are local changes not yet synchronized. If // need_update_ indicates that there are local changes not yet synchronized. If
// local changes should be synchronized, need_update_ should be set to true. // local changes should be synchronized, need_update_ should be set to true.
......
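The renaming above separates two views of the same `VariableNameMap`: `InputNames()` now returns the parameter (slot) names, i.e. the map keys, while `InputArgumentNames()` flattens all argument names stored in the values. A standalone sketch of the difference, with hypothetical names `x0`/`x1`/`w0`:

```c++
#include <algorithm>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>

using VariableNameMap = std::map<std::string, std::vector<std::string>>;

// Parameter names ("W", "X"), i.e. the map keys -- like InputNames().
std::vector<std::string> MapKeys(const VariableNameMap& m) {
  std::vector<std::string> keys;
  keys.reserve(m.size());
  std::transform(m.begin(), m.end(), std::back_inserter(keys),
                 [](const VariableNameMap::value_type& p) { return p.first; });
  return keys;
}

// Flattened argument names ("w0", "x0", "x1") -- like InputArgumentNames().
std::vector<std::string> FlattenValues(const VariableNameMap& m) {
  std::vector<std::string> args;
  for (const auto& p : m)
    args.insert(args.end(), p.second.begin(), p.second.end());
  return args;
}

int main() {
  VariableNameMap inputs = {{"X", {"x0", "x1"}}, {"W", {"w0"}}};
  for (const auto& k : MapKeys(inputs)) std::cout << k << " ";       // W X
  std::cout << "\n";
  for (const auto& a : FlattenValues(inputs)) std::cout << a << " ";  // w0 x0 x1
  std::cout << "\n";
}
```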
...@@ -19,21 +19,18 @@ ...@@ -19,21 +19,18 @@
#include <unordered_map> #include <unordered_map>
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/op_desc.h"
#include "paddle/framework/type_defs.h"
#include "paddle/platform/macros.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class OperatorBase;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
struct OpInfo { struct OpInfo {
OpCreator creator_; OpCreator creator_;
std::string grad_op_type_; GradOpMakerFN grad_op_maker_;
OpProto* proto_; OpProto* proto_{nullptr};
OpAttrChecker* checker_; OpAttrChecker* checker_{nullptr};
bool HasOpProtoAndChecker() const { bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr; return proto_ != nullptr && checker_ != nullptr;
...@@ -46,30 +43,25 @@ struct OpInfo { ...@@ -46,30 +43,25 @@ struct OpInfo {
return *proto_; return *proto_;
} }
const OpAttrChecker& Checker() const {
PADDLE_ENFORCE_NOT_NULL(checker_,
"Operator Checker has not been registered");
return *checker_;
}
const OpCreator& Creator() const { const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(creator_, PADDLE_ENFORCE_NOT_NULL(creator_,
"Operator Creator has not been registered"); "Operator Creator has not been registered");
return creator_; return creator_;
} }
bool HasGradientOp() const { return !grad_op_type_.empty(); } const GradOpMakerFN& GradOpMaker() const {
PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
"Operator GradOpMaker has not been registered.");
return grad_op_maker_;
}
const OpAttrChecker* Checker() const { return checker_; }
}; };
class OpInfoMap { class OpInfoMap {
public: public:
static OpInfoMap& Instance(); static OpInfoMap& Instance();
OpInfoMap(const OpInfoMap& o) = delete;
OpInfoMap(OpInfoMap&& o) = delete;
OpInfoMap& operator=(const OpInfoMap& o) = delete;
OpInfoMap& operator=(OpInfoMap&& o) = delete;
bool Has(const std::string& op_type) const { bool Has(const std::string& op_type) const {
return map_.find(op_type) != map_.end(); return map_.find(op_type) != map_.end();
} }
...@@ -105,6 +97,8 @@ class OpInfoMap { ...@@ -105,6 +97,8 @@ class OpInfoMap {
private: private:
OpInfoMap() = default; OpInfoMap() = default;
std::unordered_map<std::string, const OpInfo> map_; std::unordered_map<std::string, const OpInfo> map_;
DISABLE_COPY_AND_ASSIGN(OpInfoMap);
}; };
} // namespace framework } // namespace framework
......
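`DISABLE_COPY_AND_ASSIGN` replaces the four hand-written deleted constructors and assignment operators that this hunk removes from `OpInfoMap`. Its real definition lives in `paddle/platform/macros.h`, which this diff does not show; a plausible sketch of the macro and of its use on a singleton:

```c++
#include <iostream>

// Hypothetical expansion -- the real definition is in paddle/platform/macros.h,
// which this diff does not show.
#define DISABLE_COPY_AND_ASSIGN(classname)         \
 private:                                          \
  classname(const classname&) = delete;            \
  classname(classname&&) = delete;                 \
  classname& operator=(const classname&) = delete; \
  classname& operator=(classname&&) = delete

class InfoMap {
 public:
  static InfoMap& Instance() {
    static InfoMap m;  // the single global instance
    return m;
  }

 private:
  InfoMap() = default;
  DISABLE_COPY_AND_ASSIGN(InfoMap);
};

int main() {
  InfoMap& m = InfoMap::Instance();
  // InfoMap copy = m;  // would not compile: copy constructor is deleted
  (void)m;
  std::cout << "singleton ok\n";
}
```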
...@@ -44,11 +44,6 @@ class OpProtoAndCheckerMaker { ...@@ -44,11 +44,6 @@ class OpProtoAndCheckerMaker {
var_->set_intermediate(true); var_->set_intermediate(true);
return *this; return *this;
} }
VariableBuilder& NotInGradient() {
var_->set_not_in_gradient(true);
return *this;
}
}; };
VariableBuilder AddInput(const std::string& name, const std::string& comment); VariableBuilder AddInput(const std::string& name, const std::string& comment);
......
...@@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp( ...@@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const std::string& type, const VariableNameMap& inputs, const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, AttributeMap attrs) { const VariableNameMap& outputs, AttributeMap attrs) {
auto& info = OpInfoMap::Instance().Get(type); auto& info = OpInfoMap::Instance().Get(type);
info.Checker().Check(attrs); if (info.Checker() != nullptr) {
info.Checker()->Check(attrs);
}
auto op = info.Creator()(type, inputs, outputs, attrs); auto op = info.Creator()(type, inputs, outputs, attrs);
return std::unique_ptr<OperatorBase>(op); return std::unique_ptr<OperatorBase>(op);
} }
...@@ -52,9 +54,15 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) { ...@@ -52,9 +54,15 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
return CreateOp(op_desc.type(), inputs, outputs, attrs); return CreateOp(op_desc.type(), inputs, outputs, attrs);
} }
std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) { std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops"); return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
return std::unique_ptr<OperatorBase>(BuildGradOp(&op)); op_desc.GetAttrMap());
}
std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
const OpDescBind& op_desc) {
auto& info = OpInfoMap::Instance().Get(op_desc.Type());
return info.grad_op_maker_(op_desc);
} }
} // namespace framework } // namespace framework
......
...@@ -21,49 +21,54 @@ limitations under the License. */ ...@@ -21,49 +21,54 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/details/op_registry.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/grad_op_desc_maker.h"
#include "paddle/framework/op_info.h" #include "paddle/framework/op_desc.h"
#include "paddle/framework/op_proto_maker.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Registrar {
public:
// In our design, various kinds of classes, e.g., operators and kernels,
// have their corresponding registry and registrar. The action of
// registration is in the constructor of a global registrar variable, which,
// however, is not used in the code that calls the framework package, and would
// be removed from the generated binary file by the linker. To avoid such
// removal, we add Touch to all registrar classes and make USE_OP macros to
// call this method. So, as long as the callee code calls USE_OP, the global
// registrar variable won't be removed by the linker.
void Touch() {}
};
template <typename... ARGS>
struct OperatorRegistrar : public Registrar {
explicit OperatorRegistrar(const char* op_type) : op_type(op_type) {
PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
"'%s' is registered more than once.", op_type);
static_assert(sizeof...(ARGS) != 0,
"OperatorRegistrar should be invoked at least by OpClass");
details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
OpInfoMap::Instance().Insert(op_type, info);
}
const char* op_type;
OpInfo info;
};
class OpRegistry { class OpRegistry {
public: public:
template <typename OpType, typename ProtoMakerType, typename GradOpType> template <typename OpType, typename ProtoMakerType, typename GradOpType>
static void RegisterOp(const std::string& op_type, static void RegisterOp(const std::string& op_type,
const std::string& grad_op_type) { const std::string& grad_op_type) {
PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
"'%s' is registered more than once.", op_type); reg.info.grad_op_type_ = grad_op_type;
OpInfo op_info;
op_info.creator_ = [](
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) {
return new OpType(type, inputs, outputs, attrs);
};
op_info.grad_op_type_ = grad_op_type;
if (std::type_index(typeid(ProtoMakerType)) !=
std::type_index(typeid(NOPMaker))) {
op_info.proto_ = new OpProto;
op_info.checker_ = new OpAttrChecker;
auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
maker.Validate();
op_info.proto_->set_type(op_type);
PADDLE_ENFORCE(
op_info.proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, op_info.proto_->InitializationErrorString());
} else {
op_info.proto_ = nullptr;
op_info.checker_ = nullptr;
}
OpInfoMap::Instance().Insert(op_type, op_info);
// register gradient op // register gradient op
if (!grad_op_type.empty()) { if (!grad_op_type.empty()) {
RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, ""); OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
} }
} }
...@@ -74,20 +79,10 @@ class OpRegistry { ...@@ -74,20 +79,10 @@ class OpRegistry {
static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc); static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op); static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
}; const OpDescBind& op_desc);
class Registrar { static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
public:
// In our design, various kinds of classes, e.g., operators and kernels,
// have their corresponding registry and registrar. The action of
// registration is in the constructor of a global registrar variable, which,
// however, is not used in the code that calls the framework package, and would
// be removed from the generated binary file by the linker. To avoid such
// removal, we add Touch to all registrar classes and make USE_OP macros to
// call this method. So, as long as the callee code calls USE_OP, the global
// registrar variable won't be removed by the linker.
void Touch() {}
}; };
template <typename OpType, typename ProtoMakerType, typename GradOpType> template <typename OpType, typename ProtoMakerType, typename GradOpType>
...@@ -145,33 +140,41 @@ class OpKernelRegistrar : public Registrar { ...@@ -145,33 +140,41 @@ class OpKernelRegistrar : public Registrar {
__test_global_namespace_##uniq_name##__>::value, \ __test_global_namespace_##uniq_name##__>::value, \
msg) msg)
/** #define REGISTER_OPERATOR(op_type, op_class, ...) \
* Macro to register Operator.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \ __reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \
class _OpClass_##op_type##_ : public op_class { \ class _OpClass_##op_type##_ : public op_class { \
public: \ public: \
DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \ DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_); \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \
}; \ }; \
class _OpGradClass_##op_type##_ : public grad_op_class { \ static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
public: \ ##__VA_ARGS__> \
DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_); \ __op_registrar_##op_type##__(#op_type); \
DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class); \
}; \
static ::paddle::framework::OpRegistrar< \
_OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_> \
__op_registrar_##op_type##__(#op_type, #grad_op_type); \
int TouchOpRegistrar_##op_type() { \ int TouchOpRegistrar_##op_type() { \
__op_registrar_##op_type##__.Touch(); \ __op_registrar_##op_type##__.Touch(); \
return 0; \ return 0; \
} }
/**
* Macro to register Operator.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker { \
using ::paddle::framework::DefaultGradOpDescMaker::DefaultGradOpDescMaker; \
\
protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \
}; \
REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
op_maker_class);
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP) REGISTER_OPERATOR(op_type, op_class, op_maker_class)
/** /**
* Macro to register OperatorKernel. * Macro to register OperatorKernel.
...@@ -218,7 +221,7 @@ class OpKernelRegistrar : public Registrar { ...@@ -218,7 +221,7 @@ class OpKernelRegistrar : public Registrar {
// TODO(fengjiayi): The following macros // TODO(fengjiayi): The following macros
// seems ugly, do we have better method? // seems ugly, do we have better method?
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
#else #else
#define USE_OP_KERNEL(op_type) \ #define USE_OP_KERNEL(op_type) \
......
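The comment on `Registrar` explains the whole trick: registration happens in the constructor of a global object, and `Touch()` exists only so that `USE_OP`-style macros can reference that object and keep the linker from discarding it. A toy standalone version of the pattern (names such as `Registry` and `TouchOpRegistrar_cos` are illustrative, not the real API):

```c++
#include <functional>
#include <iostream>
#include <map>
#include <string>

// A toy registry: op name -> a callable standing in for the op.
std::map<std::string, std::function<void()>>& Registry() {
  static std::map<std::string, std::function<void()>> r;
  return r;
}

struct Registrar {
  Registrar(const std::string& name, std::function<void()> fn) {
    Registry()[name] = std::move(fn);  // registration happens at startup
  }
  void Touch() {}  // no-op anchor, mirrors Registrar::Touch above
};

static Registrar cos_registrar("cos", [] { std::cout << "run cos op\n"; });

// What a USE_OP-style macro would expand to: a function that touches the
// registrar, forcing the object file defining it to be linked in.
int TouchOpRegistrar_cos() {
  cos_registrar.Touch();
  return 0;
}

int main() {
  TouchOpRegistrar_cos();
  Registry().at("cos")();  // prints "run cos op"
}
```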
...@@ -173,3 +173,14 @@ TEST(OpRegistry, CustomChecker) { ...@@ -173,3 +173,14 @@ TEST(OpRegistry, CustomChecker) {
int test_attr = op->Attr<int>("test_attr"); int test_attr = op->Attr<int>("test_attr");
ASSERT_EQ(test_attr, 4); ASSERT_EQ(test_attr, 4);
} }
class CosineOpComplete : public paddle::framework::CosineOp {
public:
DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp);
DEFINE_OP_CLONE_METHOD(CosineOpComplete);
};
TEST(OperatorRegistrar, Test) {
using namespace paddle::framework;
OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
}
...@@ -25,7 +25,7 @@ Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< ...@@ -25,7 +25,7 @@ Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
return *device_context_.GetEigenDevice<platform::CPUPlace>(); return *device_context_.GetEigenDevice<platform::CPUPlace>();
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <> template <>
Eigen::GpuDevice& Eigen::GpuDevice&
ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const { ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
...@@ -245,5 +245,12 @@ std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>( ...@@ -245,5 +245,12 @@ std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
return res; return res;
} }
std::ostream& operator<<(std::ostream& os,
const OperatorWithKernel::OpKernelKey& kernel_key) {
os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
<< "]";
return os;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "op_info.h" #include "op_info.h"
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/block_desc.h"
#include "paddle/framework/data_type.h" #include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
...@@ -317,46 +318,170 @@ class ExecutionContext : public InferShapeContext { ...@@ -317,46 +318,170 @@ class ExecutionContext : public InferShapeContext {
const platform::DeviceContext& device_context_; const platform::DeviceContext& device_context_;
}; };
class CompileTimeInferShapeContext : public InferShapeContextBase {
public:
CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
: op_(op), block_(block) {}
bool HasInput(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
auto length = input_names.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(input_names[0]);
}
bool HasOutput(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
auto length = output_names.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Output(%s) should have only one value, "
"but it have %d now",
name, length);
return block_.HasVar(output_names[0]);
}
bool HasInputs(const std::string& name) const override {
const std::vector<std::string>& input_names = op_.Input(name);
PADDLE_ENFORCE(!input_names.empty(), "Inputs(%s) length is 0", name);
for (auto& input : input_names) {
if (!block_.HasVar(input)) return false;
}
return true;
}
bool HasOutputs(const std::string& name) const override {
const std::vector<std::string>& output_names = op_.Output(name);
PADDLE_ENFORCE(!output_names.empty(), "Outputs(%s) length is 0", name);
for (auto& output : output_names) {
if (!block_.HasVar(output)) return false;
}
return true;
}
DDim GetInputDim(const std::string& name) const override {
std::vector<DDim> ddims = GetInputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Input(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void SetInputDim(const std::string& name, const DDim& dim) override {
SetInputsDim(name, {dim});
}
DDim GetOutputDim(const std::string& name) const override {
std::vector<DDim> ddims = GetOutputsDim(name);
auto length = ddims.size();
PADDLE_ENFORCE_EQ(length, 1UL,
"Output(%s) should have 1 value, "
"but it has %d now",
name, length);
return ddims[0];
}
void SetOutputDim(const std::string& name, const DDim& dim) override {
SetOutputsDim(name, {dim});
}
AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); }
const std::vector<std::string>& Inputs(
const std::string& name) const override {
return op_.Input(name);
}
const std::vector<std::string>& Outputs(
const std::string& name) const override {
return op_.Output(name);
}
private:
DDim GetDim(const std::string& name) const override {
return framework::make_ddim(block_.Var(name)->Shape());
}
void SetDim(const std::string& name, const DDim& dim) override {
block_.Var(name)->SetShape(framework::vectorize(dim));
}
const OpDescBind& op_;
const BlockDescBind& block_;
};
class RuntimeInferShapeContext : public InferShapeContextBase { class RuntimeInferShapeContext : public InferShapeContextBase {
public: public:
RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
: op_(op), scope_(scope) {} : op_(op), scope_(scope) {}
bool HasInput(const std::string& name) const { bool HasInput(const std::string& name) const override {
auto ipt = op_.Input(name); auto ipt = op_.Input(name);
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr; return var != nullptr;
} }
bool HasOutput(const std::string& name) const { bool HasOutput(const std::string& name) const override {
auto ipt = op_.Output(name); auto ipt = op_.Output(name);
auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
return var != nullptr; return var != nullptr;
} }
DDim GetInputDim(const std::string& name) const { bool HasInputs(const std::string& name) const override {
auto inputs = op_.Inputs(name);
if (inputs.empty()) {
return false;
}
for (auto& input : inputs) {
if (scope_.FindVar(input) == nullptr) {
return false;
}
}
return true;
}
bool HasOutputs(const std::string& name) const override {
auto outputs = op_.Outputs(name);
if (outputs.empty()) {
return false;
}
for (auto& output : outputs) {
if (scope_.FindVar(output) == nullptr) {
return false;
}
}
return true;
}
DDim GetInputDim(const std::string& name) const override {
return GetDim(op_.Input(name)); return GetDim(op_.Input(name));
} }
void SetInputDim(const std::string& name, const DDim& dim) { void SetInputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Input(name), dim); SetDim(op_.Input(name), dim);
} }
DDim GetOutputDim(const std::string& name) const { DDim GetOutputDim(const std::string& name) const override {
return GetDim(op_.Output(name)); return GetDim(op_.Output(name));
} }
void SetOutputDim(const std::string& name, const DDim& dim) { void SetOutputDim(const std::string& name, const DDim& dim) override {
SetDim(op_.Output(name), dim); SetDim(op_.Output(name), dim);
} }
AttrReader Attrs() const { return AttrReader(op_.Attrs()); } AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
const std::vector<std::string>& Inputs(const std::string& name) const { const std::vector<std::string>& Inputs(
const std::string& name) const override {
return op_.Inputs(name); return op_.Inputs(name);
} }
const std::vector<std::string>& Outputs(const std::string& name) const { const std::vector<std::string>& Outputs(
const std::string& name) const override {
return op_.Outputs(name); return op_.Outputs(name);
} }
...@@ -377,11 +502,11 @@ class RuntimeInferShapeContext : public InferShapeContextBase { ...@@ -377,11 +502,11 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
return t; return t;
} }
DDim GetDim(const std::string& name) const { DDim GetDim(const std::string& name) const override {
return GetTensor<false>(name)->dims(); return GetTensor<false>(name)->dims();
} }
void SetDim(const std::string& name, const DDim& dim) { void SetDim(const std::string& name, const DDim& dim) override {
GetTensor<true>(name)->Resize(dim); GetTensor<true>(name)->Resize(dim);
} }
...@@ -452,9 +577,25 @@ class OperatorWithKernel : public OperatorBase { ...@@ -452,9 +577,25 @@ class OperatorWithKernel : public OperatorBase {
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
ExecutionContext ctx(*this, scope, dev_ctx); ExecutionContext ctx(*this, scope, dev_ctx);
auto& opKernel = AllOpKernels().at(type_).at(
OpKernelKey(IndicateDataType(ctx), dev_ctx)); // check if op[type] has kernel registered.
opKernel->Compute(ctx); auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
if (kernels_iter == all_op_kernels.end()) {
PADDLE_THROW("op[%s] has no kernel", type_);
}
// check if op[type] has a kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second;
auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
auto kernel_iter = kernels.find(kernel_key);
if (kernel_iter == kernels.end()) {
PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_,
kernel_key);
}
kernel_iter->second->Compute(ctx);
} }
static std::unordered_map<std::string /* op_type */, OpKernelMap>& static std::unordered_map<std::string /* op_type */, OpKernelMap>&
...@@ -471,9 +612,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -471,9 +612,9 @@ class OperatorWithKernel : public OperatorBase {
}); });
} }
protected:
virtual void InferShape(InferShapeContextBase* ctx) const = 0; virtual void InferShape(InferShapeContextBase* ctx) const = 0;
protected:
// indicate kernel DataType by input data. By default, all input data must be // indicate kernel DataType by input data. By default, all input data must be
// the same type. // the same type.
virtual DataType IndicateDataType(const ExecutionContext& ctx) const { virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
...@@ -503,5 +644,8 @@ class OperatorWithKernel : public OperatorBase { ...@@ -503,5 +644,8 @@ class OperatorWithKernel : public OperatorBase {
} }
}; };
std::ostream& operator<<(std::ostream& os,
const OperatorWithKernel::OpKernelKey& kernel_key);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
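The rewritten `Run()` above replaces a bare `at()` lookup, which would throw an opaque exception, with an explicit two-level search: op type to kernel map, then `(place, data type)` key to kernel, each level with its own error message. A toy sketch of that dispatch, using `std::map` and strings in place of the real `OpKernelKey` and kernel objects:

```c++
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Toy stand-ins: the real OpKernelKey is (place, data type); kernels are
// represented here by their names instead of callable objects.
using OpKernelKey = std::pair<std::string, std::string>;
using OpKernelMap = std::map<OpKernelKey, std::string>;

int main() {
  std::map<std::string, OpKernelMap> all_op_kernels = {
      {"mul",
       {{{"CPU", "float"}, "MulKernel<CPU, float>"},
        {{"GPU", "float"}, "MulKernel<GPU, float>"}}}};

  // level 1: does the op type have any kernels at all?
  auto kernels_iter = all_op_kernels.find("mul");
  if (kernels_iter == all_op_kernels.end()) {
    std::cerr << "op[mul] has no kernel\n";
    return 1;
  }

  // level 2: does it have a kernel for this (place, data type) key?
  OpKernelKey key{"CPU", "float"};
  auto kernel_iter = kernels_iter->second.find(key);
  if (kernel_iter == kernels_iter->second.end()) {
    std::cerr << "op[mul] has no kernel with this kernel_key\n";
    return 1;
  }
  std::cout << "dispatch to " << kernel_iter->second << "\n";
}
```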
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/platform/macros.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -26,9 +27,6 @@ class ProgramDescBind { ...@@ -26,9 +27,6 @@ class ProgramDescBind {
public: public:
static ProgramDescBind &Instance(ProgramDesc *prog); static ProgramDescBind &Instance(ProgramDesc *prog);
ProgramDescBind(const ProgramDescBind &o) = delete;
ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
BlockDescBind *AppendBlock(const BlockDescBind &parent); BlockDescBind *AppendBlock(const BlockDescBind &parent);
BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); } BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
...@@ -46,6 +44,8 @@ class ProgramDescBind { ...@@ -46,6 +44,8 @@ class ProgramDescBind {
ProgramDesc *prog_; ProgramDesc *prog_;
std::vector<std::unique_ptr<BlockDescBind>> blocks_; std::vector<std::unique_ptr<BlockDescBind>> blocks_;
DISABLE_COPY_AND_ASSIGN(ProgramDescBind);
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include "paddle/framework/variable.h" #include "paddle/framework/variable.h"
#include "paddle/platform/macros.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -38,11 +39,6 @@ class Scope { ...@@ -38,11 +39,6 @@ class Scope {
Scope() {} Scope() {}
~Scope(); ~Scope();
// Disable Copy, Assign, Move.
Scope(const Scope& other) = delete;
Scope& operator=(const Scope& other) = delete;
Scope(Scope&& other) = delete;
/// Create a sub-scope. Returns a reference other than a pointer so /// Create a sub-scope. Returns a reference other than a pointer so
/// to prevent from manual deletion. /// to prevent from manual deletion.
/// Mark it to const because that new kid scope cannot change parent scope. /// Mark it to const because that new kid scope cannot change parent scope.
...@@ -73,6 +69,8 @@ class Scope { ...@@ -73,6 +69,8 @@ class Scope {
std::unordered_map<std::string, Variable*> vars_; std::unordered_map<std::string, Variable*> vars_;
mutable std::list<Scope*> kids_; mutable std::list<Scope*> kids_;
Scope const* parent_{nullptr}; Scope const* parent_{nullptr};
DISABLE_COPY_AND_ASSIGN(Scope);
}; };
} // namespace framework } // namespace framework
......
...@@ -19,11 +19,18 @@ limitations under the License. */ ...@@ -19,11 +19,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// TODO(longfei): Once both CompileTimeInferShapeContext and
// RuntimeInferShapeContext get merged, we can rename InferShapeContextBase to
// InferShapeContext so as to replace the current InferShapeContext.
class InferShapeContextBase { class InferShapeContextBase {
public: public:
virtual ~InferShapeContextBase() {} virtual ~InferShapeContextBase() {}
virtual bool HasInput(const std::string &name) const = 0; virtual bool HasInput(const std::string &name) const = 0;
virtual bool HasOutput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0;
virtual bool HasInputs(const std::string &name) const = 0;
virtual bool HasOutputs(const std::string &name) const = 0;
virtual framework::DDim GetInputDim(const std::string &name) const = 0; virtual framework::DDim GetInputDim(const std::string &name) const = 0;
std::vector<framework::DDim> GetInputsDim(const std::string &name) const { std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
const std::vector<std::string> &names = Inputs(name); const std::vector<std::string> &names = Inputs(name);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_array.h"
#include <glog/logging.h>
#include <algorithm>
#include <limits>
namespace paddle {
namespace framework {
namespace detail {
/*
 * Offers an iterator over the length-sorted lod-tensor's top level. The top
 * level of a lod-tensor stores a batch of sequences, and each top-level
 * sequence may contain several lower-level sequences. Sorting the top-level
 * lod by the number of lower-level sequences in descending order ensures that
 * while the RNN runs the batch size keeps decreasing, so the short sentences
 * end at the tail of each batch.
*
* Let's take a simple lod-tensor for example
*
* |(0) |(1) top-level has two instances
* ||| ||||| lower-level
*
* sort by lower-level's length
*
* |(1) |(0)
* ||||| |||
*
 * when the RNN runs, it gets 5 batches (equal to the number of elements in the
 * longest sequence)
*
* |||||
* |||
*
 * the first three batches have two elements each, while the last two batches
 * have just one element each.
*/
struct DynamicBatchUnpacker {
using value_type = float;
DynamicBatchUnpacker(const LoDTensor& source, size_t level,
bool descend = true)
: source(&source), level(level) {
BuildLengthSortedMeta(descend);
}
LoDTensor GetBatch(size_t index);
std::vector<DySeqMeta> meta;
LoDTensor const* source;
size_t level;
protected:
void BuildLengthSortedMeta(bool descend);
};
LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
const std::vector<DySeqMeta>& meta, const LoD& lod,
size_t level);
} // namespace detail
const LoDTensor& TensorArray::Read(size_t index) const {
PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
if (index >= size()) {
values_.resize(index + 1);
}
return values_[index];
}
void TensorArray::Write(size_t index, const LoDTensor& value) {
PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
if (index >= size()) {
values_.resize(index + 1);
}
values_[index].Resize(value.dims());
values_[index].mutable_data<value_type>(platform::CPUPlace());
values_[index].CopyFrom<value_type>(value, platform::CPUPlace());
}
void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index);
if (index >= size()) {
values_.resize(index + 1);
}
values_[index].ShareDataWith<value_type>(value);
}
LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
const LoD& lod) const {
return detail::PackDynamicBatch(values_, meta, lod, level);
}
std::vector<DySeqMeta> TensorArray::Unpack(const LoDTensor& source, int level,
bool length_descend) {
detail::DynamicBatchUnpacker unpacker(source, level,
length_descend /*descend*/);
// find max length of all the sequences
size_t max_length = 0;
for (const auto& seq : unpacker.meta) {
max_length = std::max(max_length, seq.end - seq.begin);
}
// write batches to values
for (size_t batch_id = 0; batch_id < max_length; batch_id++) {
Write(batch_id, unpacker.GetBatch(batch_id));
}
return unpacker.meta;
}
LoDTensor TensorArray::Stack() const {
LoDTensor result;
if (size() == 0) return result;
const auto& first_dims = values_.front().dims();
// check all the values have the same shape
// TODO(superjom) check the same dtypes
for (size_t idx = 1; idx < size(); idx++) {
const auto& value_dims = values_[idx].dims();
PADDLE_ENFORCE_EQ(first_dims, value_dims);
}
// copy
auto result_dims = vectorize(first_dims);
result_dims.insert(result_dims.begin(), size());
result.Resize(make_ddim(result_dims));
result.mutable_data<value_type>(platform::CPUPlace());
for (size_t idx = 0; idx < size(); idx++) {
result.Slice<value_type>(idx, idx + 1)
.CopyFrom<value_type>(Read(idx), platform::CPUPlace());
}
return result;
}
void TensorArray::Unstack(const LoDTensor& source) const {
Unstack(source, false /*data_shared*/);
}
void TensorArray::UnstackShared(const LoDTensor& source) const {
Unstack(source, true /*data_shared*/);
}
void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
size_t first_dim = source.dims()[0];
DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
PADDLE_ENFORCE_GT(first_dim, 0,
"source should have some data to be unstacked");
values_.resize(first_dim);
for (size_t elem = 0; elem < first_dim; elem++) {
// create a new value
auto& value = values_[elem];
if (data_shared) {
// share memory
value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
} else {
// copy
value.Resize(value_dims);
value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
platform::CPUPlace());
}
}
}
size_t TensorArray::size() const { return values_.size(); }
namespace detail {
void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
// collect meta for each sequence in some level
auto lod = SliceLevels(source->lod(), level, level + 1)[0];
for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) {
DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id});
meta.push_back(seq_meta);
}
PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
// sort by length
  std::sort(meta.begin(), meta.end(),
            [descend](const DySeqMeta& a, const DySeqMeta& b) {
              // strict comparisons in both directions keep std::sort's strict
              // weak ordering valid when two sequences have the same length
              return descend ? (a.end - a.begin) > (b.end - b.begin)
                             : (a.end - a.begin) < (b.end - b.begin);
            });
}
LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
PADDLE_ENFORCE(!meta.empty(), "should build meta first");
LoDTensor result;
// collect the indices that need to be copied into the batch
std::vector<size_t> indice;
for (const auto& seq : meta) {
size_t id = seq.begin + index;
if (id >= seq.end) break;
indice.push_back(id);
}
PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index);
// copy the indexed records from the LoDTensor
auto record_dims = slice_ddim(source->dims(), 1, source->dims().size());
auto record_dims_vec = vectorize(record_dims);
record_dims_vec.insert(record_dims_vec.begin(), indice.size());
result.Resize(make_ddim(record_dims_vec));
result.mutable_data<value_type>(platform::CPUPlace());
for (size_t i = 0; i < indice.size(); i++) {
auto index = indice[i];
auto target = result.Slice<value_type>(i, i + 1);
auto source_ = source->Slice<value_type>(index, index + 1);
target.CopyFrom<value_type>(source_, platform::CPUPlace());
}
return result;
}
// TODO(superjom): cache lod if reasonable
LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
const std::vector<DySeqMeta>& meta, const LoD& lod,
size_t level) {
PADDLE_ENFORCE(!source.empty());
PADDLE_ENFORCE(!meta.empty());
PADDLE_ENFORCE(!lod.empty());
LoDTensor result;
// init result space
auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
auto record_dims_vec = vectorize(record_dims);
auto height = lod[level].back();
record_dims_vec.insert(record_dims_vec.begin(), height);
result.Resize(make_ddim(record_dims_vec));
result.mutable_data<float>(platform::CPUPlace());
for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) {
const auto& seq_meta = meta[seq_id];
// source is source[batch_id][seq_id]
// target is result[index]
auto index = seq_meta.begin + batch_id;
if (index >= seq_meta.end) break;
auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
auto target = result.Slice<float>(index, index + 1);
target.CopyFrom<float>(source_, platform::CPUPlace());
}
}
result.set_lod(lod);
return result;
}
} // namespace detail
} // namespace framework
} // namespace paddle
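To make the length-sorted batching performed by `DynamicBatchUnpacker` concrete, here is a standalone sketch (plain C++, toy `SeqMeta` in place of `DySeqMeta`) with a single-level offset LoD of two sequences of lengths 3 and 5; note how the batch shrinks once the shorter sequence ends, exactly as the comment at the top of the file describes:

```c++
#include <algorithm>
#include <iostream>
#include <vector>

// Toy SeqMeta mirroring DySeqMeta: [begin, end) plus the original index.
struct SeqMeta {
  size_t begin, end, ori_idx;
};

int main() {
  // One-level offset LoD with two sequences: [0, 3) and [3, 8).
  std::vector<size_t> lod = {0, 3, 8};

  std::vector<SeqMeta> meta;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    meta.push_back({lod[i], lod[i + 1], i});

  // Longest sequence first, as BuildLengthSortedMeta(descend = true) does.
  std::sort(meta.begin(), meta.end(), [](const SeqMeta& a, const SeqMeta& b) {
    return a.end - a.begin > b.end - b.begin;
  });

  size_t max_len = meta.front().end - meta.front().begin;
  for (size_t step = 0; step < max_len; ++step) {
    std::cout << "batch " << step << ":";
    for (const auto& s : meta) {
      if (s.begin + step >= s.end) break;  // shorter sequences have ended
      std::cout << " elem#" << s.begin + step;
    }
    std::cout << "\n";  // the batch size shrinks toward the end
  }
}
```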
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/framework/lod_tensor.h"
namespace paddle {
namespace framework {
/*
 * DySeqMeta stores the indices of a basic element in a tensor. It is used
 * after a lod-tensor's re-assembling; its info can be used to recover the
 * order in the original lod-tensor.
*/
struct DySeqMeta {
DySeqMeta(size_t begin, size_t end, size_t ori_idx)
: begin(begin), end(end), ori_idx(ori_idx) {}
size_t begin;
size_t end; // not included
size_t ori_idx;
};
/*
 * TensorArray is a C-array-like array of tensors; it is meant to be used with
 * dynamic iteration primitives such as while_loop. It is used to segment inputs
 * and store states in all time steps.
 *
 * By providing some methods similar to a C++ array, the definition of some
 * state-based dynamic models such as RNN could be more natural and highly
 * flexible.
*/
class TensorArray {
public:
using value_type = float;
// max number of values allowed to store.
const size_t MAX_SIZE{100000};
/*
* Read the value at location `index` in the `TensorArray`.
*/
const LoDTensor &Read(size_t index) const;
/*
* Write value into the index of the TensorArray.
*/
void Write(size_t index, const LoDTensor &value);
/*
* Write value into the index of the TensorArray, with memory shared.
*/
void WriteShared(size_t index, const LoDTensor &value);
/*
* Recover the original LoD-arranged LoDTensor with the `values`, `level` and
* `indice_map`.
*/
LoDTensor Pack(size_t level, const std::vector<DySeqMeta> &meta,
const LoD &lod) const;
/*
 * Split the LoDTensor at some `level` and write the generated batches to
 * `values`; if `descend` is set, sort by length in descending order, else in
 * ascending order.
*/
std::vector<DySeqMeta> Unpack(const LoDTensor &source, int level,
bool length_desend);
/*
* Pack the values into a tensor with rank one higher than each tensor in
* values.
*/
LoDTensor Stack() const;
/*
* Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors.
*/
void Unstack(const LoDTensor &source) const;
/*
* Unpacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors,
* with memory of tensors shared.
*/
void UnstackShared(const LoDTensor &source) const;
/*
* Return the number of values.
*/
size_t size() const;
protected:
void Unstack(const LoDTensor &source, bool data_shared) const;
private:
mutable std::vector<LoDTensor> values_;
}; // class TensorArray
} // namespace framework
} // namespace paddle
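A sketch of the intended call pattern for the API declared above, mirroring the unstack/read/write/stack flow that the tests below exercise. This is not compiled here against the real headers, and the per-step computation is elided:

```c++
#include "paddle/framework/tensor_array.h"

namespace paddle {
namespace framework {

// Unstack a [steps x dim] tensor into per-step tensors, visit each step,
// and stack the results back into a single tensor.
LoDTensor StepLoop(const LoDTensor& input) {
  TensorArray states;
  states.Unstack(input);  // one rank-1-lower tensor per step

  TensorArray outputs;
  for (size_t t = 0; t < states.size(); ++t) {
    const LoDTensor& x_t = states.Read(t);
    outputs.Write(t, x_t);  // a real step function would compute here
  }
  return outputs.Stack();  // rank goes back up by one
}

}  // namespace framework
}  // namespace paddle
```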
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/tensor_array.h"
#include <gtest/gtest.h>
namespace paddle {
namespace framework {
class TensorArrayTester : public ::testing::Test {
protected:
void SetUp() override {
LoDTensor source;
source.Resize(make_ddim({batch_size, dim}));
int* data = source.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 16 * 32; i++) {
data[i] = i;
}
ta.Unstack(source);
}
TensorArray ta;
const int batch_size = 16;
const int dim = 32;
};
TEST_F(TensorArrayTester, Read) {
for (int i = 0; i < batch_size; i++) {
const auto& tensor = ta.Read(i);
ASSERT_EQ(tensor.dims()[0], 1);
ASSERT_EQ(tensor.dims()[1], dim);
}
}
TEST_F(TensorArrayTester, Write) {
LoDTensor source;
source.Resize(make_ddim({1, dim}));
for (int i = 0; i < dim; i++) {
*(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
}
ta.Write(2, source);
const auto& tensor = ta.Read(2);
for (int i = 0; i < dim; i++) {
EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
}
}
TEST_F(TensorArrayTester, WriteShared) {
LoDTensor source;
source.Resize(make_ddim({1, dim}));
for (int i = 0; i < dim; i++) {
*(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
}
ta.WriteShared(2, source);
const auto& tensor = ta.Read(2);
for (int i = 0; i < dim; i++) {
EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
}
EXPECT_EQ(source.data<int>(), tensor.data<int>());
}
class TensorArrayPackTester : public ::testing::Test {
protected:
virtual void SetUp() override {
lod.push_back(std::vector<size_t>{0, 2, 9, 13});
source.set_lod(lod);
source.Resize(make_ddim({13, 128}));
source.mutable_data<int>(platform::CPUPlace());
// content of each sentence: 0 1 2 3 4
const auto& level = lod.front();
for (size_t i = 0; i < level.size() - 1; i++) {
size_t begin = level[i];
size_t end = level[i + 1];
for (size_t j = begin; j < end; j++) {
auto record = source.Slice<int>(j, j + 1);
for (int dim = 0; dim < 128; dim++) {
record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
}
}
}
// unpack
meta = ta.Unpack(source, 0, true);
}
LoD lod;
TensorArray ta;
LoDTensor source;
std::vector<DySeqMeta> meta;
};
TEST_F(TensorArrayPackTester, Unpack) {
ASSERT_EQ(ta.size(), 7UL);
const auto& t0 = ta.Read(0);
const auto& t1 = ta.Read(1);
ASSERT_EQ(t0.data<int>()[0], int(0));
ASSERT_EQ(t1.data<int>()[0], int(1));
}
TEST_F(TensorArrayPackTester, Pack) {
LoDTensor packed = ta.Pack(0, meta, lod);
}
TEST_F(TensorArrayTester, size) {
ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
}
} // namespace framework
} // namespace paddle
...@@ -65,7 +65,7 @@ inline T* Tensor::mutable_data(platform::Place place) { ...@@ -65,7 +65,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>( holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size)); boost::get<platform::CPUPlace>(place), size));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
} }
#else #else
...@@ -103,7 +103,7 @@ inline void Tensor::CopyFrom(const Tensor& src, ...@@ -103,7 +103,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size); boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && else if (platform::is_gpu_place(src_place) &&
platform::is_cpu_place(dst_place)) { platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......
...@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) { ...@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) {
EXPECT_EQ(p1, p2); EXPECT_EQ(p1, p2);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; Tensor src_tensor;
float* p1 = nullptr; float* p1 = nullptr;
...@@ -126,7 +126,7 @@ TEST(Tensor, ShareDataWith) { ...@@ -126,7 +126,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; Tensor src_tensor;
Tensor dst_tensor; Tensor dst_tensor;
...@@ -163,7 +163,7 @@ TEST(Tensor, Slice) { ...@@ -163,7 +163,7 @@ TEST(Tensor, Slice) {
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace()); src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
...@@ -218,7 +218,7 @@ TEST(Tensor, CopyFrom) { ...@@ -218,7 +218,7 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]); EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
} }
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
{ {
Tensor src_tensor; Tensor src_tensor;
Tensor gpu_tensor; Tensor gpu_tensor;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <map>
#include "paddle/platform/variant.h"
namespace paddle {
namespace framework {
class OperatorBase;
class OpDescBind;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
// The order should be as same as framework.proto
using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*>;
using AttributeMap = std::unordered_map<std::string, Attribute>;
using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
using GradOpMakerFN =
std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>;
} // namespace framework
} // namespace paddle
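A sketch of how these typedefs are used in practice: an `AttributeMap` stores heterogeneous values in the `Attribute` variant, and callers read them back with `boost::get<T>`, which throws on a type mismatch. `BlockDesc*` is omitted here so the sketch compiles standalone with only Boost:

```c++
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include <boost/variant.hpp>

// The Attribute variant as above, minus BlockDesc* (omitted so this
// sketch is self-contained).
using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
                   std::vector<bool>>;
using AttributeMap = std::unordered_map<std::string, Attribute>;

int main() {
  AttributeMap attrs;
  attrs["scale"] = 2.0f;                   // stores the float alternative
  attrs["axes"] = std::vector<int>{0, 2};  // stores the vector<int> alternative

  float scale = boost::get<float>(attrs["scale"]);
  auto axes = boost::get<std::vector<int>>(attrs["axes"]);
  std::cout << "scale=" << scale << ", #axes=" << axes.size() << "\n";
  // boost::get<int>(attrs["scale"]) would throw boost::bad_get.
}
```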
...@@ -194,7 +194,7 @@ public: ...@@ -194,7 +194,7 @@ public:
REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward); REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward); REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward); REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward); REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
#endif #endif
......
...@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward, ...@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
REGISTER_TYPED_FUNC(ContextProjectionBackward, REGISTER_TYPED_FUNC(ContextProjectionBackward,
CPU, CPU,
ContextProjectionBackwardFunc); ContextProjectionBackwardFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(ContextProjectionForward, REGISTER_TYPED_FUNC(ContextProjectionForward,
GPU, GPU,
ContextProjectionForwardFunc); ContextProjectionForwardFunc);
......
...@@ -233,7 +233,7 @@ private: ...@@ -233,7 +233,7 @@ private:
REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc); REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc); REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc); REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc); REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
#endif #endif
......
...@@ -169,7 +169,7 @@ private: ...@@ -169,7 +169,7 @@ private:
REGISTER_TYPED_FUNC(Crop, CPU, CropFunc); REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc); REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(Crop, GPU, CropFunc); REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc); REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
#endif #endif
......
...@@ -336,7 +336,7 @@ private: ...@@ -336,7 +336,7 @@ private:
REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc); REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc); REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc); REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc); REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
#endif #endif
......
...@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput, ...@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
REGISTER_TYPED_FUNC(DepthwiseConvGradFilter, REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
CPU, CPU,
DepthwiseConvGradFilterFunction); DepthwiseConvGradFilterFunction);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction); REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradInput, REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
GPU, GPU,
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(DepthwiseConv, Forward) { TEST(DepthwiseConv, Forward) {
DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>( DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
"GemmConv-CPU", "DepthwiseConv-GPU", forward); "GemmConv-CPU", "DepthwiseConv-GPU", forward);
......
...@@ -340,7 +340,7 @@ public: ...@@ -340,7 +340,7 @@ public:
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction); REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction); REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction); REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction); REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
......
...@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) { ...@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) {
"NaiveConv-CPU", "GemmConv-CPU", forward); "NaiveConv-CPU", "GemmConv-CPU", forward);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(GemmConv, Forward) { TEST(GemmConv, Forward) {
Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>( Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
"GemmConv-CPU", "GemmConv-GPU", forward); "GemmConv-CPU", "GemmConv-GPU", forward);
......
...@@ -116,7 +116,7 @@ void TestIm2ColFunctor() { ...@@ -116,7 +116,7 @@ void TestIm2ColFunctor() {
TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); } TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); } TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
......
...@@ -341,7 +341,7 @@ private: ...@@ -341,7 +341,7 @@ private:
}; };
REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc); REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc); REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
#endif #endif
} // namespace paddle } // namespace paddle
...@@ -207,7 +207,7 @@ private: ...@@ -207,7 +207,7 @@ private:
REGISTER_TYPED_FUNC(Pad, CPU, PadFunc); REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc); REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(Pad, GPU, PadFunc); REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc); REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
#endif #endif
......
...@@ -217,7 +217,7 @@ public: ...@@ -217,7 +217,7 @@ public:
REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
#endif #endif
......
...@@ -132,7 +132,7 @@ public: ...@@ -132,7 +132,7 @@ public:
REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc); REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc); REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc); REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc); REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
#endif #endif
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "BatchNormalizationLayer.h" #include "BatchNormalizationLayer.h"
#include "Layer.h" #include "Layer.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include "CudnnBatchNormLayer.h" #include "CudnnBatchNormLayer.h"
#endif #endif
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include "hl_batch_transpose.h" #include "hl_batch_transpose.h"
#endif #endif
#include "BatchNormalizationLayer.h" #include "BatchNormalizationLayer.h"
...@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { ...@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
size_t batchSize = in->getHeight(); size_t batchSize = in->getHeight();
CHECK_EQ(out->getHeight(), batchSize * imgPixels_); CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
if (useGpu_) { if (useGpu_) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
LOG(FATAL) << "paddle is compiled only for cpu"; LOG(FATAL) << "paddle is compiled only for cpu";
#else #else
batchTranspose( batchTranspose(
...@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) { ...@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
} }
CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_)); CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
if (useGpu_) { if (useGpu_) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
LOG(FATAL) << "paddle is compiled only for cpu"; LOG(FATAL) << "paddle is compiled only for cpu";
#else #else
batchTranspose( batchTranspose(
......
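The `BatchNormalizationLayer` hunks show the mirrored case: code that must *fail fast* in a CPU-only build now tests `#ifndef PADDLE_WITH_CUDA`. A self-contained sketch of that fail-fast shape (plain `fprintf`/`abort` stand in for glog's `LOG(FATAL)`, and `gpuOnlyWork()` is a hypothetical placeholder for calls like `batchTranspose`):

```c++
#include <cstdio>
#include <cstdlib>

void gpuOnlyWork() { /* e.g. a batch transpose on the device */ }

void forwardMaybeOnGpu(bool useGpu) {
  if (useGpu) {
#ifndef PADDLE_WITH_CUDA
    // CPU-only build: reaching a GPU code path is a hard error.
    std::fprintf(stderr, "paddle is compiled only for cpu\n");
    std::abort();
#else
    gpuOnlyWork();
#endif
  }
}
```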
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include "PoolLayer.h" #include "PoolLayer.h"
#include "PoolProjectionLayer.h" #include "PoolProjectionLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include "CudnnPoolLayer.h" #include "CudnnPoolLayer.h"
#endif #endif
namespace paddle { namespace paddle {
...@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) { ...@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) {
const std::string& pool = config.inputs(0).pool_conf().pool_type(); const std::string& pool = config.inputs(0).pool_conf().pool_type();
if (pool == "max-projection" || pool == "avg-projection") { if (pool == "max-projection" || pool == "avg-projection") {
return new PoolProjectionLayer(config); return new PoolProjectionLayer(config);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
} else if (CudnnPoolLayer::typeCheck(pool)) { } else if (CudnnPoolLayer::typeCheck(pool)) {
return new CudnnPoolLayer(config); return new CudnnPoolLayer(config);
#endif #endif
......
...@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf, ...@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
bool useGpu, bool useGpu,
bool useWeight, bool useWeight,
float epsilon) { float epsilon) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) return; if (useGpu) return;
#endif #endif
FLAGS_use_gpu = useGpu; FLAGS_use_gpu = useGpu;
......
...@@ -119,7 +119,7 @@ TEST(Layer, batchNorm) { ...@@ -119,7 +119,7 @@ TEST(Layer, batchNorm) {
CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576); CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
void batchNormInference(int n, int c, int h, int w) { void batchNormInference(int n, int c, int h, int w) {
MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w); MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w); MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
......
...@@ -117,7 +117,7 @@ MatrixPtr doOneConvTest(size_t imgSize, ...@@ -117,7 +117,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
} }
TEST(Layer, convParaUnified) { TEST(Layer, convParaUnified) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
MatrixPtr input, resultCpu, resultGpu; MatrixPtr input, resultCpu, resultGpu;
/// TEST1 for conv /// /// TEST1 for conv ///
......
...@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) { ...@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) {
useGpu, useGpu,
result2); result2);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
// GPU case 1. // GPU case 1.
useGpu = true; useGpu = true;
inputLoc = Matrix::create(1, 16, false, useGpu); inputLoc = Matrix::create(1, 16, false, useGpu);
......
...@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf, ...@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
string testEvaluatorName, string testEvaluatorName,
size_t batchSize, size_t batchSize,
bool useGpu) { bool useGpu) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) return; if (useGpu) return;
#endif #endif
FLAGS_use_gpu = useGpu; FLAGS_use_gpu = useGpu;
......
...@@ -97,7 +97,7 @@ TEST(Layer, kmaxSeqScoreLayer) { ...@@ -97,7 +97,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
Matrix::create(subSeqStartPosition.back(), 1, false, false); Matrix::create(subSeqStartPosition.back(), 1, false, false);
std::vector<bool> mode = {false}; std::vector<bool> mode = {false};
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
mode.push_back(true); mode.push_back(true);
#endif #endif
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include <cudnn.h> #include <cudnn.h>
#endif #endif
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -258,7 +258,7 @@ void testProjectionConv(size_t groups, bool isDeconv) { ...@@ -258,7 +258,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
true); true);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(Projection, conv) { TEST(Projection, conv) {
/// test ConvProjection /// test ConvProjection
testProjectionConv(1, false); testProjectionConv(1, false);
...@@ -422,7 +422,7 @@ TEST(Layer, depthwiseConvLayer) { ...@@ -422,7 +422,7 @@ TEST(Layer, depthwiseConvLayer) {
// 'depthwise_conv' is a special case of 'exconv' whose // 'depthwise_conv' is a special case of 'exconv' whose
// groups size equals to the input channels size. // groups size equals to the input channels size.
testDepthwiseConvLayer("exconv", /* useGpu= */ false); testDepthwiseConvLayer("exconv", /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testDepthwiseConvLayer("exconv", /* useGpu= */ true); testDepthwiseConvLayer("exconv", /* useGpu= */ true);
#endif #endif
} }
...@@ -480,7 +480,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -480,7 +480,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
TEST(Layer, convLayer) { TEST(Layer, convLayer) {
testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false); testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true); testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true); testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
#endif #endif
...@@ -525,7 +525,7 @@ TEST(Layer, convTransLayer) { ...@@ -525,7 +525,7 @@ TEST(Layer, convTransLayer) {
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -638,7 +638,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) { ...@@ -638,7 +638,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
/* trans= */ false, /* trans= */ false,
/* useGup= */ false, /* useGup= */ false,
false); false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testLayerGrad(config, testLayerGrad(config,
"selective_fc", "selective_fc",
100, 100,
...@@ -1210,7 +1210,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) { ...@@ -1210,7 +1210,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
testLayerGrad(config, "pool", 100, trans, useGpu); testLayerGrad(config, "pool", 100, trans, useGpu);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
TestConfig config; TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
...@@ -1236,7 +1236,7 @@ TEST(Layer, PoolLayer) { ...@@ -1236,7 +1236,7 @@ TEST(Layer, PoolLayer) {
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
...@@ -1309,7 +1309,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) { ...@@ -1309,7 +1309,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
TEST(Layer, Pool3DLayer) { TEST(Layer, Pool3DLayer) {
testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false); testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false); testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true); testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true); testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
#endif #endif
...@@ -1695,7 +1695,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { ...@@ -1695,7 +1695,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
TEST(Layer, BatchNormalizationLayer) { TEST(Layer, BatchNormalizationLayer) {
testBatchNormLayer("batch_norm", false, false); testBatchNormLayer("batch_norm", false, false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testBatchNormLayer("batch_norm", false, true); testBatchNormLayer("batch_norm", false, true);
if (hl_get_cudnn_lib_version() >= int(4000)) { if (hl_get_cudnn_lib_version() >= int(4000)) {
testBatchNormLayer("cudnn_batch_norm", false, true); testBatchNormLayer("cudnn_batch_norm", false, true);
...@@ -1744,7 +1744,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) { ...@@ -1744,7 +1744,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
TEST(Layer, testBatchNorm3DLayer) { TEST(Layer, testBatchNorm3DLayer) {
testBatchNorm3DLayer("batch_norm", false, false); testBatchNorm3DLayer("batch_norm", false, false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testBatchNorm3DLayer("batch_norm", false, true); testBatchNorm3DLayer("batch_norm", false, true);
if (hl_get_cudnn_lib_version() >= int(4000)) { if (hl_get_cudnn_lib_version() >= int(4000)) {
testBatchNorm3DLayer("cudnn_batch_norm", false, true); testBatchNorm3DLayer("cudnn_batch_norm", false, true);
...@@ -2262,7 +2262,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -2262,7 +2262,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) {
TEST(Layer, test3DConvLayer) { TEST(Layer, test3DConvLayer) {
test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false); test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true); test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -2339,7 +2339,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -2339,7 +2339,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
TEST(Layer, test3DDeConvLayer) { TEST(Layer, test3DDeConvLayer) {
test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false); test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true); test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
......
...@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) { ...@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) {
compareNetwork(config_file_a, config_file_b); compareNetwork(config_file_a, config_file_b);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(Compare, img_pool) { TEST(Compare, img_pool) {
std::string config_file_a = "./gserver/tests/img_pool_a.conf"; std::string config_file_a = "./gserver/tests/img_pool_a.conf";
std::string config_file_b = "./gserver/tests/img_pool_b.conf"; std::string config_file_b = "./gserver/tests/img_pool_b.conf";
......
...@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) { ...@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
useGpu, useGpu,
result); result);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
// reset the input parameters // reset the input parameters
variance[1] = 0.1; variance[1] = 0.1;
variance[3] = 0.2; variance[3] = 0.2;
......
...@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) { ...@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) {
// Currently in async mode, useGpu is not supported // Currently in async mode, useGpu is not supported
continue; continue;
} }
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
continue; continue;
} }
...@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) { ...@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) {
for (int numConstantSlots : {1, 2}) { for (int numConstantSlots : {1, 2}) {
for (int useGpu : numTwoArray) { for (int useGpu : numTwoArray) {
for (int dataCompression : numTwoArray) { for (int dataCompression : numTwoArray) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
continue; continue;
} }
...@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) { ...@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) {
// Currently in async mode, useGpu is not supported // Currently in async mode, useGpu is not supported
continue; continue;
} }
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
continue; continue;
} }
......
...@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) { ...@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
config.clear_files(); config.clear_files();
std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
config.set_files(dataFile); config.set_files(dataFile);
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
bool useGpu = false; bool useGpu = false;
#else #else
bool useGpu = true; bool useGpu = true;
...@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) { ...@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList"; std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
config.set_files(dataFile); config.set_files(dataFile);
EXPECT_EQ(config.IsInitialized(), true); EXPECT_EQ(config.IsInitialized(), true);
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
bool useGpu = false; bool useGpu = false;
#else #else
bool useGpu = true; bool useGpu = true;
......
...@@ -321,7 +321,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) { ...@@ -321,7 +321,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
"filelist=gserver/tests/SelectiveFcTest/dense_mul_list"; "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
break; break;
} }
...@@ -388,7 +388,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, ...@@ -388,7 +388,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
outMatSelfc->getWidth(), outMatSelfc->getWidth(),
outMatSelfc->getElementCnt())); outMatSelfc->getElementCnt()));
cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT); cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
} }
...@@ -418,7 +418,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config, ...@@ -418,7 +418,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
MatrixPtr cpuOutMatFc( MatrixPtr cpuOutMatFc(
new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth())); new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT); cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
if (useGpu) { if (useGpu) {
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
} }
...@@ -443,7 +443,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) { ...@@ -443,7 +443,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
selLayerConfig.set_size(fcLayerWidth); selLayerConfig.set_size(fcLayerWidth);
testSelectiveFcLayerTrainSparseMul(selLayerConfig, false); testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testSelectiveFcLayerTrainSparseMul(selLayerConfig, true); testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
#endif #endif
} }
......
...@@ -195,7 +195,7 @@ TEST(Layer, SeqSliceLayer) { ...@@ -195,7 +195,7 @@ TEST(Layer, SeqSliceLayer) {
vector<vector<real>> ends; vector<vector<real>> ends;
std::vector<bool> mode = {false}; std::vector<bool> mode = {false};
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
mode.push_back(true); mode.push_back(true);
#endif #endif
genSeqInfo(seqStartPos, subSeqStartPos); genSeqInfo(seqStartPos, subSeqStartPos);
......
...@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) { ...@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
for (auto batchSize : {1, 10, 32}) { for (auto batchSize : {1, 10, 32}) {
for (auto normByTimes : {false, true}) { for (auto normByTimes : {false, true}) {
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) continue; if (useGpu) continue;
#endif #endif
LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
......
...@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) { ...@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
} }
void GpuMatrix::selectRows(Matrix& table, IVector& ids) { void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
CHECK(dynamic_cast<GpuMatrix*>(&table)); CHECK(dynamic_cast<GpuMatrix*>(&table));
CHECK(table.useGpu()); CHECK(table.useGpu());
CHECK(ids.useGpu()); CHECK(ids.useGpu());
...@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) { ...@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
} }
void GpuMatrix::addToRows(Matrix& table, IVector& ids) { void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
CHECK(dynamic_cast<GpuMatrix*>(&table)); CHECK(dynamic_cast<GpuMatrix*>(&table));
CHECK(table.useGpu()); CHECK(table.useGpu());
CHECK(ids.useGpu()); CHECK(ids.useGpu());
...@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) { ...@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) {
} }
void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix types are not equal"; CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix types are not equal";
size_t numSamples = getHeight(); size_t numSamples = getHeight();
size_t beam = maxVal.getWidth(); size_t beam = maxVal.getWidth();
......
...@@ -99,7 +99,11 @@ public: ...@@ -99,7 +99,11 @@ public:
/** /**
* @brief clear local buffer. It only affects the auto-growth buffer. * @brief clear local buffer. It only affects the auto-growth buffer.
*/ */
inline void clear() { rowStore_.clear(); } inline void clear() {
// swap an empty vector with it to actually free the memory.
std::vector<real, AlignedAllocator<real, 32>> empty;
rowStore_.swap(empty);
}
/** /**
* @brief get current number of rows. * @brief get current number of rows.
......
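The `clear()` rewrite above leans on a standard C++ idiom: `std::vector::clear()` destroys the elements but keeps the allocation, so an auto-growth buffer never shrinks. Swapping with an empty temporary hands the old buffer to the temporary, which frees it on destruction. A standalone demonstration with a plain `std::vector<float>` (the real member uses an `AlignedAllocator`, which doesn't change the idiom):

```c++
#include <iostream>
#include <vector>

int main() {
  std::vector<float> rowStore(1 << 20, 1.0f);  // ~4 MB of row data

  rowStore.clear();
  std::cout << "after clear(): capacity = " << rowStore.capacity() << '\n';
  // Typically still ~1M elements: clear() keeps the allocation.

  std::vector<float>().swap(rowStore);  // swap with an empty temporary
  std::cout << "after swap():  capacity = " << rowStore.capacity() << '\n';
  // Now 0: the memory has actually been returned.
}
```

Since C++11, `rowStore.shrink_to_fit()` after `clear()` asks for the same effect, but that request is non-binding; the swap form guarantees the release.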
...@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() { ...@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
} }
void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix types are not equal"; CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix types are not equal";
size_t numSamples = getHeight(); size_t numSamples = getHeight();
size_t beam = maxVal.getWidth(); size_t beam = maxVal.getWidth();
......
...@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) { ...@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
template <class T> template <class T>
void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) { void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
hl_vector_select_from<T>(this->getData(), hl_vector_select_from<T>(this->getData(),
this->getSize(), this->getSize(),
src.getData(), src.getData(),
...@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src, ...@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
size_t size) size_t size)
: sync_(nullptr) { : sync_(nullptr) {
CHECK_LE(offset + size, static_cast<size_t>(src.getSize())); CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
SyncedFlag* flag = src.getSync(); SyncedFlag* flag = src.getSync();
if (*flag == DATA_AT_CPU) { if (*flag == DATA_AT_CPU) {
src.copyToGpu(); // will set synchronous data between CPU and GPU src.copyToGpu(); // will set synchronous data between CPU and GPU
...@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src, ...@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
auto cMemHandle = (src.getVector(false))->getMemoryHandle(); auto cMemHandle = (src.getVector(false))->getMemoryHandle();
cpuVectorT_ = std::make_shared<CpuVectorT<T>>( cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset); size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
auto gMemHandle = (src.getVector(true))->getMemoryHandle(); auto gMemHandle = (src.getVector(true))->getMemoryHandle();
gpuVectorT_ = std::make_shared<GpuVectorT<T>>( gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset); size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
......
...@@ -68,7 +68,7 @@ void testPoolAllocator() { ...@@ -68,7 +68,7 @@ void testPoolAllocator() {
TEST(Allocator, Pool) { TEST(Allocator, Pool) {
testPoolAllocator<CpuAllocator>(); testPoolAllocator<CpuAllocator>();
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
testPoolAllocator<GpuAllocator>(); testPoolAllocator<GpuAllocator>();
#endif #endif
} }
...@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) { ...@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
EXPECT_EQ(ptr1, ptr2); EXPECT_EQ(ptr1, ptr2);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(MemoryHandle, Gpu) { TEST(MemoryHandle, Gpu) {
int numGpu = hl_get_device_count(); int numGpu = hl_get_device_count();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
/** /**
* This test file uses autotest::AutoCompare and cmpWithoutArg to compare the * This test file uses autotest::AutoCompare and cmpWithoutArg to compare the
* implementation of CPU and GPU member functions in * implementation of CPU and GPU member functions in
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/math/Vector.h" #include "paddle/math/Vector.h"
......
...@@ -94,7 +94,7 @@ void testWrapper(F&& f) { ...@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
} }
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(ExecViaCpu, test1) { TEST(ExecViaCpu, test1) {
testWrapper(f); testWrapper(f);
testWrapper(&f); testWrapper(&f);
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/math/Matrix.h" #include "paddle/math/Matrix.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
/** /**
* This test file uses autotest::AutoCompare and cmpWithArg to compare the * This test file uses autotest::AutoCompare and cmpWithArg to compare the
* implementation of CPU and GPU member functions in Matrix.cpp. * implementation of CPU and GPU member functions in Matrix.cpp.
......
...@@ -47,7 +47,7 @@ struct MatrixPara { ...@@ -47,7 +47,7 @@ struct MatrixPara {
SparseFormat format; SparseFormat format;
}; };
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
void test_sparse_matrix_mul(MatrixPara paraA, void test_sparse_matrix_mul(MatrixPara paraA,
MatrixPara paraB, MatrixPara paraB,
MatrixPara paraC) { MatrixPara paraC) {
...@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { ...@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
matB->trimFrom(*mat); matB->trimFrom(*mat);
checkSMatrixEqual2(matA, matB); checkSMatrixEqual2(matA, matB);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>( GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true); height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
matC->trimFrom(*mat); matC->trimFrom(*mat);
...@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { ...@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
matB->trimFrom(*mat); matB->trimFrom(*mat);
checkSMatrixEqual2(matA, matB); checkSMatrixEqual2(matA, matB);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>( GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true); height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
matC->trimFrom(*mat); matC->trimFrom(*mat);
......
...@@ -270,7 +270,7 @@ TEST(Unary, BaseOp) { ...@@ -270,7 +270,7 @@ TEST(Unary, BaseOp) {
TestUnaryVectorT<CpuIVector, int> testCpuIVector( TestUnaryVectorT<CpuIVector, int> testCpuIVector(
testUnaryBaseOpInt<CpuIVector>); testUnaryBaseOpInt<CpuIVector>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>); TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>); TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
TestUnaryVectorT<GpuIVector, int> testGpuIVector( TestUnaryVectorT<GpuIVector, int> testGpuIVector(
...@@ -317,7 +317,7 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) { ...@@ -317,7 +317,7 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) {
TEST(Unary, MathOp) { TEST(Unary, MathOp) {
TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>); TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>); TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
#endif #endif
} }
...@@ -374,7 +374,7 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) { ...@@ -374,7 +374,7 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
TEST(Unary, CompareOp) { TEST(Unary, CompareOp) {
TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>); TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>); TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
#endif #endif
} }
...@@ -536,7 +536,7 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -536,7 +536,7 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
TEST(Binary, BaseOp) { TEST(Binary, BaseOp) {
TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>); TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>); TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
#endif #endif
} }
...@@ -710,7 +710,7 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -710,7 +710,7 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
TEST(Binary, MathOp) { TEST(Binary, MathOp) {
TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>); TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>); TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
#endif #endif
} }
...@@ -810,7 +810,7 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) { ...@@ -810,7 +810,7 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
TEST(Binary, CompareOp) { TEST(Binary, CompareOp) {
TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>); TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>); TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
#endif #endif
} }
...@@ -955,7 +955,7 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -955,7 +955,7 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TEST(Ternary, BaseOp) { TEST(Ternary, BaseOp) {
TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>); TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>); TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
#endif #endif
} }
...@@ -1058,7 +1058,7 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) { ...@@ -1058,7 +1058,7 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
TEST(Ternary, CompareOp) { TEST(Ternary, CompareOp) {
TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>); TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>); TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
#endif #endif
} }
...@@ -1086,7 +1086,7 @@ void testQuaternaryAdd( ...@@ -1086,7 +1086,7 @@ void testQuaternaryAdd(
TEST(Quaternary, BaseOp) { TEST(Quaternary, BaseOp) {
TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>); TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>); TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
#endif #endif
} }
...@@ -1156,7 +1156,7 @@ void testQuaternaryCompareOp( ...@@ -1156,7 +1156,7 @@ void testQuaternaryCompareOp(
TEST(Quaternary, CompareOp) { TEST(Quaternary, CompareOp) {
TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>); TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>); TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
#endif #endif
} }
...@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) { ...@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc; typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
void testCase(testMatrixFunc matrixFunc) { void testCase(testMatrixFunc matrixFunc) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
#else #else
for (auto useGpu : {false}) { for (auto useGpu : {false}) {
......
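`testCase` shows the pattern most of these unit tests now share: the set of devices to iterate over is fixed at compile time, so GPU coverage drops out of a CPU-only build wholesale instead of via per-call `if (useGpu) return;` checks. A compilable sketch (note the loop's opening brace lives inside both branches of the conditional):

```c++
#include <iostream>

void runMatrixTest(int size, bool useGpu) {
  std::cout << "size=" << size << " useGpu=" << useGpu << '\n';
}

void testCase() {
#ifdef PADDLE_WITH_CUDA
  for (auto useGpu : {false, true}) {
#else
  for (auto useGpu : {false}) {
#endif
    runMatrixTest(1024, useGpu);
  }
}

int main() { testCase(); }
```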
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(MatrixBatchTransTest, test_batch_matrix_transpose) { TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
const int nx = 100; const int nx = 100;
const int ny = 50; const int ny = 50;
......
...@@ -72,7 +72,7 @@ void testLazyAssign(int height, int width) { ...@@ -72,7 +72,7 @@ void testLazyAssign(int height, int width) {
TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); } TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); } TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
#endif #endif
...@@ -142,6 +142,6 @@ void testSgdUpdate(int height, int width) { ...@@ -142,6 +142,6 @@ void testSgdUpdate(int height, int width) {
TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); } TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); } TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
/// This unittest checks that GpuMatrix/CpuMatrix get the same result, so it /// This unittest checks that GpuMatrix/CpuMatrix get the same result, so it
/// is disabled in the CPU-only build. /// is disabled in the CPU-only build.
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
/// This unittest checks that GpuSparseMatrix/CpuSparseMatrix get the same /// This unittest checks that GpuSparseMatrix/CpuSparseMatrix get the same
/// result, so it is disabled in the CPU-only build. /// result, so it is disabled in the CPU-only build.
......
...@@ -175,7 +175,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { ...@@ -175,7 +175,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
} }
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) { if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) { if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation. // Compute the maximum allocation size for the first allocation.
......
...@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
bool CPUAllocator::UseGpu() const { return false; } bool CPUAllocator::UseGpu() const { return false; }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
void* GPUAllocator::Alloc(size_t& index, size_t size) { void* GPUAllocator::Alloc(size_t& index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
......
...@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator { ...@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator {
virtual bool UseGpu() const; virtual bool UseGpu() const;
}; };
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
class GPUAllocator : public SystemAllocator { class GPUAllocator : public SystemAllocator {
public: public:
virtual void* Alloc(size_t& index, size_t size); virtual void* Alloc(size_t& index, size_t size);
......
...@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) { ...@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
TestAllocator(a, 0); TestAllocator(a, 0);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a; paddle::memory::detail::GPUAllocator a;
TestAllocator(a, 2048); TestAllocator(a, 2048);
......
...@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <> template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place, void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst, void* dst,
......
...@@ -33,7 +33,7 @@ namespace memory { ...@@ -33,7 +33,7 @@ namespace memory {
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
/** /**
* \brief Copy memory from one place to another place. * \brief Copy memory from one place to another place.
......
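`memory::Copy` dispatches on empty "place" tag types, which is what lets the host↔device overloads live entirely behind the CUDA guard while call sites stay uniform. A reduced sketch of that tag-dispatch shape, with stub place types and the device direction left as a comment (the real specializations call `cudaMemcpy` with the matching direction flag):

```c++
#include <cstddef>
#include <cstring>

struct CPUPlace {};
struct GPUPlace { int device = 0; };

// Generic declaration; each concrete (dst, src) pair is specialized below.
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, std::size_t num);

template <>
void Copy<CPUPlace, CPUPlace>(CPUPlace, void* dst, CPUPlace,
                              const void* src, std::size_t num) {
  std::memcpy(dst, src, num);  // host-to-host is a plain memcpy
}

#ifdef PADDLE_WITH_CUDA
template <>
void Copy<CPUPlace, GPUPlace>(CPUPlace, void* dst, GPUPlace,
                              const void* src, std::size_t num) {
  // cudaMemcpy(dst, src, num, cudaMemcpyDeviceToHost);
}
#endif
```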
...@@ -62,7 +62,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) { ...@@ -62,7 +62,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
return GetCPUBuddyAllocator()->Used(); return GetCPUBuddyAllocator()->Used();
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
using BuddyAllocVec = std::vector<BuddyAllocator*>; using BuddyAllocVec = std::vector<BuddyAllocator*>;
...@@ -77,7 +77,7 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { ...@@ -77,7 +77,7 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
// GPU buddy allocator initialization // GPU buddy allocator initialization
std::call_once(gpu_allocator_flag, [&]() { std::call_once(gpu_allocator_flag, [&]() {
int gpu_num = platform::GetDeviceCount(); int gpu_num = platform::GetCUDADeviceCount();
allocators.reserve(gpu_num); allocators.reserve(gpu_num);
for (int gpu = 0; gpu < gpu_num; gpu++) { for (int gpu = 0; gpu < gpu_num; gpu++) {
platform::SetDeviceId(gpu); platform::SetDeviceId(gpu);
......
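`GetGPUBuddyAllocator` is the usual recipe for lazily building a family of per-device singletons: one `std::once_flag` guards construction of the whole vector, after which lookups are lock-free. A compilable sketch with a stub allocator and a hard-coded device count (the real code queries it via `platform::GetCUDADeviceCount()`, as the hunk above renames it):

```c++
#include <mutex>
#include <vector>

struct BuddyAllocator { /* per-device pool state */ };

BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  static std::vector<BuddyAllocator*> allocators;
  static std::once_flag gpu_allocator_flag;

  // The first caller (from any thread) builds one allocator per device;
  // every later call just indexes the vector.
  std::call_once(gpu_allocator_flag, []() {
    const int gpu_num = 4;  // hypothetical; the real code asks the CUDA driver
    allocators.reserve(gpu_num);
    for (int gpu = 0; gpu < gpu_num; gpu++) {
      allocators.push_back(new BuddyAllocator());  // real code sets the device first
    }
  });
  return allocators.at(gpu_id);
}
```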
...@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { ...@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
} }
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
size_t align(size_t size, paddle::platform::GPUPlace place) { size_t align(size_t size, paddle::platform::GPUPlace place) {
size += sizeof(paddle::memory::detail::Metadata); size += sizeof(paddle::memory::detail::Metadata);
......
...@@ -55,6 +55,12 @@ function(op_library TARGET) ...@@ -55,6 +55,12 @@ function(op_library TARGET)
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1)
# It's enough to just add one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif()
# activation_op contains several operators # activation_op contains several operators
if ("${TARGET}" STREQUAL "activation_op") if ("${TARGET}" STREQUAL "activation_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -97,12 +103,16 @@ set(DEPS_OPS ...@@ -97,12 +103,16 @@ set(DEPS_OPS
recurrent_op recurrent_op
cond_op cond_op
cross_entropy_op cross_entropy_op
softmax_with_cross_entropy_op) softmax_with_cross_entropy_op
sum_op)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS framework_proto tensor net_op) DEPS framework_proto tensor net_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(sum_op DEPS net_op)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
......
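The `file(APPEND ${pybind_file} ...)` line works because `USE_OP` is the standard force-link trick: operator registration happens through a static object in the op's `.cc` file, which the linker would happily drop as unreferenced, so `USE_OP` references an anchor symbol from that file to keep the registrar alive. A sketch of the mechanism (names and expansion are illustrative, not copied from the real macros):

```c++
// --- In the op library (e.g. pool_op.cc) ---
struct OpRegistrar {
  explicit OpRegistrar(const char* name) { /* add op to a global registry */ }
};
static OpRegistrar pool2d_registrar("pool2d");
int TouchOpRegistrar_pool2d() { return 0; }  // anchor symbol

// --- In the generated pybind source, USE_OP(pool2d) expands to roughly ---
extern int TouchOpRegistrar_pool2d();
static int use_pool2d = TouchOpRegistrar_pool2d();
// Referencing the anchor forces the linker to keep pool_op's object file,
// and with it the static registrar above.
```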
...@@ -69,6 +69,22 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -69,6 +69,22 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
} }
}; };
template <typename AttrType>
class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LeakyReluOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of LeakyRelu operator");
AddOutput("Y", "Output of LeakyRelu operator");
AddComment(
"LeakyRelu activation operator, "
"leaky_relu = max(x, alpha * x)");
AddAttr<AttrType>("alpha", "The small negative slope")
.SetDefault(static_cast<AttrType>(0.02f));
}
};
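The comment in `LeakyReluOpMaker` pins down the math: `leaky_relu(x) = max(x, alpha * x)` with `0 < alpha < 1` passes positive inputs through and scales negative ones by `alpha`, so the derivative is `1` for `x > 0` and `alpha` otherwise. A scalar reference, independent of the Eigen-based kernels:

```c++
#include <algorithm>

// alpha defaults to 0.02f, matching the op's SetDefault above.
float leaky_relu(float x, float alpha = 0.02f) {
  return std::max(x, alpha * x);
}

float leaky_relu_grad(float x, float alpha = 0.02f) {
  return x > 0.0f ? 1.0f : alpha;  // slope of max(x, alpha*x) when alpha < 1
}
```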
class TanhOpMaker : public framework::OpProtoAndCheckerMaker { class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
...@@ -81,6 +97,17 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -81,6 +97,17 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
} }
}; };
class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
public:
TanhShrinkOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of TanhShrink operator");
AddOutput("Y", "Output of TanhShrink operator");
AddComment("TanhShrink activation operator, tanhshrink(x) = x - tanh(x)");
}
};
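Likewise `tanhshrink(x) = x - tanh(x)`, and since `d/dx tanh(x) = 1 - tanh²(x)`, the gradient collapses to `tanh²(x)`. A scalar reference:

```c++
#include <cmath>

float tanh_shrink(float x) { return x - std::tanh(x); }

// d/dx (x - tanh(x)) = 1 - (1 - tanh^2(x)) = tanh^2(x)
float tanh_shrink_grad(float x) {
  const float t = std::tanh(x);
  return t * t;
}
```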
class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
...@@ -206,120 +233,63 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -206,120 +233,63 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad, REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(sigmoid,
ops::ActivationKernel<paddle::platform::CPUPlace, float,
ops::SigmoidFunctor<float>>);
REGISTER_OP_CPU_KERNEL(
sigmoid_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::SigmoidGradFunctor<float>>);
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
exp,
ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::ExpFunctor>);
REGISTER_OP_CPU_KERNEL(exp_grad,
ops::ActivationGradKernel<paddle::platform::CPUPlace,
float, ops::ExpGradFunctor>);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(relu,
ops::ActivationKernel<paddle::platform::CPUPlace, float,
ops::ReluFunctor<float>>);
REGISTER_OP_CPU_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::ReluGradFunctor<float>>);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
tanh, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::TanhFunctor>); tanh_shrink_grad, ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
tanh_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::TanhGradFunctor<float>>);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
sqrt,
ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::SqrtFunctor>);
REGISTER_OP_CPU_KERNEL(
sqrt_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::SqrtGradFunctor<float>>);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
abs,
ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::AbsFunctor>);
REGISTER_OP_CPU_KERNEL(abs_grad,
ops::ActivationGradKernel<paddle::platform::CPUPlace,
float, ops::AbsGradFunctor>);
REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker, REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
reciprocal_grad, ops::ActivationOpGrad); reciprocal_grad, ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(reciprocal,
ops::ActivationKernel<paddle::platform::CPUPlace, float,
ops::ReciprocalFunctor<float>>);
REGISTER_OP_CPU_KERNEL(
reciprocal_grad,
ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::ReciprocalGradFunctor<float>>);
REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad, REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(
log,
ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::LogFunctor>);
REGISTER_OP_CPU_KERNEL(
log_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::LogGradFunctor<float>>);
REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad, REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(square,
ops::ActivationKernel<paddle::platform::CPUPlace, float,
ops::SquareFunctor>);
REGISTER_OP_CPU_KERNEL(
square_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::SquareGradFunctor<float>>);
REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad, REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(softsign,
ops::ActivationKernel<paddle::platform::CPUPlace, float,
ops::SoftsignFunctor<float>>);
REGISTER_OP_CPU_KERNEL(
softsign_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
ops::SoftsignGradFunctor<float>>);
REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad, REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(brelu,
ops::BReluKernel<paddle::platform::CPUPlace, float>); REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
REGISTER_OP_CPU_KERNEL(brelu_grad, leaky_relu_grad, ops::ActivationOpGrad);
ops::BReluGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>, REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
soft_relu_grad, ops::ActivationOpGrad); soft_relu_grad, ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(soft_relu,
ops::SoftReluKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad, REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(pow_grad,
ops::PowGradKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad, REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP_CPU_KERNEL(stanh,
                       ops::STanhKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(stanh_grad,
                       ops::STanhGradKernel<paddle::platform::CPUPlace, float>);

#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)       \
  REGISTER_OP_CPU_KERNEL(                                                     \
      act_type,                                                               \
      paddle::operators::ActivationKernel<paddle::platform::CPUPlace,         \
                                          paddle::operators::functor<float>>); \
  REGISTER_OP_CPU_KERNEL(act_type##_grad,                                     \
                         paddle::operators::ActivationGradKernel<             \
                             paddle::platform::CPUPlace,                      \
                             paddle::operators::grad_functor<float>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
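Together, the functor-templated kernel and this registration macro collapse what used to be two hand-written `REGISTER_OP_CPU_KERNEL` calls per activation into one list entry. As an illustration (hand-expanded here, not text from the patch), the `sigmoid` row of `FOR_EACH_KERNEL_FUNCTOR` expands to the same two registrations that were previously written out by hand:

```c++
// Hand expansion of REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, SigmoidFunctor,
// SigmoidGradFunctor); the token paste act_type##_grad yields sigmoid_grad.
REGISTER_OP_CPU_KERNEL(
    sigmoid,
    paddle::operators::ActivationKernel<
        paddle::platform::CPUPlace, paddle::operators::SigmoidFunctor<float>>);
REGISTER_OP_CPU_KERNEL(
    sigmoid_grad,
    paddle::operators::ActivationGradKernel<
        paddle::platform::CPUPlace,
        paddle::operators::SigmoidGradFunctor<float>>);
```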
...@@ -15,93 +15,14 @@
#define EIGEN_USE_GPU
#include "paddle/operators/activation_op.h"
#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)       \
  REGISTER_OP_GPU_KERNEL(                                                     \
      act_type,                                                               \
      paddle::operators::ActivationKernel<paddle::platform::GPUPlace,         \
                                          paddle::operators::functor<float>>); \
  REGISTER_OP_GPU_KERNEL(act_type##_grad,                                     \
                         paddle::operators::ActivationGradKernel<             \
                             paddle::platform::GPUPlace,                      \
                             paddle::operators::grad_functor<float>>);

FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);

namespace ops = paddle::operators;

REGISTER_OP_GPU_KERNEL(sigmoid,
                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
                                             ops::SigmoidFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
                                            ops::SigmoidGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
    exp,
    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::ExpFunctor>);
REGISTER_OP_GPU_KERNEL(exp_grad,
ops::ActivationGradKernel<paddle::platform::GPUPlace,
float, ops::ExpGradFunctor>);
REGISTER_OP_GPU_KERNEL(relu,
ops::ActivationKernel<paddle::platform::GPUPlace, float,
ops::ReluFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
relu_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::ReluGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
tanh,
ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::TanhFunctor>);
REGISTER_OP_GPU_KERNEL(
tanh_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::TanhGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
sqrt,
ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::SqrtFunctor>);
REGISTER_OP_GPU_KERNEL(
sqrt_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::SqrtGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
abs,
ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::AbsFunctor>);
REGISTER_OP_GPU_KERNEL(abs_grad,
ops::ActivationGradKernel<paddle::platform::GPUPlace,
float, ops::AbsGradFunctor>);
REGISTER_OP_GPU_KERNEL(reciprocal,
ops::ActivationKernel<paddle::platform::GPUPlace, float,
ops::ReciprocalFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
reciprocal_grad,
ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::ReciprocalGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
log,
ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::LogFunctor>);
REGISTER_OP_GPU_KERNEL(
log_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::LogGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(square,
ops::ActivationKernel<paddle::platform::GPUPlace, float,
ops::SquareFunctor>);
REGISTER_OP_GPU_KERNEL(
square_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::SquareGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(softsign,
ops::ActivationKernel<paddle::platform::GPUPlace, float,
ops::SoftsignFunctor<float>>);
REGISTER_OP_GPU_KERNEL(
softsign_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
ops::SoftsignGradFunctor<float>>);
REGISTER_OP_GPU_KERNEL(brelu,
ops::BReluKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(brelu_grad,
ops::BReluGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(soft_relu,
ops::SoftReluKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(pow_grad,
ops::PowGradKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(stanh,
ops::STanhKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(stanh_grad,
ops::STanhGradKernel<paddle::platform::GPUPlace, float>);
...@@ -19,9 +19,12 @@
namespace paddle {
namespace operators {
template <typename Place, typename Functor>
class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Output<framework::Tensor>("Y");
...@@ -31,13 +34,20 @@ class ActivationKernel : public framework::OpKernel<T> {
    auto y = framework::EigenVector<T>::Flatten(*Y);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;

    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
    functor(place, x, y);
  }
};
template <typename Place, typename Functor>
class ActivationGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Input<framework::Tensor>("Y");
...@@ -51,159 +61,228 @@ class ActivationGradKernel : public framework::OpKernel<T> {
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;

    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
    functor(place, x, y, dy, dx);
  }
};

template <typename T>
struct BaseActivationFunctor {
  using ELEMENT_TYPE = T;

  using AttrPair = std::vector<std::pair<const char*, float*>>;

  AttrPair GetAttrs() { return AttrPair(); }
};
// sigmoid(x) = 1 / (1 + exp(-x))
template <typename T>
struct SigmoidFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
  }
};

template <typename T>
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * y * (static_cast<T>(1) - y);
  }
};

// exp(x) = e^x
template <typename T>
struct ExpFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.exp();
  }
};

template <typename T>
struct ExpGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * y;
  }
};

// relu(x) = max(x, 0)
template <typename T>
struct ReluFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(static_cast<T>(0));
  }
};

template <typename T>
struct ReluGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
  }
};

// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.tanh();
  }
};

template <typename T>
struct TanhGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (static_cast<T>(1) - y * y);
  }
};

// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x - x.tanh();
  }
};

template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (x.tanh() * x.tanh());
  }
};

// sqrt(x) = x^(1/2)
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.sqrt();
  }
};

template <typename T>
struct SqrtGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    const Y y_conj = Eigen::numext::conj(y);
    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
  }
};

// abs(x) = |x|
template <typename T>
struct AbsFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.abs();
  }
};

template <typename T>
struct AbsGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * x.sign();
  }
};

// reciprocal(x) = 1 / x
template <typename T>
struct ReciprocalFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = static_cast<T>(1) / x;
  }
};

template <typename T>
struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * static_cast<T>(-1) * y * y;
  }
};

// log(x) = natural logarithm of x
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.log();
  }
};

template <typename T>
struct LogGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (static_cast<T>(1) / x);
  }
};

// square(x) = x^2
template <typename T>
struct SquareFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.square();
  }
};

template <typename T>
struct SquareGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * static_cast<T>(2) * x;
  }
};

template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;

  // NOTE: GetAttrs here intentionally hides
  // `BaseActivationFunctor<T>::GetAttrs` (name hiding, not virtual
  // polymorphism) to avoid dispatch overhead.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"t_min", &t_min}, {"t_max", &t_max}};
  }

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
  }
};

template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"t_min", &t_min}, {"t_max", &t_max}};
  }

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
  }
};

// softsign(x) = x / (1 + |x|)
template <typename T>
struct SoftsignFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) {
    y.device(d) = x / (static_cast<T>(1) + x.abs());
...@@ -213,7 +292,7 @@ struct SoftsignFunctor {
// d(softsign(x))/dx = 1 / (1 + |x|)^2
// Taken from https://en.wikipedia.org/wiki/Activation_function
template <typename T>
struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) {
    dx.device(d) =
...@@ -221,153 +300,130 @@ struct SoftsignGradFunctor {
  }
};
template <typename T>
struct SoftReluFunctor : public BaseActivationFunctor<T> {
  float threshold;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
  }
};

template <typename T>
struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  float threshold;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
  }
};

template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(alpha * x);
  }
};

template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
  }
};

template <typename T>
struct PowFunctor : public BaseActivationFunctor<T> {
  float factor;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"factor", &factor}};
  }
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.pow(factor);
  }
};

template <typename T>
struct PowGradFunctor : public BaseActivationFunctor<T> {
  float factor;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"factor", &factor}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
  }
};

template <typename T>
struct STanhFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
  }

  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = scale_b * (scale_a * x).tanh();
  }
};

template <typename T>
struct STanhGradFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
  }

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
  }
};

} // namespace operators
} // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(exp, ExpFunctor, ExpGradFunctor); \
__macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \
__macro(square, SquareFunctor, SquareGradFunctor); \
__macro(brelu, BReluFunctor, BReluGradFunctor); \
__macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor); \
__macro(pow, PowFunctor, PowGradFunctor); \
__macro(stanh, STanhFunctor, STanhGradFunctor); \
__macro(softsign, SoftsignFunctor, SoftsignGradFunctor); \
__macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor); \
__macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor)
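With this list in place, wiring up a new activation is mostly mechanical: define a forward/backward functor pair in this header and append one row; the CPU and GPU files then pick it up through their REGISTER_ACTIVATION_*_KERNEL macros. A hedged sketch with a hypothetical `my_act` entry:

```c++
// Hypothetical extension (not part of the patch): assuming MyActFunctor and
// MyActGradFunctor are defined above, one extra row registers my_act and
// my_act_grad on every device that expands FOR_EACH_KERNEL_FUNCTOR.
#define FOR_EACH_KERNEL_FUNCTOR(__macro)                 \
  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
  /* ... the existing rows ... */                        \
  __macro(my_act, MyActFunctor, MyActGradFunctor)
```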
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/adadelta_op.h"
namespace paddle {
namespace operators {
class AdadeltaOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
"Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
"Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("AvgSquaredGradOut"),
"Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("AvgSquaredUpdateOut"),
"Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"param and grad input of AdadeltaOp should have same dimension");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
"Param and AvgSquaredGrad input of AdadeltaOp "
"should have same dimension");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
"Param and AvgSquaredUpdate input of AdadeltaOp "
"should have same dimension");
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
}
};
class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AdadeltaOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("AvgSquaredGrad",
"(Tensor) Input expectation of squared gradient");
AddInput("AvgSquaredUpdate",
"(Tensor) Input expectation of squared parameter updates");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("AvgSquaredGradOut",
"(Tensor) Output expectation of squared gradient");
AddOutput("AvgSquaredUpdateOut",
"(Tensor) Output expectation of squared parameter updates");
AddAttr<float>("rho",
"(float, default 0.95) Exponential decay rate "
"for squared gradients.")
.SetDefault(0.95f);
AddAttr<float>("epsilon",
"(float, default 1.0e-6) Constant for "
"numerical stability")
.SetDefault(1.0e-6f);
AddComment(R"DOC(
Adadelta Updates Operator.
This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
adaptive learning rate method for gradient descent.
Adadelta updates:
avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
param_update = - sqrt((avg_squared_update + epsilon) /
(avg_squared_grad_out + epsilon)) * grad
avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
param_out = param + param_update
References:
[1] ADADELTA: An Adaptive Learning Rate Method
https://arxiv.org/abs/1212.5701
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
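The DOC block above states the update rule; the arithmetic is easy to verify on scalars. A standalone sketch (plain C++, using the operator's default `rho` and `epsilon`; not part of the operator code):

```c++
#include <cmath>
#include <cstdio>

// One Adadelta step on a single parameter, following the DOC equations.
int main() {
  const float rho = 0.95f, epsilon = 1.0e-6f;
  float param = 1.0f, grad = 0.5f;
  float avg_sq_grad = 0.0f, avg_sq_update = 0.0f;

  float avg_sq_grad_out = rho * avg_sq_grad + (1 - rho) * grad * grad;
  float update = -std::sqrt((avg_sq_update + epsilon) /
                            (avg_sq_grad_out + epsilon)) *
                 grad;
  float avg_sq_update_out =
      rho * avg_sq_update + (1 - rho) * update * update;
  float param_out = param + update;

  // With zero-initialized accumulators the first step is tiny (~ -4.5e-3),
  // which is the expected warm-up behavior of Adadelta.
  std::printf("param: %f -> %f\n", param, param_out);
  return 0;
}
```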
...@@ -12,7 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/add_op.h"

#define EIGEN_USE_GPU
#include "paddle/operators/adadelta_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto avg_squared_grad_out_tensor =
ctx.Output<framework::Tensor>("AvgSquaredGradOut");
auto avg_squared_update_out_tensor =
ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
param_out_tensor->mutable_data<T>(ctx.GetPlace());
avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
float rho = ctx.Attr<float>("rho");
float epsilon = ctx.Attr<float>("epsilon");
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
auto grad = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Grad"));
// Squared gradient accumulator
auto avg_squared_grad = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("AvgSquaredGrad"));
// Squared updates accumulator
auto avg_squared_update = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
auto avg_squared_grad_out =
framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
auto avg_squared_update_out =
framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
auto place = ctx.GetEigenDevice<Place>();
avg_squared_grad_out.device(place) =
rho * avg_squared_grad + (1 - rho) * grad.square();
auto update =
-((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
.sqrt() *
grad;
avg_squared_update_out.device(place) =
rho * avg_squared_update + (1 - rho) * update.square();
param_out.device(place) = param + update;
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/adagrad_op.h"
namespace paddle {
namespace operators {
class AdagradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(ParamOut) of AdagradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(MomentOut) of AdagradOp should not be null.");
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"LearningRate should have one element");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdagradOp should have the same dimension.");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment"),
"Param and Moment input of AdagradOp should have the same dimension.");
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("MomentOut", param_dims);
}
};
class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AdagradOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("Moment", "(Tensor) Second moment");
AddInput("LearningRate", "(Tensor) Learning rate");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("MomentOut", "(Tensor) Output second moment");
AddAttr<float>("epsilon",
"(float, default 1.0e-6) "
"Constant for numerical stability")
.SetDefault(1.0e-6f);
AddComment(R"DOC(
Adaptive Gradient Algorithm (Adagrad).
moment_out = moment + grad * grad
param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
does not have the epsilon attribute. It is added here for numerical stability
by avoiding division by zero.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(adagrad,
ops::AdagradOpKernel<paddle::platform::CPUPlace, float>);
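Again, the DOC equations can be checked on scalars; note that `epsilon` only guards the division. A standalone sketch (plain C++, not part of the operator code):

```c++
#include <cmath>
#include <cstdio>

// One Adagrad step on a single parameter, following the DOC equations.
int main() {
  const float lr = 0.1f, epsilon = 1.0e-6f;
  float param = 1.0f, grad = 0.5f, moment = 0.0f;

  float moment_out = moment + grad * grad;  // 0.25
  float param_out =
      param - lr * grad / (std::sqrt(moment_out) + epsilon);  // ~0.9
  std::printf("param: %f -> %f\n", param, param_out);
  return 0;
}
```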
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/adagrad_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adagrad,
ops::AdagradOpKernel<paddle::platform::GPUPlace, float>);
...@@ -12,57 +12,44 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/add_op.h"

namespace paddle {
namespace operators {

class AddOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContextBase* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of AddOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of AddOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of AddOp should not be null.");

    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
    PADDLE_ENFORCE_EQ(x_dims, y_dims,
                      "Two input of Add Op's dimension must be same.");
    ctx->SetOutputDim("Out", x_dims);
  }
};

class AddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  AddOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of add op");
    AddInput("Y", "The second input of add op");
    AddOutput("Out", "The output of add op");
    AddComment(R"DOC(
Two Element Add Operator.

The equation is: Out = X + Y
)DOC");
  }
};

class AddOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContextBase* ctx) const override {}
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);

#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename Place, typename T>
class AdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");

    param_out_tensor->mutable_data<T>(ctx.GetPlace());
    moment_out_tensor->mutable_data<T>(ctx.GetPlace());

    float epsilon = ctx.Attr<float>("epsilon");

    auto param = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Param"));
    auto grad = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Grad"));
    auto moment = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("Moment"));
    auto lr = framework::EigenVector<T>::Flatten(
        *ctx.Input<framework::Tensor>("LearningRate"));

    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
    auto place = ctx.GetEigenDevice<Place>();

    moment_out.device(place) = moment + grad * grad;
    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
    param_out.device(place) =
        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
  }
};

} // namespace operators
} // namespace paddle
...@@ -14,12 +14,7 @@ limitations under the License. */
#include "paddle/operators/cond_op.h" #include "paddle/operators/cond_op.h"
#include <cstring>
#include <sstream>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/gather.h" #include "paddle/operators/gather.h"
#include "paddle/operators/net_op.h"
#include "paddle/operators/scatter.h" #include "paddle/operators/scatter.h"
namespace paddle { namespace paddle {
...@@ -31,175 +26,183 @@ using Tensor = framework::Tensor; ...@@ -31,175 +26,183 @@ using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using DDim = framework::DDim; using DDim = framework::DDim;
framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
  auto sub_scopes_var = scope.FindVar("SubScopes");
  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
                          "Output(SubScopes) of CondOp should not be null.");
  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
  auto& sub_scope = scope.NewScope();
  sub_scopes->push_back(&sub_scope);
  return sub_scope;
}

std::vector<framework::Scope*>& CondOp::GetSubScopes(
    const framework::Scope& scope) const {
  auto sub_scopes_var = scope.FindVar("SubScopes");
  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
                          "Output(SubScopes) of CondOp should not be null.");
  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
}

LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
  auto index_tensors_var = scope.FindVar("IndexTensors");
  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                          "Output(IndexTensors) of CondOp should not be null.");
  auto& index_tensors =
      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
  index_tensors.push_back(LoDTensor());
  return index_tensors.back();
}

std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
    const framework::Scope& scope) const {
  auto* index_tensors_var = scope.FindVar("IndexTensors");
  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                          "Output(IndexTensors) of CondOp should not be null.");
  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
}

void CondOp::PrepareDataForSubnet(
    const framework::Scope& scope,
    const platform::DeviceContext& dev_ctx) const {
  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");

  for (int i = 0; i < BRANCH_NUM; ++i) {
    // Create two sub scopes for true and false branches
    // sub_scopes[0] for the true branch
    // sub_scopes[1] for the false branch
    AddSubScope(scope);
    // Create two tensors for true and false indices:
    // index_tensors[0] for the true branch
    // index_tensors[1] for the false branch
    AddIndexTensor(scope);
  }

  Variable* cond_var = scope.FindVar(Input("Cond"));
  PADDLE_ENFORCE_NOT_NULL(cond_var,
                          "Input(Cond) of CondOp should not be null.");
  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();

  // get the true/false index at runtime according to cond tensor
  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
  std::vector<std::vector<int>> index_vectors;
  index_vectors.resize(BRANCH_NUM);

  const int* cond_data = cond->data<int>();
  for (int i = 0; i < cond->dims()[0]; ++i) {
    if (cond_data[i])
      index_vectors[TRUE_BRANCH].push_back(i);
    else
      index_vectors[FALSE_BRANCH].push_back(i);
  }

  // put index_vectors[0] and index_vectors[1] into two tensors:
  // index_tensors[0] and index_tensors[1]
  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);

  for (int i = 0; i < BRANCH_NUM; ++i) {
    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
    int* index_tensor_data_ptr =
        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
           dim[0] * sizeof(int));
  }

  // create input in subscopes according to index_vectors
  for (auto& input : Inputs("Xs")) {
    Variable* var_parent = scope.FindVar(input);
    PADDLE_ENFORCE_NOT_NULL(var_parent);
    const auto* tensor_parent = &var_parent->Get<LoDTensor>();

    for (int i = 0; i < BRANCH_NUM; ++i) {
      Variable* var_child = sub_scopes[i]->FindVar(input);
      PADDLE_ENFORCE_NOT_NULL(var_child);
      auto* tensor_child = var_child->GetMutable<LoDTensor>();

      // Resize child
      DDim dim = tensor_parent->dims();
      dim[0] = index_tensors[i].dims()[0];
      tensor_child->mutable_data<float>(dim, platform::CPUPlace());

      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i],
                       tensor_child);
    }
  }

  // create output_tensors in subscope for sub_net
  for (int i = 0; i < BRANCH_NUM; ++i) {
    for (auto& output : (*sub_net_op_[i]).Outputs()) {
      for (auto& var_name : output.second) {
        sub_scopes[i]->NewVar(var_name);
      }
    }
  }
}

void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
                                 const platform::DeviceContext& dev_ctx) const {
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
  const std::vector<framework::LoDTensor>& index_tensors =
      GetIndexTensors(scope);

  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
  PADDLE_ENFORCE(!Outputs("Outs").empty(),
                 "Outputs(Outs) of CondOp can't be empty.");
  for (auto& output : Outputs("Outs")) {
    const LoDTensor* tensor_t_out =
        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
    const LoDTensor* tensor_f_out =
        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");

    auto* var_out = scope.FindVar(output);
    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
                            "True output tensor should not be NULL");

    DDim true_dim = tensor_t_out->dims();
    DDim false_dim = tensor_f_out->dims();
    true_dim[0] = 0;
    false_dim[0] = 0;
    PADDLE_ENFORCE_EQ(true_dim, false_dim,
                      "Outputs not of the same shape except the first dim");

    DDim out_dim = tensor_t_out->dims();
    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
    tensor_out->Resize(out_dim);
    tensor_out->mutable_data<float>(platform::CPUPlace());
  }

  // merge output results:
  // output_tensor = true_output_tensor + false_output_tensor
  for (auto& output : Outputs("Outs")) {
    Variable* var_parent = scope.FindVar(output);
    PADDLE_ENFORCE_NOT_NULL(var_parent);
    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();

    for (int i = 0; i < BRANCH_NUM; ++i) {
      Variable* var_child = sub_scopes[i]->FindVar(output);
      PADDLE_ENFORCE_NOT_NULL(var_child);
      auto* tensor_child = &var_child->Get<LoDTensor>();
      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
                           tensor_parent);
    }
  }
}

void CondOp::Run(const Scope& scope,
                 const platform::DeviceContext& dev_ctx) const {
  PrepareDataForSubnet(scope, dev_ctx);
  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
  for (int i = 0; i < BRANCH_NUM; ++i) {
    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
  }
  MergeDataFromSubnet(scope, dev_ctx);
}
class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
 public:
  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
...
...@@ -40,8 +40,7 @@ class CondOp : public framework::OperatorBase {
         const framework::VariableNameMap& outputs,
         const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {
    sub_net_op_.resize(BRANCH_NUM);
  }

  CondOp(const CondOp& o)
...@@ -51,42 +50,44 @@ class CondOp : public framework::OperatorBase {
    PADDLE_THROW("Not implemented");
  }

  framework::Scope& AddSubScope(const framework::Scope& scope) const;
  std::vector<framework::Scope*>& GetSubScopes(
      const framework::Scope& scope) const;

  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
  std::vector<framework::LoDTensor>& GetIndexTensors(
      const framework::Scope& scope) const;

  void PrepareDataForSubnet(const framework::Scope& scope,
                            const platform::DeviceContext& dev_ctx) const;
  void MergeDataFromSubnet(const framework::Scope& scope,
                           const platform::DeviceContext& dev_ctx) const;

  /*
   * Set True Block
   */
  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
    sub_net_op_[TRUE_BRANCH] = std::move(net);
  }

  /*
   * Set False Block
   */
  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
    sub_net_op_[FALSE_BRANCH] = std::move(net);
  }

  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override;

 private:
  const int TRUE_BRANCH = 0;
  const int FALSE_BRANCH = 1;
  const int BRANCH_NUM = 2;

  // sub_net_op_[0]: subnet_t
  // sub_net_op_[1]: subnet_f
  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
};
} // namespace operators } // namespace operators
......
@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 1> {
    auto& cpu_place = boost::get<platform::CPUPlace>(place);
    memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
  } else {
#ifdef PADDLE_WITH_CUDA
    auto& gpu_place = boost::get<platform::GPUPlace>(place);
    auto& cuda_ctx =
        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace operators {
using framework::Tensor;
using platform::Place;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
size_t index_size, size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
int gather_i = indices[indices_i];
int params_i = gather_i * slice_size + slice_i;
*(output + i) = *(params + params_i);
}
}
/**
* A thin wrapper on gpu tensor
* Return a new tensor from source tensor, gathered according to index
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
int block = 512;
int n = slice_size * index_size;
int grid = (n + block - 1) / block;
GatherCUDAKernel<T><<<
grid, block, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
p_src, p_index, p_output, index_size, slice_size);
}
} // namespace operators
} // namespace paddle
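`GatherCUDAKernel` flattens the `(row, offset)` pair into one loop index and inverts it with a division and a remainder, while the grid-stride loop lets a fixed launch configuration cover any `n`. A small host-side check of that index arithmetic, run sequentially in plain C++ (a sketch of the kernel's math, not the kernel itself):

```c++
#include <cassert>
#include <vector>

int main() {
  const int slice_size = 4;  // elements per row
  std::vector<float> params(3 * slice_size);
  for (int i = 0; i < (int)params.size(); ++i) params[i] = i;
  std::vector<int> indices = {2, 0};  // gather rows 2 and 0
  std::vector<float> output(indices.size() * slice_size);

  // Same decomposition as GatherCUDAKernel, one flat index per element.
  int n = (int)indices.size() * slice_size;
  for (int i = 0; i < n; ++i) {
    int indices_i = i / slice_size;            // which output row
    int slice_i = i - indices_i * slice_size;  // offset inside the slice
    int params_i = indices[indices_i] * slice_size + slice_i;
    output[i] = params[params_i];
  }
  assert(output[0] == 8 && output[4] == 0);  // row 2 first, then row 0
  return 0;
}
```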
@@ -24,49 +24,40 @@ limitations under the License. */
namespace paddle {
namespace operators {

using framework::Tensor;

/**
 * A thin wrapper for gathering on cpu tensor
 * Return a new tensor from source tensor, gathered according to index
 * input[src]: type-T source Tensor
 * input[index]: type-int index Tensor (1-D)
 * return: output tensor
 */
template <typename T>
void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
               const Tensor& index, Tensor* output) {
  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
  // check index of shape 1-D
  PADDLE_ENFORCE(index.dims().size() == 1);
  int index_size = index.dims()[0];

  auto src_dims = src.dims();
  framework::DDim output_dims(src_dims);
  output_dims[0] = index_size;

  const T* p_src = src.data<T>();
  const int* p_index = index.data<int>();
  T* p_output = output->data<T>();

  // slice size
  int slice_size = 1;
  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
  const size_t slice_bytes = slice_size * sizeof(T);

  for (int i = 0; i < index_size; ++i) {
    int index_ = p_index[i];
    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
  }
}
......
@@ -31,6 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of GatherOp should not be null.");

    auto index_dims = ctx->GetInputDim("Index");
    PADDLE_ENFORCE(index_dims.size() == 1);
    int batch_size = ctx->GetInputDim("Index")[0];
    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
    framework::DDim output_dims(ctx->GetInputDim("X"));
@@ -79,8 +81,5 @@ Out = X[Index]
namespace ops = paddle::operators;
REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
            ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gather.cu.h"
#include "paddle/framework/eigen.h"
#include "paddle/operators/gather_op.h"
#include "scatter.cu.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *x = ctx.Input<Tensor>("X");
auto *index = ctx.Input<Tensor>("Index");
auto *output = ctx.Output<Tensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
GPUGather<T>(ctx.device_context(), *x, *index, output);
}
};
template <typename T>
class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *Index = ctx.Input<Tensor>("Index");
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *x = ctx.Input<Tensor>("X");
dX->mutable_data<T>(ctx.GetPlace());
auto dxt = framework::EigenVector<T>::Flatten(*dX);
auto place = ctx.GetEigenDevice<platform::GPUPlace>();
dxt.device(place) = dxt.constant(static_cast<T>(0));
GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
@@ -23,29 +23,40 @@ namespace operators {
using Tensor = framework::Tensor;

template <typename T>
class GatherOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");

    auto *x = ctx.Input<Tensor>("X");
    auto *index = ctx.Input<Tensor>("Index");
    auto *output = ctx.Output<Tensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    CPUGather<T>(ctx.device_context(), *x, *index, output);
  }
};

template <typename T>
class GatherGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "This kernel only runs on CPU.");

    auto *Index = ctx.Input<Tensor>("Index");
    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));

    dX->mutable_data<T>(ctx.GetPlace());
    auto dxt = framework::EigenVector<T>::Flatten(*dX);
    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
    dxt.device(place) = dxt.constant(static_cast<T>(0));

    ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
  }
};
......
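The gradient path zeroes `dX` first and then writes `dO`'s rows back through `Index`; assuming the gathered indices are distinct, assigning and accumulating coincide. A scalar sketch of that zero-then-scatter step (a hypothetical stand-in for the row-wise `ScatterAssign`):

```c++
#include <cassert>
#include <vector>

int main() {
  // Forward was: out[i] = x[index[i]]  (rows treated as scalars here).
  std::vector<int> index = {2, 0};
  std::vector<float> d_out = {0.5f, 1.5f};  // incoming gradient, one per row
  std::vector<float> d_x(3);

  // Backward, as in GatherGradientOpKernel: zero dX, then scatter dOut.
  for (float& v : d_x) v = 0.0f;  // dxt = dxt.constant(0)
  for (size_t i = 0; i < index.size(); ++i)
    d_x[index[i]] = d_out[i];     // ScatterAssign
  assert(d_x[0] == 1.5f && d_x[1] == 0.0f && d_x[2] == 0.5f);
  return 0;
}
```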
@@ -41,7 +41,9 @@ TEST(Gather, GatherData) {
  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());

  auto* cpu_place = new paddle::platform::CPUPlace();
  paddle::platform::CPUDeviceContext ctx(*cpu_place);
  CPUGather<int>(ctx, *src, *index, output);

  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
......
@@ -47,7 +47,6 @@ class LstmUnitOp : public framework::OperatorWithKernel {
  }
};

class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LstmUnitOpMaker(framework::OpProto* proto,
@@ -68,7 +67,7 @@ Equation:
H = C * sigm(o)
)DOC");
    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
        .SetDefault(0.0);
  }
};
@@ -93,9 +92,11 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
            ops::LstmUnitGradOp);
REGISTER_OP_CPU_KERNEL(lstm_unit,
                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
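For reference, the per-element recurrence behind these kernels; the DOC string above shows only `H = C * sigm(o)`, so the gate layout `i, f, o, j` and the cell update below are assumptions following the conventional Caffe2-style lstm_unit (a literal scalar transcription, not the shipped kernel, which may apply `tanh(C)` before the output gate):

```c++
#include <cassert>
#include <cmath>

// Assumed equations:
//   C = C_prev * sigm(f + forget_bias) + sigm(i) * tanh(j)
//   H = C * sigm(o)          // as printed in the DOC string above
float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
  const float forget_bias = 0.0f;  // the AddAttr<float> default
  float i = 0.2f, f = -0.3f, o = 1.0f, j = 0.5f, c_prev = 0.1f;
  float c = c_prev * sigmoid(f + forget_bias) + sigmoid(i) * std::tanh(j);
  float h = c * sigmoid(o);
  assert(c > 0.0f && h < c);  // sigm(o) < 1 shrinks the output
  return 0;
}
```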
@@ -89,7 +89,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
  }
}

template <typename T>
class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -101,7 +101,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
    auto* c_tensor = ctx.Output<framework::Tensor>("C");
    auto* h_tensor = ctx.Output<framework::Tensor>("H");

    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));

    int b_size = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
@@ -120,7 +120,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  }
};

template <typename T>
class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -153,7 +153,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
    int N = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];

    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));

    int block = 512;
    int n = N * D;
@@ -169,5 +169,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
                       ops::LstmUnitOpCUDAKernel<double>);
REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
                       ops::LstmUnitGradOpCUDAKernel<double>);
@@ -32,7 +32,7 @@ inline T tanh(T x) {
  return 2. * sigmoid(2. * x) - 1.;
}

template <typename Place, typename T>
class LstmUnitKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -44,7 +44,7 @@ class LstmUnitKernel : public framework::OpKernel<T> {
    auto* c_tensor = ctx.Output<framework::Tensor>("C");
    auto* h_tensor = ctx.Output<framework::Tensor>("H");

    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));

    int b_size = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];
@@ -75,7 +75,7 @@ class LstmUnitKernel : public framework::OpKernel<T> {
  }
};

template <typename Place, typename T>
class LstmUnitGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -108,7 +108,7 @@ class LstmUnitGradKernel : public framework::OpKernel<T> {
    int N = c_tensor->dims()[0];
    int D = c_tensor->dims()[1];

    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));

    for (int n = 0; n < N; ++n) {
      for (int d = 0; d < D; ++d) {
......
if(WITH_GPU)
    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu pooling.cc pooling.cu DEPS cblas device_context operator)
    nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
else()
    cc_library(math_function SRCS math_function.cc im2col.cc pooling.cc DEPS cblas device_context operator)
    cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
    cc_library(softmax SRCS softmax.cc DEPS operator)
    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
......
@@ -71,7 +71,7 @@ void testIm2col() {
    context =
        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
  } else {
#ifdef PADDLE_WITH_CUDA
    context =
        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
#else
@@ -116,7 +116,7 @@ void testIm2col() {
TEST(math, im2col) {
  testIm2col<paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
  testIm2col<paddle::platform::GPUPlace>();
#endif
}
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(math_function, notrans_mul_trans) { TEST(math_function, notrans_mul_trans) {
paddle::framework::Tensor input1; paddle::framework::Tensor input1;
paddle::framework::Tensor input1_gpu; paddle::framework::Tensor input1_gpu;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/pooling.h"
namespace paddle {
namespace operators {
namespace math {
template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(ele, input_data[h * input_width + w]);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, (static_cast<T>(pool_size)));
output_data[ph * output_width + pw] = ele;
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
};
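`Pool2dFunctor` is generic over a small `PoolProcess` policy: `initial()` seeds the accumulator, `compute()` folds in each input inside the (clipped) window, and `finalize()` rescales by the window size where needed. A minimal sketch of two such policies; the real `MaxPool`/`AvgPool` live in `pooling.h`, which this diff does not show, so the class shapes below are assumptions:

```c++
#include <algorithm>
#include <cassert>
#include <limits>

// Hypothetical policy classes with the interface Pool2dFunctor expects.
template <typename T>
struct MaxPoolSketch {
  T initial() { return -std::numeric_limits<T>::max(); }
  void compute(T& y, const T& x) { y = std::max(y, x); }
  void finalize(T& /*y*/, const T& /*pool_size*/) {}  // max needs no rescale
};

template <typename T>
struct AvgPoolSketch {
  T initial() { return static_cast<T>(0); }
  void compute(T& y, const T& x) { y += x; }
  void finalize(T& y, const T& pool_size) { y /= pool_size; }
};

int main() {
  float window[4] = {1.f, 2.f, 3.f, 4.f};

  MaxPoolSketch<float> max_pool;
  float m = max_pool.initial();
  for (float v : window) max_pool.compute(m, v);
  max_pool.finalize(m, 4.f);

  AvgPoolSketch<float> avg_pool;
  float a = avg_pool.initial();
  for (float v : window) avg_pool.compute(a, v);
  avg_pool.finalize(a, 4.f);

  assert(m == 4.f && a == 2.5f);
  return 0;
}
```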
template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
float scale = 1.0 / pool_size;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_grad_process.compute(
input_data[h * input_width + w],
output_data[ph * output_width + pw],
output_grad_data[ph * output_width + pw],
input_grad_data[h * input_width + w],
static_cast<T>(scale));
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
template <class T>
class MaxPool2dGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx = h * input_width + w;
int output_idx = ph * output_width + pw;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
stop = true;
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
// template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
template class Pool2dFunctor<platform::CPUPlace,
paddle::operators::math::MaxPool<float>, float>;
template class Pool2dFunctor<platform::CPUPlace,
paddle::operators::math::AvgPool<float>, float>;
template class Pool2dGradFunctor<
platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
template class Pool2dGradFunctor<
platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
template class Pool2dFunctor<platform::CPUPlace,
paddle::operators::math::MaxPool<double>, double>;
template class Pool2dFunctor<platform::CPUPlace,
paddle::operators::math::AvgPool<double>, double>;
template class Pool2dGradFunctor<
platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
template class Pool2dGradFunctor<
platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
ele,
input_data[(d * input_height + h) * input_width + w]);
}
}
}
int pool_size =
(dend - dstart) * (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, static_cast<T>(pool_size));
output_data[output_idx] = ele;
}
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
};
template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int pool_size =
(dend - dstart) * (hend - hstart) * (wend - wstart);
float scale = 1.0 / pool_size;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int input_idx = (d * input_height + h) * input_width + w;
int output_idx =
(pd * output_height + ph) * output_width + pw;
pool_grad_process.compute(
input_data[input_idx], output_data[output_idx],
output_grad_data[output_idx],
input_grad_data[input_idx], static_cast<T>(scale));
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
template <class T>
class MaxPool3dGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const int input_stride = input_depth * input_height * input_width;
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
int input_idx = (d * input_height + h) * input_width + w;
int output_idx =
(pd * output_height + ph) * output_width + pw;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] +=
output_grad_data[output_idx];
stop = true;
}
}
}
}
}
}
}
input_data += input_stride;
output_data += output_stride;
input_grad_data += input_stride;
output_grad_data += output_stride;
}
}
}
};
template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
// template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
template class Pool3dFunctor<platform::CPUPlace,
paddle::operators::math::MaxPool<float>, float>;
template class Pool3dFunctor<platform::CPUPlace,
paddle::operators::math::AvgPool<float>, float>;
template class Pool3dGradFunctor<
platform::CPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
template class Pool3dGradFunctor<
platform::CPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
template class Pool3dFunctor<platform::CPUPlace,
paddle::operators::math::MaxPool<double>, double>;
template class Pool3dFunctor<platform::CPUPlace,
paddle::operators::math::AvgPool<double>, double>;
template class Pool3dGradFunctor<
platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
template class Pool3dGradFunctor<
platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
} // namespace math
} // namespace operators
} // namespace paddle
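The max-pool backward functors above route each output gradient to the first input element that equals the pooled maximum (the `stop` flag breaks ties toward the lowest index), whereas average pooling spreads `1/pool_size` across the whole window. A scalar sketch of the max routing over one window:

```c++
#include <cassert>

int main() {
  // One pooling window and its forward result.
  float input[4] = {1.f, 4.f, 4.f, 2.f};
  float output = 4.f;       // max over the window
  float output_grad = 1.f;  // incoming gradient for this output
  float input_grad[4] = {0.f, 0.f, 0.f, 0.f};

  // Same tie-breaking as MaxPool2dGradFunctor: first match wins.
  bool stop = false;
  for (int w = 0; w < 4 && !stop; ++w) {
    if (input[w] == output) {
      input_grad[w] += output_grad;
      stop = true;
    }
  }
  assert(input_grad[1] == 1.f && input_grad[2] == 0.f);
  return 0;
}
```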
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/pooling.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <typename PoolProcess, typename T>
__global__ void KernelPool2D(const int nthreads, const T* input_data,
T* output_data, const int channels,
const int input_height, const int input_width,
const int output_height, const int output_width,
const int ksize_height, const int ksize_width,
const int stride_height, const int stride_width,
const int padding_height, const int padding_width,
PoolProcess pool_process) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int c = (index / output_width / output_height) % channels;
int batch_idx = index / output_width / output_height / channels;
int hstart = ph * stride_height - padding_height;
int hend = min(hstart + ksize_height, input_height);
hstart = max(hstart, 0);
int wstart = pw * stride_width - padding_width;
int wend = min(wstart + ksize_width, input_width);
wstart = max(wstart, 0);
input_data += (batch_idx * channels + c) * input_height * input_width;
T ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(ele, input_data[h * input_width + w]);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, (static_cast<T>(pool_size)));
output_data[index] = ele;
}
}
template <typename PoolProcess, typename T>
__global__ void KernelPool2DGrad(
const int nthreads, const T* input_data, const T* output_data,
const T* output_grad, T* input_grad, const int channels,
const int input_height, const int input_width, const int output_height,
const int output_width, const int ksize_height, const int ksize_width,
const int stride_height, const int stride_width, const int padding_height,
const int padding_width, PoolProcess pool_process) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int offsetW = index % input_width + padding_width;
int offsetH = (index / input_width) % input_height + padding_height;
int offsetC = (index / input_width / input_height) % channels;
int batch_idx = index / input_width / input_height / channels;
int phstart = (offsetH < ksize_height)
? 0
: (offsetH - ksize_height) / stride_height + 1;
int pwstart = (offsetW < ksize_width)
? 0
: (offsetW - ksize_width) / stride_width + 1;
int phend = min(offsetH / stride_height + 1, output_height);
int pwend = min(offsetW / stride_width + 1, output_width);
T gradient = 0;
T input = input_data[index];
int output_idx =
(batch_idx * channels + offsetC) * output_height * output_width;
output_data += output_idx;
output_grad += output_idx;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int hend = min(hstart + ksize_height, input_height);
int wend = min(wstart + ksize_width, input_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (hend - hstart) * (wend - wstart);
int output_sub_idx = ph * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx], gradient,
static_cast<T>(1.0 / pool_size));
}
}
input_grad[index] = gradient;
}
}
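`KernelPool2DGrad` iterates over input elements rather than outputs, so it must invert the window map: for an input at padded offset `offsetH`, the covering output rows are `[phstart, phend)` with `phstart = (offsetH < ksize) ? 0 : (offsetH - ksize) / stride + 1` and `phend = min(offsetH / stride + 1, output_height)`. A quick host-side check of that inversion against the forward window bounds (plain C++, same arithmetic as the kernel):

```c++
#include <algorithm>
#include <cassert>

int main() {
  const int ksize = 3, stride = 2, output_size = 4;
  // For each padded input offset, compute the covering output range the way
  // KernelPool2DGrad does, and verify it against the forward window bounds.
  for (int offset = 0; offset < 9; ++offset) {
    int pstart = (offset < ksize) ? 0 : (offset - ksize) / stride + 1;
    int pend = std::min(offset / stride + 1, output_size);
    for (int p = 0; p < output_size; ++p) {
      // Forward: output p reads padded inputs [p*stride, p*stride + ksize).
      bool covers = p * stride <= offset && offset < p * stride + ksize;
      bool in_range = pstart <= p && p < pend;
      assert(covers == in_range);
    }
  }
  return 0;
}
```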
template <typename T>
__global__ void KernelMaxPool2DGrad(
const int nthreads, const T* input_data, const T* output_data,
const T* output_grad, T* input_grad, const int channels,
const int input_height, const int input_width, const int output_height,
const int output_width, const int ksize_height, const int ksize_width,
const int stride_height, const int stride_width, const int padding_height,
const int padding_width) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int c = (index / output_width / output_height) % channels;
int batch_idx = index / output_width / output_height / channels;
int hstart = ph * stride_height - padding_height;
int hend = min(hstart + ksize_height, input_height);
hstart = max(hstart, 0);
int wstart = pw * stride_width - padding_width;
int wend = min(wstart + ksize_width, input_width);
wstart = max(wstart, 0);
input_data += (batch_idx * channels + c) * input_height * input_width;
input_grad += (batch_idx * channels + c) * input_height * input_width;
T ele = output_data[index];
int maxIndex = -1;
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
if (ele == input_data[h * input_width + w]) {
maxIndex = h * input_width + w;
stop = true;
}
}
}
if (maxIndex != -1) {
// atomic add
atomicAdd(input_grad + maxIndex, output_grad[index]);
}
}
}
template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool2D<
PoolProcess,
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(nthreads, input_data, output_data, input_channels,
input_height, input_width, output_height,
output_width, ksize_height, ksize_width,
stride_height, stride_width, padding_height,
padding_width, pool_process);
}
};
template <typename PoolProcess, typename T>
class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
int nthreads = batch_size * input_channels * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool2DGrad<
PoolProcess,
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_grad_data,
input_channels, input_height, input_width, output_height, output_width,
ksize_height, ksize_width, stride_height, stride_width, padding_height,
padding_width, pool_process);
}
};
template <typename T>
class MaxPool2dGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_height * output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelMaxPool2DGrad<
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_grad_data,
input_channels, input_height, input_width, output_height, output_width,
ksize_height, ksize_width, stride_height, stride_width, padding_height,
padding_width);
}
};
template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
// template class MaxPool2dGradFunctor<platform::GPUPlace, double>; // The
// 64-bit floating-point version of atomicAdd() is only supported by devices of
// compute capability 6.x and higher.
template class Pool2dFunctor<platform::GPUPlace,
paddle::operators::math::MaxPool<float>, float>;
template class Pool2dFunctor<platform::GPUPlace,
paddle::operators::math::AvgPool<float>, float>;
template class Pool2dGradFunctor<
platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
template class Pool2dGradFunctor<
platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
template class Pool2dFunctor<platform::GPUPlace,
paddle::operators::math::MaxPool<double>, double>;
template class Pool2dFunctor<platform::GPUPlace,
paddle::operators::math::AvgPool<double>, double>;
template class Pool2dGradFunctor<
platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
template class Pool2dGradFunctor<
platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
template <typename PoolProcess, typename T>
__global__ void KernelPool3D(
const int nthreads, const T* input_data, T* output_data, const int channels,
const int input_depth, const int input_height, const int input_width,
const int output_depth, const int output_height, const int output_width,
const int ksize_depth, const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height, const int stride_width,
const int padding_depth, const int padding_height, const int padding_width,
PoolProcess pool_process) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int pd = (index / output_width / output_height) % output_depth;
int c = (index / output_width / output_height / output_depth) % channels;
int batch_idx =
index / output_width / output_height / output_depth / channels;
int dstart = pd * stride_depth - padding_depth;
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int dend = min(dstart + ksize_depth, input_depth);
int hend = min(hstart + ksize_height, input_height);
int wend = min(wstart + ksize_width, input_width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T ele = pool_process.initial();
input_data +=
(batch_idx * channels + c) * input_depth * input_height * input_width;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(
ele, input_data[(d * input_height + h) * input_width + w]);
}
}
}
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
pool_process.finalize(ele, static_cast<T>(pool_size));
output_data[index] = ele;
}
}
template <typename PoolProcess, typename T>
__global__ void KernelPool3DGrad(
const int nthreads, const T* input_data, const T* output_data,
const T* output_grad, T* input_grad, const int channels,
const int input_depth, const int input_height, const int input_width,
const int output_depth, const int output_height, const int output_width,
const int ksize_depth, const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height, const int stride_width,
const int padding_depth, const int padding_height, const int padding_width,
PoolProcess pool_process) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int offsetW = index % input_width + padding_width;
int offsetH = (index / input_width) % input_height + padding_height;
int offsetD =
(index / input_width / input_height) % input_depth + padding_depth;
int offsetC = (index / input_width / input_height / input_depth) % channels;
int batch_idx = index / input_width / input_height / input_depth / channels;
int pdstart = (offsetD < ksize_depth)
? 0
: (offsetD - ksize_depth) / stride_depth + 1;
int phstart = (offsetH < ksize_height)
? 0
: (offsetH - ksize_height) / stride_height + 1;
int pwstart = (offsetW < ksize_width)
? 0
: (offsetW - ksize_width) / stride_width + 1;
int pdend = min((offsetD) / stride_depth + 1, output_depth);
int phend = min((offsetH) / stride_height + 1, output_height);
int pwend = min((offsetW) / stride_width + 1, output_width);
T gradient = 0;
T input = input_data[index];
int output_idx = (batch_idx * channels + offsetC) * output_depth *
output_height * output_width;
output_data += output_idx;
output_grad += output_idx;
for (int pd = pdstart; pd < pdend; ++pd) {
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int dstart = pd * stride_depth - padding_depth;
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int dend = min(dstart + ksize_depth, input_depth);
int hend = min(hstart + ksize_height, input_height);
int wend = min(wstart + ksize_width, input_width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
int output_sub_idx = (pd * output_height + ph) * output_width + pw;
pool_process.compute(input, output_data[output_sub_idx],
output_grad[output_sub_idx], gradient,
static_cast<T>(1.0 / pool_size));
}
}
}
input_grad[index] = gradient;
}
}
template <typename T>
__global__ void KernelMaxPool3DGrad(
const int nthreads, const T* input_data, const T* output_data,
const T* output_grad, T* input_grad, const int channels,
const int input_depth, const int input_height, const int input_width,
const int output_depth, const int output_height, const int output_width,
const int ksize_depth, const int ksize_height, const int ksize_width,
const int stride_depth, const int stride_height, const int stride_width,
const int padding_depth, const int padding_height,
const int padding_width) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
index += blockDim.x * gridDim.x) {
int pw = index % output_width;
int ph = (index / output_width) % output_height;
int pd = (index / output_width / output_height) % output_depth;
int c = (index / output_width / output_height / output_depth) % channels;
int batch_idx =
index / output_width / output_height / output_depth / channels;
int dstart = pd * stride_depth - padding_depth;
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int dend = min(dstart + ksize_depth, input_depth);
int hend = min(hstart + ksize_height, input_height);
int wend = min(wstart + ksize_width, input_width);
dstart = max(dstart, 0);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
T ele = output_data[index];
bool stop = false;
int maxIdx = -1;
input_data +=
(batch_idx * channels + c) * input_depth * input_height * input_width;
input_grad +=
(batch_idx * channels + c) * input_depth * input_height * input_width;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
for (int w = wstart; w < wend && !stop; ++w) {
if (ele == input_data[(d * input_height + h) * input_width + w]) {
stop = true;
maxIdx = (d * input_height + h) * input_width + w;
}
}
}
}
if (maxIdx != -1) {
// atomic add
atomicAdd(input_grad + maxIdx, output_grad[index]);
}
}
}
template <typename PoolProcess, class T>
class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_depth * output_height *
output_width;
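  // One thread per output element; round the block count up so that
  // blocks * 1024 covers all nthreads elements.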
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool3D<
PoolProcess,
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, input_channels, input_depth,
input_height, input_width, output_depth, output_height, output_width,
ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
stride_width, padding_depth, padding_height, padding_width,
pool_process);
}
};
template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_process) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
int nthreads =
batch_size * input_channels * input_depth * input_height * input_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelPool3DGrad<
PoolProcess,
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_grad_data,
input_channels, input_depth, input_height, input_width, output_depth,
output_height, output_width, ksize_depth, ksize_height, ksize_width,
stride_depth, stride_height, stride_width, padding_depth,
padding_height, padding_width, pool_process);
}
};
template <class T>
class MaxPool3dGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
const int stride_depth = strides[0];
const int stride_height = strides[1];
const int stride_width = strides[2];
const int padding_depth = paddings[0];
const int padding_height = paddings[1];
const int padding_width = paddings[2];
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
int nthreads = batch_size * output_channels * output_depth * output_height *
output_width;
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
KernelMaxPool3DGrad<
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_grad_data,
input_channels, input_depth, input_height, input_width, output_depth,
output_height, output_width, ksize_depth, ksize_height, ksize_width,
stride_depth, stride_height, stride_width, padding_depth,
padding_height, padding_width);
}
};
template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
// template class MaxPool3dGradFunctor<platform::GPUPlace, double>; // The
// 64-bit floating-point version of atomicAdd() is only supported by devices of
// compute capability 6.x and higher.
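// Illustrative sketch (hypothetical helper, not used above): on devices of
// compute capability < 6.0, a double-precision atomicAdd can be emulated
// with atomicCAS on the value's bit pattern, as described in the CUDA C
// Programming Guide.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ inline double AtomicAddDouble(double* address, double val) {
  unsigned long long int* address_as_ull =
      reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);  // retry if another thread updated the value
  return __longlong_as_double(old);
}
#endif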
template class Pool3dFunctor<platform::GPUPlace,
paddle::operators::math::MaxPool<float>, float>;
template class Pool3dFunctor<platform::GPUPlace,
paddle::operators::math::AvgPool<float>, float>;
template class Pool3dGradFunctor<
platform::GPUPlace, paddle::operators::math::MaxPoolGrad<float>, float>;
template class Pool3dGradFunctor<
platform::GPUPlace, paddle::operators::math::AvgPoolGrad<float>, float>;
template class Pool3dFunctor<platform::GPUPlace,
paddle::operators::math::MaxPool<double>, double>;
template class Pool3dFunctor<platform::GPUPlace,
paddle::operators::math::AvgPool<double>, double>;
template class Pool3dGradFunctor<
platform::GPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
template class Pool3dGradFunctor<
platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/hostdevice.h"
namespace paddle {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
template <class T>
class MaxPool {
 public:
  // Identity element for the max reduction.
  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
  // Fold one more input element into the running maximum.
  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
  // Max pooling needs no post-processing.
  DEVICE inline void finalize(T& y, const T& pool_size) {}
};
template <class T>
class AvgPool {
 public:
  // Identity element for the sum reduction.
  DEVICE inline T initial() { return static_cast<T>(0); }
  // Accumulate one more input element into the running sum.
  DEVICE inline void compute(T& y, const T& x) { y += x; }
  // Divide the sum by the pooling window size to get the average.
  DEVICE inline void finalize(T& y, const T& pool_size) { y /= pool_size; }
};
template <class T>
class MaxPoolGrad {
 public:
  // Route the output gradient back to the input element that attained the
  // maximum; elsewhere the contribution is zero.
  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
                             T scale) {
    dx += dy * (x == y);
  }
};
template <class T>
class AvgPoolGrad {
 public:
  // Distribute the output gradient uniformly; `scale` is 1 / pool_size.
  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
                             T scale) {
    dx += (scale * dy);
  }
};
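/*
 * The functors below define the device-independent pooling interface:
 * Pool2dFunctor / Pool3dFunctor compute the forward pass with a pooling
 * process (MaxPool or AvgPool) supplied as `pool_compute`;
 * Pool2dGradFunctor / Pool3dGradFunctor compute the backward pass with a
 * gradient process (AvgPoolGrad or MaxPoolGrad); MaxPool2dGradFunctor /
 * MaxPool3dGradFunctor handle the max-pooling backward pass separately,
 * since it needs the forward output to locate the argmax.
 */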
template <typename Place, typename PoolProcess, typename T>
class Pool2dFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_compute);
};
template <typename Place, typename PoolProcess, typename T>
class Pool2dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute);
};
template <typename Place, class T>
class MaxPool2dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
};
template <typename Place, typename PoolProcess, typename T>
class Pool3dFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_compute);
};
template <typename Place, typename PoolProcess, typename T>
class Pool3dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute);
};
template <typename Place, class T>
class MaxPool3dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
};
} // namespace math
} // namespace operators
} // namespace paddle
...@@ -36,7 +36,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,7 +36,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of mean op"); AddInput("X", "The input of mean op");
AddOutput("Out", "The output of mean op").NotInGradient(); AddOutput("Out", "The output of mean op");
AddComment(R"DOC( Mean Operator AddComment(R"DOC( Mean Operator
)DOC"); )DOC");
} }
...@@ -52,11 +52,27 @@ class MeanGradOp : public framework::OperatorWithKernel { ...@@ -52,11 +52,27 @@ class MeanGradOp : public framework::OperatorWithKernel {
} }
}; };
class MeanGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto* grad_op = new framework::OpDescBind();
grad_op->SetType("mean_grad");
grad_op->SetInput("X", Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp); REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
REGISTER_OP_CPU_KERNEL(mean, REGISTER_OP_CPU_KERNEL(mean,
ops::MeanKernel<paddle::platform::CPUPlace, float>); ops::MeanKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mean_grad, REGISTER_OP_CPU_KERNEL(mean_grad,
......
...@@ -49,9 +49,9 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -49,9 +49,9 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The left tensor of minus operator.").NotInGradient(); AddInput("X", "The left tensor of minus operator.");
AddInput("Y", "The right tensor of minus operator.").NotInGradient(); AddInput("Y", "The right tensor of minus operator.");
AddOutput("Out", "The output tensor of minus operator.").NotInGradient(); AddOutput("Out", "The output tensor of minus operator.");
AddComment(R"DOC(Minus Operator AddComment(R"DOC(Minus Operator
...@@ -64,26 +64,35 @@ or not. But the output only shares the LoD with input `X`. ...@@ -64,26 +64,35 @@ or not. But the output only shares the LoD with input `X`.
)DOC"); )DOC");
} }
}; };
template <typename AttrType>
class MinusGradOp : public NetOp { class MinusGradMaker : public framework::GradOpDescMakerBase {
public: public:
MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs, using framework::GradOpDescMakerBase::GradOpDescMakerBase;
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
: NetOp(type, inputs, outputs, attrs) { const override {
auto out_grad = Input(framework::GradVarName("Out")); std::vector<std::unique_ptr<framework::OpDescBind>> ops;
auto x_grad = Output(framework::GradVarName("X")); auto x_g = InputGrad("X");
auto y_grad = Output(framework::GradVarName("Y")); if (!x_g.empty()) {
auto *x_g_op = new framework::OpDescBind();
// x_grad = out_grad x_g_op->SetType("scale");
AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}}, x_g_op->SetInput("X", OutputGrad("Out"));
{{"Y", {x_grad}}}, {})); x_g_op->SetOutput("Out", x_g);
x_g_op->SetAttr("scale", 1.0f);
framework::AttributeMap scale_attr; ops.emplace_back(x_g_op);
scale_attr["scale"] = static_cast<AttrType>(-1); }
AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
{{"Out", {y_grad}}}, scale_attr)); auto y_g = InputGrad("Y");
CompleteAddOp(false); if (!y_g.empty()) {
auto *y_g_op = new framework::OpDescBind();
y_g_op->SetType("scale");
y_g_op->SetInput("X", OutputGrad("Out"));
y_g_op->SetOutput("Out", y_g);
y_g_op->SetAttr("scale", -1.0f);
ops.emplace_back(y_g_op);
}
return ops;
} }
}; };
...@@ -91,7 +100,6 @@ class MinusGradOp : public NetOp { ...@@ -91,7 +100,6 @@ class MinusGradOp : public NetOp {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad, REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
ops::MinusGradOp<float>);
REGISTER_OP_CPU_KERNEL(minus, REGISTER_OP_CPU_KERNEL(minus,
ops::MinusKernel<paddle::platform::CPUPlace, float>); ops::MinusKernel<paddle::platform::CPUPlace, float>);
...@@ -56,8 +56,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -56,8 +56,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
"The input should be a k-D tensor(k > 0 and k < 7)"); "The input should be a k-D tensor(k > 0 and k < 7)");
AddOutput("Out", AddOutput("Out",
"The output of pad op." "The output of pad op."
"A tensor with the same shape as X.") "A tensor with the same shape as X.");
.NotInGradient();
AddComment(R"DOC( AddComment(R"DOC(
Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example: Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
...@@ -111,11 +110,29 @@ class PadOpGrad : public framework::OperatorWithKernel { ...@@ -111,11 +110,29 @@ class PadOpGrad : public framework::OperatorWithKernel {
} }
}; };
class PadOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto* bind = new framework::OpDescBind();
bind->SetInput("X", Input("X"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
bind->SetAttrMap(Attrs());
bind->SetType("pad_grad");
return std::unique_ptr<framework::OpDescBind>(bind);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(pad, ops::PadOp, ops::PadOpMaker, pad_grad, ops::PadOpGrad);
REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(pad_grad, REGISTER_OP_CPU_KERNEL(pad_grad,
ops::PadGradKernel<paddle::platform::CPUPlace, float>); ops::PadGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/pool_op.h"
namespace paddle {
namespace operators {
int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
return output_size;
}
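// Worked example (illustrative): input_size = 32, filter_size = 3,
// padding = 1, stride = 2 gives (32 - 3 + 2 * 1) / 2 + 1 = 16.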
class PoolOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Out(Output) of Pooling should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
"pooling_type should be 'max' or 'avg'");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D");
if (ctx->Attrs().Get<bool>("globalPooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i)
ksize[i] = static_cast<int>(in_x_dims[i + 2]);
}
PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
"Input size and Pooling size should be consistent.");
PADDLE_ENFORCE(ksize.size() == 2 || ksize.size() == 3,
"Pooling size should be 2 elements. or 3 elements.");
PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
"strides size and pooling size should be the same.");
PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
"paddings size and pooling size should be the same.");
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < ksize.size(); ++i) {
output_shape.push_back(
OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
}
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
}
};
class PoolOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"X(Input) of Pooling should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input@Grad of Pooling should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Pool2dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"X",
"The input tensor of pooling operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddOutput("Out",
"The output tensor of pooling operator."
"The format of output tensor is also NCHW.");
AddAttr<std::string>("poolingType",
"PoolingType of pooling operator."
"Str constant equal to 'max' or 'avg'.")
.InEnum({"max", "avg"});
AddAttr<std::vector<int>>(
"ksize",
"Pooling size(depth, height, width) of pooling operator."
"If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Add checker)
AddAttr<bool>(
"globalPooling",
"Whether to use the globalPooling."
"Bool constant equal to false or true."
"Default false."
"If globalPooling = true, ksize is ignored and need not be specified.")
.SetDefault(false);
AddAttr<std::vector<int>>("strides",
"Strides(height, width) of pooling operator."
"Default {1,1}")
.SetDefault({1, 1}); // TODO(Add checker)
AddAttr<std::vector<int>>("paddings",
"Paddings(height, width) of pooling operator."
"Default {0,0}.")
.SetDefault({0, 0}); // TODO(Add checker)
AddComment(R"DOC(
The pooling2d operation calculates the output based on
the input, poolingType, ksize, strides, and paddings parameters.
)DOC");
}
};
class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Pool3dOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"The input tensor of pooling operator. "
"The format of input tensor is NCDHW. Where N is batch size, C is "
"the "
"number of channels, D, H and W is the depth, height and width of "
"feature.");
AddOutput("Out",
"The output tensor of pooling operator."
"The format of output tensor is also NCDHW.");
AddAttr<std::string>("poolingType",
"PoolingType of pooling operator."
"str constant equal to 'max' or 'avg'.")
.InEnum({"max", "avg"});
AddAttr<std::vector<int>>(
"ksize",
"Pooling size(depth, height, width) of pooling operator."
"If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Add checker)
AddAttr<bool>(
"globalPooling",
"Whether to use the globalPooling."
"Bool constant equal to false or true."
"Default false."
"If globalPooling = true, ksize is ignored and need not be specified.")
.SetDefault(false);
AddAttr<std::vector<int>>(
"strides",
"Strides(depth, height, width) of pooling operator."
"Default {1,1,1}.")
.SetDefault({1, 1, 1}); // TODO(Add checker)
AddAttr<std::vector<int>>(
"paddings",
"Paddings(depth, height, width) of pooling operator."
"Default {0,0,0}.")
.SetDefault({0, 0, 0}); // TODO(Add checker)
AddComment(R"DOC(
The pooling3d operation calculates the output based on
the input, poolingType, ksize, strides, and paddings parameters.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
ops::PoolOpGrad);
REGISTER_OP_CPU_KERNEL(pool2d,
ops::PoolKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(pool2d_grad,
ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
ops::PoolOpGrad);
REGISTER_OP_CPU_KERNEL(pool3d,
ops::PoolKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(pool3d_grad,
ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
...@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #include "paddle/operators/pool_op.h"
#include "paddle/framework/op_desc.h" namespace ops = paddle::operators;
#include "paddle/framework/operator.h"
namespace paddle { REGISTER_OP_GPU_KERNEL(pool2d,
namespace framework { ops::PoolKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(pool2d_grad,
ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
OperatorBase* BuildGradOp(const OperatorBase* op); REGISTER_OP_GPU_KERNEL(pool3d,
ops::PoolKernel<paddle::platform::GPUPlace, float>);
void CompleteGradOpDesc(const OpDescBind* forw_op, OpDescBind* grad_op); REGISTER_OP_GPU_KERNEL(pool3d_grad,
ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/pooling.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class PoolKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* in_x = context.Input<Tensor>("X");
Tensor* out = context.Output<Tensor>("Out");
std::string pooling_type = context.Attr<std::string>("poolingType");
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
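    // With globalPooling the window spans the full spatial extent, so ksize
    // is overwritten with the input's spatial dimensions.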
if (context.Attr<bool>("globalPooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
switch (ksize.size()) {
case 2: {
if (pooling_type == "max") {
paddle::operators::math::Pool2dFunctor<
Place, paddle::operators::math::MaxPool<T>, T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
Place, paddle::operators::math::AvgPool<T>, T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
}
} break;
case 3: {
if (pooling_type == "max") {
paddle::operators::math::Pool3dFunctor<
Place, paddle::operators::math::MaxPool<T>, T>
pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor<
Place, paddle::operators::math::AvgPool<T>, T>
pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
}
} break;
}
}
};
template <typename Place, typename T>
class PoolGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* in_x = context.Input<Tensor>("X");
const Tensor* out = context.Input<Tensor>("Out");
const Tensor* out_grad =
context.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
std::string pooling_type = context.Attr<std::string>("poolingType");
std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
if (context.Attr<bool>("globalPooling")) {
for (size_t i = 0; i < ksize.size(); ++i)
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace());
auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
temp.device(context.GetEigenDevice<Place>()) =
temp.constant(static_cast<T>(0));
switch (ksize.size()) {
case 2: {
if (pooling_type == "max") {
paddle::operators::math::MaxPool2dGradFunctor<Place, T>
pool2d_backward;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings, pool_process);
}
} break;
case 3: {
if (pooling_type == "max") {
paddle::operators::math::MaxPool3dGradFunctor<Place, T>
pool3d_backward;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings, pool_process);
}
} break;
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -30,36 +30,39 @@ using LoDTensor = framework::LoDTensor; ...@@ -30,36 +30,39 @@ using LoDTensor = framework::LoDTensor;
void RecurrentAlgorithm::Run(const Scope& scope, void RecurrentAlgorithm::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const { const platform::DeviceContext& dev_ctx) const {
auto step_scopes = GetStepScopes(scope); auto* input0 = scope.FindVar(arg_->inlinks[0]);
rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, PADDLE_ENFORCE_NOT_NULL(input0);
false /*infer_shape_mode*/); size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
InitMemories(step_scopes[0], false /*infer_shape_mode*/); PADDLE_ENFORCE_GT(seq_len, 0);
for (size_t step_id = 0; step_id < seq_len_; step_id++) { CreateScopes(scope, seq_len);
// create output alias variables auto& step_scopes = GetStepScopes(scope);
rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
InitMemories(step_scopes[0]);
for (size_t step_id = 0; step_id < seq_len; step_id++) {
if (step_id > 0) { if (step_id > 0) {
rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1, rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1);
false /*infer_shape_mode*/);
} }
(*stepnet_)->Run(*step_scopes[step_id], dev_ctx); (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
} }
rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
false /*infer_shape_mode*/);
} }
void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { void RecurrentAlgorithm::CreateScopes(const Scope& scope,
size_t seq_len) const {
// TODO(superjom) Only two scopes are needed for inference, this case will be // TODO(superjom) Only two scopes are needed for inference, this case will be
// supported later. // supported later.
auto step_scopes_var = scope.FindVar(arg_->step_scopes); auto* step_scopes_var = scope.FindVar(arg_->step_scopes);
PADDLE_ENFORCE(step_scopes_var != nullptr, ""); PADDLE_ENFORCE(step_scopes_var != nullptr, "");
auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>(); auto* step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
// Now all variables in scope must be created outside of op. // Now all variables in scope must be created outside of op.
PADDLE_ENFORCE_NOT_NULL(stepnet_); PADDLE_ENFORCE_NOT_NULL(stepnet_);
PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs"); PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
if (seq_len_ > step_scopes->size()) { if (seq_len > step_scopes->size()) {
for (size_t i = step_scopes->size(); i < seq_len_; ++i) { for (size_t i = step_scopes->size(); i < seq_len; ++i) {
auto& step_scope = scope.NewScope(); auto& step_scope = scope.NewScope();
// create step net's temp inputs // create step net's temp inputs
...@@ -82,8 +85,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const { ...@@ -82,8 +85,7 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
} }
} }
void RecurrentAlgorithm::InitMemories(Scope* step_scope, void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
bool infer_shape_mode) const {
for (auto& attr : arg_->memories) { for (auto& attr : arg_->memories) {
auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>(); auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr, PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
...@@ -91,13 +93,10 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope, ...@@ -91,13 +93,10 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope,
attr.boot_var); attr.boot_var);
auto* boot_mem = auto* boot_mem =
step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>(); step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
if (infer_shape_mode) {
pre_mem->Resize(boot_mem->dims()); pre_mem->Resize(boot_mem->dims());
PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2); PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
} else {
pre_mem->ShareDataWith<float>(*boot_mem); pre_mem->ShareDataWith<float>(*boot_mem);
} }
}
} }
const rnn::ArgumentName RecurrentOp::kArgName{ const rnn::ArgumentName RecurrentOp::kArgName{
...@@ -146,23 +145,23 @@ class RecurrentAlgorithmProtoAndCheckerMaker ...@@ -146,23 +145,23 @@ class RecurrentAlgorithmProtoAndCheckerMaker
void RecurrentGradientAlgorithm::Run( void RecurrentGradientAlgorithm::Run(
const Scope& scope, const platform::DeviceContext& dev_ctx) const { const Scope& scope, const platform::DeviceContext& dev_ctx) const {
auto step_scopes = GetStepScopes(scope); auto* input0 = scope.FindVar(arg_->inlinks[0]);
rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_, PADDLE_ENFORCE_NOT_NULL(input0);
false /*infer_shape_mode*/); size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { auto& step_scopes = GetStepScopes(scope);
if (static_cast<size_t>(step_id) != seq_len_ - 1) { rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1, for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
false /*infer_shape_mode*/); if (step_id != seq_len - 1) {
rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
} }
(*stepnet_)->Run(*step_scopes[step_id], dev_ctx); (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
} }
LinkBootMemoryGradients(step_scopes[0], false); rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len);
rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_, LinkBootMemoryGradients(step_scopes[0]);
false /*infer_shape_mode*/);
} }
void RecurrentGradientAlgorithm::LinkBootMemoryGradients( void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
Scope* step_scope, bool infer_shape_mode) const { Scope* step_scope) const {
for (auto& attr : arg_->memories) { for (auto& attr : arg_->memories) {
PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr, PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
"memory variable [%s] does not exists", attr.var); "memory variable [%s] does not exists", attr.var);
...@@ -171,12 +170,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients( ...@@ -171,12 +170,9 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>(); auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
auto* boot_mem_grad = auto* boot_mem_grad =
step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>(); step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
if (infer_shape_mode) {
boot_mem_grad->Resize(mem_grad->dims()); boot_mem_grad->Resize(mem_grad->dims());
} else {
boot_mem_grad->ShareDataWith<float>(*mem_grad); boot_mem_grad->ShareDataWith<float>(*mem_grad);
} }
}
} }
RecurrentGradientOp::RecurrentGradientOp( RecurrentGradientOp::RecurrentGradientOp(
......
...@@ -48,7 +48,7 @@ class RecurrentAlgorithm { ...@@ -48,7 +48,7 @@ class RecurrentAlgorithm {
* NOTE the scopes are reused in both the forward and backward, so just * NOTE the scopes are reused in both the forward and backward, so just
* create once and expand its size if more steps need. * create once and expand its size if more steps need.
*/ */
void CreateScopes(const framework::Scope& scope) const; void CreateScopes(const framework::Scope& scope, size_t seq_len) const;
const std::vector<framework::Scope*>& GetStepScopes( const std::vector<framework::Scope*>& GetStepScopes(
const framework::Scope& scope) const { const framework::Scope& scope) const {
...@@ -56,12 +56,11 @@ class RecurrentAlgorithm { ...@@ -56,12 +56,11 @@ class RecurrentAlgorithm {
->GetMutable<std::vector<framework::Scope*>>(); ->GetMutable<std::vector<framework::Scope*>>();
} }
void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const; void InitMemories(framework::Scope* step_scopes) const;
private: private:
std::unique_ptr<framework::OperatorBase>* stepnet_; std::unique_ptr<framework::OperatorBase>* stepnet_;
rnn::Argument* arg_; rnn::Argument* arg_;
mutable size_t seq_len_;
}; };
class RecurrentGradientAlgorithm { class RecurrentGradientAlgorithm {
...@@ -86,8 +85,7 @@ class RecurrentGradientAlgorithm { ...@@ -86,8 +85,7 @@ class RecurrentGradientAlgorithm {
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const; const platform::DeviceContext& dev_ctx) const;
void LinkBootMemoryGradients(framework::Scope* step_scopes, void LinkBootMemoryGradients(framework::Scope* step_scopes) const;
bool infer_shape_mode) const;
protected: protected:
inline const std::vector<framework::Scope*>& GetStepScopes( inline const std::vector<framework::Scope*>& GetStepScopes(
...@@ -98,7 +96,6 @@ class RecurrentGradientAlgorithm { ...@@ -98,7 +96,6 @@ class RecurrentGradientAlgorithm {
private: private:
rnn::Argument* arg_; rnn::Argument* arg_;
mutable size_t seq_len_;
std::unique_ptr<framework::OperatorBase>* stepnet_; std::unique_ptr<framework::OperatorBase>* stepnet_;
}; };
...@@ -123,6 +120,7 @@ class RecurrentOp : public framework::OperatorBase { ...@@ -123,6 +120,7 @@ class RecurrentOp : public framework::OperatorBase {
void set_stepnet(std::unique_ptr<OperatorBase> net) { void set_stepnet(std::unique_ptr<OperatorBase> net) {
stepnet_ = std::move(net); stepnet_ = std::move(net);
} }
const OperatorBase& stepnet() const { return *stepnet_; } const OperatorBase& stepnet() const { return *stepnet_; }
static const rnn::ArgumentName kArgName; static const rnn::ArgumentName kArgName;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/rmsprop_op.h"
namespace paddle {
namespace operators {
class RmspropOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(Param) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
"Input(MeanSquare) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Input(LearningRate) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(Grad) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Moment"),
"Input(Moment) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
"Output(param_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
"Output(Momentum_out) of RmspropOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
"Output(MeanSquareOut) of RmspropOp should not be null.");
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dim, ctx->GetInputDim("Grad"),
"Param and grad input of RmspropOp should have the same dimension.");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
"Param and Momentum input of RmspropOp "
"should have the same dimension.");
auto lr_dim = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
"Learning Rate should be a scalar.");
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("MomentOut", param_dim);
ctx->SetOutputDim("MeanSquareOut", param_dim);
}
};
class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
public:
RmspropOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param",
"(Tensor, default Tensor<float>) "
"Input parameter value that has to be updated");
AddInput("MeanSquare",
"(Tensor, default Tensor<float>)"
" The mean square value that gets updated");
AddInput("LearningRate",
"(Tensor, default Tensor<float>) "
"The learning rate should be a tensor of size 1");
AddInput("Grad",
"(Tensor, default Tensor<float>) "
"Input gradient of the parameter");
AddInput("Moment",
"(Tensor, default Tensor<float>) The moment that gets updated");
AddOutput("ParamOut", "(Tensor) Output updated parameter value");
AddOutput("MomentOut", "(Tensor) Output updated moment");
AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value");
AddAttr<float>("epsilon",
"(float, default 1e-10) Constant "
"for numerical stability.")
.SetDefault(1.0e-10f);
AddAttr<float>("decay",
"(float, default 0.9) "
"Discounting factor for coming gradient.")
.SetDefault(0.9f);
AddAttr<float>("momentum", "(float, default 0.0) Constant value")
.SetDefault(0.0f);
AddComment(R"DOC(
RMSprop
MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
MomentOut = momentum * Moment +
LearningRate * Grad / sqrt(MeanSquareOut + epsilon)
ParamOut = Param - MomentOut
The original slides that proposed RMSprop: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
)DOC");
}
};
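// Illustrative scalar sketch of the update rule documented above
// (hypothetical helper, not used by the operator). Assumes <cmath> is
// available for std::sqrt; `rho` stands for the `decay` attribute.
inline void RmspropStepScalar(float& param, float& mean_square, float& moment,
                              float grad, float lr, float rho, float momentum,
                              float epsilon) {
  mean_square = rho * mean_square + (1.0f - rho) * grad * grad;
  moment = momentum * moment + lr * grad / std::sqrt(mean_square + epsilon);
  param -= moment;
}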
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL(rmsprop,
ops::RmspropOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/rmsprop_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(rmsprop,
ops::RmspropOpKernel<paddle::platform::GPUPlace, float>);
...@@ -25,22 +25,41 @@ template <typename T, int MajorType = Eigen::RowMajor, ...@@ -25,22 +25,41 @@ template <typename T, int MajorType = Eigen::RowMajor,
using EigenVector = framework::EigenVector<T, MajorType, IndexType>; using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class AddKernel : public framework::OpKernel<T> { class RmspropOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* input0 = context.Input<Tensor>("X"); auto* param_out = ctx.Output<Tensor>("ParamOut");
auto* input1 = context.Input<Tensor>("Y"); auto* moment_out = ctx.Output<Tensor>("MomentOut");
auto* output = context.Output<Tensor>("Out"); auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
output->mutable_data<T>(context.GetPlace()); auto grad = ctx.Input<Tensor>("Grad");
auto X = EigenVector<T>::Flatten(*input0); param_out->mutable_data<T>(ctx.GetPlace());
auto Y = EigenVector<T>::Flatten(*input1); moment_out->mutable_data<T>(ctx.GetPlace());
auto Z = EigenVector<T>::Flatten(*output); mean_square_out->mutable_data<T>(ctx.GetPlace());
auto place = context.GetEigenDevice<Place>(); float epsilon = ctx.Attr<float>("epsilon");
float rho = ctx.Attr<float>("decay");
float momentum = ctx.Attr<float>("momentum");
Z.device(place) = X + Y; auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
auto g = EigenVector<T>::Flatten(*grad);
auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
auto p_out = EigenVector<T>::Flatten(*param_out);
auto mom_out = EigenVector<T>::Flatten(*moment_out);
auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
auto place = ctx.GetEigenDevice<Place>();
Eigen::DSizes<int, 1> grad_dsize(grad->numel());
ms_out.device(place) = rho * ms + (1 - rho) * g * g;
mom_out.device(place) =
momentum * mom +
lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
p_out.device(place) = p - mom_out;
} }
}; };
......
...@@ -25,7 +25,7 @@ using LoDTensor = framework::LoDTensor; ...@@ -25,7 +25,7 @@ using LoDTensor = framework::LoDTensor;
void SegmentInputs(const std::vector<Scope*>& step_scopes, void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<std::string>& inlinks, const std::vector<std::string>& inlinks,
const size_t seq_len, bool infer_shape_mode) { const size_t seq_len) {
PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
for (size_t i = 0; i < inlinks.size(); ++i) { for (size_t i = 0; i < inlinks.size(); ++i) {
// global inputs // global inputs
...@@ -41,11 +41,9 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes, ...@@ -41,11 +41,9 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
for (size_t j = 0; j < seq_len; j++) { for (size_t j = 0; j < seq_len; j++) {
Tensor* step_input = Tensor* step_input =
step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>(); step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
if (!infer_shape_mode) {
// The input of operators of each step is Tensor here. // The input of operators of each step is Tensor here.
// Maybe need to modify Slice function. // Maybe need to modify Slice function.
*step_input = input->Slice<float>(j, j + 1); *step_input = input->Slice<float>(j, j + 1);
}
step_input->Resize(step_dims); step_input->Resize(step_dims);
} }
} }
...@@ -53,22 +51,20 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes, ...@@ -53,22 +51,20 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
void ConcatOutputs(const std::vector<Scope*>& step_scopes, void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<std::string>& outlinks, const std::vector<std::string>& outlinks,
const size_t seq_len, bool infer_shape_mode) { const size_t seq_len) {
for (size_t i = 0; i < outlinks.size(); i++) { for (size_t i = 0; i < outlinks.size(); i++) {
auto output_var = step_scopes[0]->parent().FindVar(outlinks[i]); auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.", PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
outlinks[i]); outlinks[i]);
LoDTensor* output = output_var->GetMutable<LoDTensor>(); LoDTensor* output = output_var->GetMutable<LoDTensor>();
if (infer_shape_mode) { auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]); PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
f::DDim step_dims = f::DDim step_dims =
step_scope_var->template GetMutable<LoDTensor>()->dims(); step_scope_var->template GetMutable<LoDTensor>()->dims();
std::vector<int64_t> dims_vec = vectorize(step_dims); std::vector<int64_t> dims_vec = vectorize(step_dims);
dims_vec.insert(dims_vec.begin(), seq_len); dims_vec.insert(dims_vec.begin(), seq_len);
output->Resize(f::make_ddim(dims_vec)); output->Resize(f::make_ddim(dims_vec));
} else {
output->mutable_data<float>(platform::CPUPlace()); output->mutable_data<float>(platform::CPUPlace());
for (size_t j = 0; j < seq_len; j++) { for (size_t j = 0; j < seq_len; j++) {
LoDTensor* step_output = LoDTensor* step_output =
...@@ -79,13 +75,11 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes, ...@@ -79,13 +75,11 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
.CopyFrom<float>(*step_output, platform::CPUPlace()); .CopyFrom<float>(*step_output, platform::CPUPlace());
} }
} }
}
} }
void LinkMemories(const std::vector<Scope*>& scopes, void LinkMemories(const std::vector<Scope*>& scopes,
const std::vector<rnn::MemoryAttr>& memories, const std::vector<rnn::MemoryAttr>& memories,
const size_t step_id, const int offset, const size_t step_id, const int offset) {
bool infer_shape_mode) {
PADDLE_ENFORCE_LT(step_id, scopes.size(), PADDLE_ENFORCE_LT(step_id, scopes.size(),
"step [%d] is out of range of step scopes' size [%d]", "step [%d] is out of range of step scopes' size [%d]",
step_id, scopes.size()); step_id, scopes.size());
...@@ -95,17 +89,14 @@ void LinkMemories(const std::vector<Scope*>& scopes, ...@@ -95,17 +89,14 @@ void LinkMemories(const std::vector<Scope*>& scopes,
step_id + offset, scopes.size(), step_id + offset, scopes.size(),
"offset [%d] is out of range, it must be less than (%d - %d)", offset, "offset [%d] is out of range, it must be less than (%d - %d)", offset,
scopes.size(), step_id); scopes.size(), step_id);
auto scope = scopes[step_id]; auto* scope = scopes[step_id];
auto linked_scope = scopes[step_id + offset]; auto* linked_scope = scopes[step_id + offset];
for (auto& attr : memories) { for (auto& attr : memories) {
auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>(); auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>(); auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
if (infer_shape_mode) {
mem->Resize(linked_mem->dims()); mem->Resize(linked_mem->dims());
} else {
mem->ShareDataWith<float>(*linked_mem); mem->ShareDataWith<float>(*linked_mem);
} }
}
} }
void InitArgument(const ArgumentName& name, Argument* arg, void InitArgument(const ArgumentName& name, Argument* arg,
...@@ -115,11 +106,11 @@ void InitArgument(const ArgumentName& name, Argument* arg, ...@@ -115,11 +106,11 @@ void InitArgument(const ArgumentName& name, Argument* arg,
arg->inlinks = op.Inputs(name.inlinks); arg->inlinks = op.Inputs(name.inlinks);
arg->outlinks = op.Outputs(name.outlinks); arg->outlinks = op.Outputs(name.outlinks);
auto boot_memories = auto& boot_memories =
is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories); is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
// attributes // attributes
auto memories = op.Attr<std::vector<std::string>>(name.memories); auto& memories = op.Attr<std::vector<std::string>>(name.memories);
auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories); auto& pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
PADDLE_ENFORCE(memories.size() == boot_memories.size(), PADDLE_ENFORCE(memories.size() == boot_memories.size(),
"the size of memories, boot_memories don't match:%d,%d", "the size of memories, boot_memories don't match:%d,%d",
......
...@@ -64,18 +64,18 @@ struct ArgumentName { ...@@ -64,18 +64,18 @@ struct ArgumentName {
*/ */
void SegmentInputs(const std::vector<Scope*>& step_scopes, void SegmentInputs(const std::vector<Scope*>& step_scopes,
const std::vector<std::string>& inlinks, const std::vector<std::string>& inlinks,
const size_t seq_len, bool infer_shape_mode); const size_t seq_len);
/** /**
* Process outputs of step nets and merge to variables. * Process outputs of step nets and merge to variables.
*/ */
void ConcatOutputs(const std::vector<Scope*>& step_scopes, void ConcatOutputs(const std::vector<Scope*>& step_scopes,
const std::vector<std::string>& outlinks, const std::vector<std::string>& outlinks,
const size_t seq_len, bool infer_shape_mode); const size_t seq_len);
void LinkMemories(const std::vector<Scope*>& step_scopes, void LinkMemories(const std::vector<Scope*>& step_scopes,
const std::vector<MemoryAttr>& memories, const size_t step_id, const std::vector<MemoryAttr>& memories, const size_t step_id,
const int offset, bool infer_shape_mode); const int offset);
void InitArgument(const ArgumentName& name, Argument* arg, void InitArgument(const ArgumentName& name, Argument* arg,
const framework::OperatorBase& op, bool is_grad = false); const framework::OperatorBase& op, bool is_grad = false);
......
...@@ -41,8 +41,8 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -41,8 +41,8 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient(); AddInput("X", "The input tensor of scale operator.");
AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); AddOutput("Out", "The output tensor of scale operator.");
AddComment(R"DOC(Scale operator AddComment(R"DOC(Scale operator
The equation is: Out = scale*X The equation is: Out = scale*X
...@@ -52,21 +52,18 @@ The equation is: Out = scale*X ...@@ -52,21 +52,18 @@ The equation is: Out = scale*X
} }
}; };
// The operator to calculate gradients of a scale operator is just the scale class ScaleGradMaker : public framework::SingleGradOpDescMaker {
// operator itself.
// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
template <typename AttrType>
class ScaleGradOp : public NetOp {
public: public:
ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs, using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) protected:
: NetOp(type, inputs, outputs, attrs) { std::unique_ptr<framework::OpDescBind> Apply() const override {
AppendOp(framework::OpRegistry::CreateOp( auto *grad_op = new framework::OpDescBind();
"scale", {{"X", {Input(framework::GradVarName("Out"))}}}, grad_op->SetType("scale");
{{"Out", {Output(framework::GradVarName("X"))}}}, grad_op->SetInput("X", OutputGrad("Out"));
{{"scale", Attr<AttrType>("scale")}})); grad_op->SetOutput("Out", InputGrad("X"));
CompleteAddOp(false); grad_op->SetAttr("scale", GetAttr("scale"));
return std::unique_ptr<framework::OpDescBind>(grad_op);
} }
}; };
...@@ -75,7 +72,7 @@ class ScaleGradOp : public NetOp { ...@@ -75,7 +72,7 @@ class ScaleGradOp : public NetOp {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad, REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
ops::ScaleGradOp<float>); ops::ScaleGradMaker);
REGISTER_OP_CPU_KERNEL(scale, REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>); ops::ScaleKernel<paddle::platform::CPUPlace, float>);
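The rewritten registration above replaces the NetOp-based ScaleGradOp with a gradient-descriptor maker: since Out = scale * X, the gradient is dX = scale * dOut, so the backward pass is just another `scale` op with the gradient tensors swapped in. A minimal numpy sketch of that identity (illustrative names, not the Paddle API):

```python
import numpy as np

def scale_forward(x, scale):
    return scale * x

def scale_grad(d_out, scale):
    # Grad(Out = scale * X)  =>  Grad(X) = scale * Grad(Out),
    # which is why the gradient of `scale` is just another `scale` op.
    return scale * d_out

x = np.random.rand(3, 4).astype("float32")
scale = 2.5
d_out = np.ones_like(x)                       # upstream gradient
d_x = scale_grad(d_out, scale)

# Numeric check: central difference of a linear map recovers `scale` exactly.
h = 1e-3
numeric = (scale_forward(x + h, scale) - scale_forward(x - h, scale)) / (2 * h)
assert np.allclose(numeric, d_x, atol=1e-3)
```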
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void ScatterCUDAKernel(const T* params, const int* indices,
T* output, size_t index_size,
size_t slice_size) {
CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
int scatter_i = indices[indices_i];
int out_i = scatter_i * slice_size + slice_i;
*(output + out_i) = *(params + i);
}
}
/**
* A thin wrapper on GPU tensors.
* Returns a new tensor updated from the source tensor, scatter-assigned
* according to index.
* input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D)
* return: output tensor
*/
template <typename T>
void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index.dims()[0];
auto src_dims = src.dims();
framework::DDim output_dims(src_dims);
output_dims[0] = index_size;
// slice size
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
int block = 512;
int n = slice_size * index_size;
int grid = (n + block - 1) / block;
ScatterCUDAKernel<T><<<
grid, block, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
p_src, p_index, p_output, index_size, slice_size);
}
} // namespace operators
} // namespace paddle
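In the new `scatter.cu.h` above, `CUDA_1D_KERNEL_LOOP` is a grid-stride loop, so a fixed launch grid covers all `index_size * slice_size` elements; element `i` of the flattened source lands at `indices[i / slice_size] * slice_size + (i % slice_size)` in the output. Note that the kernel overwrites (`dst[index[i]] = src[i]`) where the removed CPU `CPUScatterUpdate` accumulated. A serial numpy emulation of the same index arithmetic, assuming a 2-D source (a sketch, not the Paddle API):

```python
import numpy as np

def gpu_scatter_assign_reference(src, index, out_rows):
    """Serial reference for the ScatterCUDAKernel index arithmetic."""
    index_size = index.shape[0]
    slice_size = int(np.prod(src.shape[1:]))     # elements per row
    flat_src = src.reshape(-1)
    out = np.zeros((out_rows,) + src.shape[1:], dtype=src.dtype).reshape(-1)
    for i in range(index_size * slice_size):     # the grid-stride loop, serialized
        indices_i = i // slice_size              # which input row
        slice_i = i - indices_i * slice_size     # offset inside the slice
        out_i = index[indices_i] * slice_size + slice_i
        out[out_i] = flat_src[i]                 # assignment, not accumulation
    return out.reshape((out_rows,) + src.shape[1:])

src = np.arange(6, dtype=np.float32).reshape(2, 3)
index = np.array([2, 0], dtype=np.int32)
print(gpu_scatter_assign_reference(src, index, out_rows=3))
# row 0 of src goes to output row 2, row 1 of src goes to output row 0
```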
...@@ -24,67 +24,42 @@ namespace paddle { ...@@ -24,67 +24,42 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
// Implementation of CPU copy
template <typename T>
void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index,
const size_t index_size,
paddle::framework::Tensor* output) {
paddle::framework::DDim output_dims = output->dims();
for (size_t i = 0; i < index_size; ++i) {
int index_ = index[i];
paddle::framework::Tensor src_ = *src;
paddle::framework::Tensor output_ = *output;
if (index_size > 1) src_ = src->Slice<T>(i, i + 1);
if (output_dims[0] > 1) output_ = output->Slice<T>(index_, index_ + 1);
auto X = EigenVector<T>::Flatten(src_);
auto Y = EigenVector<T>::Flatten(output_);
Y = X + Y;
}
}
// Implementation of GPU scatter:
template <typename T>
void GPUScatterUpdate(const T* src, const int* index, const int slice_size,
const int index_size, T* output);
/** /**
* Return an updated tensor from the source tensor, scattered according to index: * Return an updated tensor from the source tensor, scattered according to index:
* dst[index[i]] += src[i] * dst[index[i]] = src[i]
* input[src]: type-T source Tensor * input[src]: type-T source Tensor
* input[index]: type-int index Tensor (1-D) * input[index]: type-int index Tensor (1-D)
* return: output tensor * return: output tensor
*/ */
template <typename T> template <typename T>
void ScatterUpdate(const platform::Place& place, void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const paddle::framework::Tensor* src, const Tensor& index, Tensor* output) {
const paddle::framework::Tensor* index, PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
paddle::framework::Tensor* output) {
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index->dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1);
int index_size = index->dims()[0]; int index_size = index.dims()[0];
auto src_dims = src->dims(); auto src_dims = src.dims();
auto dst_dims = output->dims(); auto dst_dims = output->dims();
const T* p_src = src.data<T>();
const int* p_index = index.data<int>();
T* p_output = output->data<T>();
// check src shape and dst shape should match // check src shape and dst shape should match
for (int i = 1; i < src_dims.size(); i++) for (int i = 1; i < src_dims.size(); i++)
PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
// slice size // slice size
size_t slice_size = 1; size_t slice_size = 1;
for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i]; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
const size_t slice_bytes = slice_size * sizeof(T);
if (platform::is_cpu_place(place)) { for (int i = 0; i < index_size; ++i) {
CPUScatterUpdate<T>(src, index->data<int>(), index_size, output); int index_ = p_index[i];
} else { memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
} }
} }
......
...@@ -97,8 +97,5 @@ Out[Index] = Ref[Index] + Updates ...@@ -97,8 +97,5 @@ Out[Index] = Ref[Index] + Updates
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad, REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
ops::ScatterGradOp); ops::ScatterGradOp);
REGISTER_OP_CPU_KERNEL(scatter, REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
ops::ScatterOpKernel<paddle::platform::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
REGISTER_OP_CPU_KERNEL(
scatter_grad,
ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gather.cu.h"
#include "paddle/operators/gather_op.h"
#include "scatter.cu.h"
namespace paddle {
namespace operators {
template <typename T>
class ScatterOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
Out->ShareDataWith<T>(*Ref);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
};
template <typename T>
class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
...@@ -23,10 +23,12 @@ namespace operators { ...@@ -23,10 +23,12 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename Place, typename T> template <typename T>
class ScatterOpKernel : public framework::OpKernel<T> { class ScatterOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *Ref = ctx.Input<Tensor>("Ref"); auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index"); auto *Index = ctx.Input<Tensor>("Index");
auto *Updates = ctx.Input<Tensor>("Updates"); auto *Updates = ctx.Input<Tensor>("Updates");
...@@ -35,14 +37,16 @@ class ScatterOpKernel : public framework::OpKernel<T> { ...@@ -35,14 +37,16 @@ class ScatterOpKernel : public framework::OpKernel<T> {
// In place output: Out = Ref, Out[Index] += Updates // In place output: Out = Ref, Out[Index] += Updates
Out->ShareDataWith<T>(*Ref); Out->ShareDataWith<T>(*Ref);
// Apply ScatterUpdate: Out[index] += Updates[:] // Apply ScatterUpdate: Out[index] += Updates[:]
ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out); ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
} }
}; };
template <typename Place, typename T> template <typename T>
class ScatterGradientOpKernel : public framework::OpKernel<T> { class ScatterGradientOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"This kernel only runs on CPU.");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref")); auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates")); auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index"); auto *Index = ctx.Input<Tensor>("Index");
...@@ -52,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> { ...@@ -52,7 +56,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
dRef->ShareDataWith<T>(*dOut); dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace()); dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index] // Gradient by Gather: dUpdates += dO[Index]
Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates); CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
} }
}; };
......
...@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) { ...@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) {
float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace()); float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
ScatterUpdate<float>(CPUPlace(), src, index, output); auto* cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext ctx(*cpu_place);
ScatterAssign<float>(ctx, *src, *index, output);
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0)); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0)); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
......
...@@ -23,17 +23,22 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -23,17 +23,22 @@ class SGDOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContextBase *ctx) const override { void InferShape(framework::InferShapeContextBase *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("param"), PADDLE_ENFORCE(ctx->HasInput("Param"),
"Input(param) of SGDOp should not be null."); "Input(Param) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("grad"), PADDLE_ENFORCE(ctx->HasInput("Grad"),
"Input(grad) of SGDOp should not be null."); "Input(Grad) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("param_out"), PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
"Output(param_out) of SGDOp should not be null."); "Input(LearningRate) of SGDOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
auto param_dim = ctx->GetInputDim("param"); "Output(ParamOut) of SGDOp should not be null.");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element");
auto param_dim = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
"Two input of SGD Op's dimension must be same."); "Two input of SGD Op's dimension must be same.");
ctx->SetOutputDim("param_out", param_dim); ctx->SetOutputDim("ParamOut", param_dim);
} }
}; };
...@@ -41,10 +46,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -41,10 +46,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("param", "input parameter"); AddInput("Param", "Input parameter");
AddInput("grad", "input gradient"); AddInput("LearningRate", "Learning rate of SGD");
AddOutput("param_out", "output parameter"); AddInput("Grad", "Input gradient");
AddAttr<float>("learning_rate", "learning rate of sgd"); AddOutput("ParamOut", "output parameter");
AddComment(R"DOC( AddComment(R"DOC(
The simplest SGD algorithm. The simplest SGD algorithm.
......
...@@ -19,28 +19,25 @@ limitations under the License. */ ...@@ -19,28 +19,25 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class SGDOpKernel : public framework::OpKernel<T> { class SGDOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto param = ctx.Input<Tensor>("param"); auto param = ctx.Input<framework::Tensor>("Param");
auto grad = ctx.Input<Tensor>("grad"); auto grad = ctx.Input<framework::Tensor>("Grad");
auto param_out = ctx.Output<Tensor>("param_out"); auto param_out = ctx.Output<framework::Tensor>("ParamOut");
float lr = ctx.Attr<float>("learning_rate"); auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
param_out->mutable_data<T>(ctx.GetPlace()); param_out->mutable_data<T>(ctx.GetPlace());
auto p = EigenVector<T>::Flatten(*param); auto p = framework::EigenVector<T>::Flatten(*param);
auto g = EigenVector<T>::Flatten(*grad); auto g = framework::EigenVector<T>::Flatten(*grad);
auto o = EigenVector<T>::Flatten(*param_out); auto o = framework::EigenVector<T>::Flatten(*param_out);
auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
auto place = ctx.GetEigenDevice<Place>(); auto place = ctx.GetEigenDevice<Place>();
o.device(place) = p - lr * g; Eigen::DSizes<int, 1> grad_dsize(grad->numel());
o.device(place) = p - lr.broadcast(grad_dsize) * g;
} }
}; };
......
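With this change the learning rate arrives as a one-element input tensor (enforced by `product(lr_dims) == 1` in InferShape) instead of a float attribute, and the kernel broadcasts it across the gradient before the update. The same computation in numpy terms, a sketch under that one-element assumption:

```python
import numpy as np

param = np.random.rand(4, 5).astype("float32")
grad = np.random.rand(4, 5).astype("float32")
lr = np.array([0.01], dtype="float32")   # shape [1], like Input("LearningRate")

# o.device(place) = p - lr.broadcast(grad_dsize) * g, in numpy:
param_out = param - lr * grad            # numpy broadcasts the 1-element lr
assert param_out.shape == param.shape
assert np.allclose(param_out, param - 0.01 * grad)
```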
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/operators/softmax_with_cross_entropy_op.h" #include "paddle/operators/softmax_with_cross_entropy_op.h"
#include <paddle/function/TensorType.h> #include <paddle/function/TensorType.h>
#include <iostream>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,13 +28,12 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -27,13 +28,12 @@ class SoftmaxWithCrossEntropyOpMaker
AddInput("Logits", AddInput("Logits",
"(Tensor, default: Tensor<float>), The unscaled log probabilities " "(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, " "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number.") "and K is the class number.");
.NotInGradient(); AddInput("Label",
AddInput(
"Label",
"(Tensor, default: Tensor<int>), The ground truth which is a 2-D " "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
"tensor. " "tensor. "
"If softLable is set to 0, Label is a Tensor<int> with shape [N x 1]. " "If softLable is set to 0, Label is a Tensor<int> with shape [N x "
"1]. "
"If softLable is set to 1, Label is a Tensor<float/double> " "If softLable is set to 1, Label is a Tensor<float/double> "
"with shape [N x K]."); "with shape [N x K].");
AddOutput( AddOutput(
...@@ -163,14 +163,33 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { ...@@ -163,14 +163,33 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
} }
}; };
class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto* grad_op = new framework::OpDescBind();
grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", Input("Label"));
grad_op->SetInput("Softmax", Output("Softmax"));
grad_op->SetInput("Loss", Output("Loss"));
grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker);
softmax_with_cross_entropy_grad, REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyOpGrad); ops::SoftmaxWithCrossEntropyOpGrad);
REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyKernel<float>); ops::SoftmaxWithCrossEntropyKernel<float>);
......
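The new SoftmaxGradMaker wires the saved forward outputs (`Softmax`, `Loss`) and their gradients into the backward op. That wiring mirrors the math: for hard labels, the gradient of the summed cross-entropy loss with respect to the logits is softmax(logits) minus the one-hot labels, so the backward pass needs the saved softmax and the labels rather than the raw logits. A numpy sketch of the hard-label case:

```python
import numpy as np

def softmax(x):
    shifted = x - x.max(axis=1, keepdims=True)   # numerically stable
    e = np.exp(shifted)
    return e / e.sum(axis=1, keepdims=True)

logits = np.random.randn(4, 3).astype("float32")
label = np.array([0, 2, 1, 2])                   # hard labels, shape [N]

s = softmax(logits)                              # saved forward output "Softmax"
d_logits = s.copy()
d_logits[np.arange(4), label] -= 1.0             # softmax - one_hot(label)
# d_logits is the gradient of the summed cross-entropy loss w.r.t. Logits
```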
...@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) { ...@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
} }
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(StridedMemcpy, GPUCrop) { TEST(StridedMemcpy, GPUCrop) {
// clang-format off // clang-format off
int src[] = { int src[] = {
......
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#include "paddle/operators/sum_op.h" #include "paddle/operators/sum_op.h"
#include <vector> #include <vector>
#include "paddle/operators/net_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -22,14 +23,15 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -22,14 +23,15 @@ class SumOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContextBase* ctx) const override { void InferShape(framework::InferShapeContextBase* ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
PADDLE_ENFORCE(!x_dims.empty(), "Input(X) of SumOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SumOp should not be null."); "Output(Out) of SumOp should not be null.");
auto in_dim = x_dims[0];
size_t N = x_dims.size(); size_t N = x_dims.size();
PADDLE_ENFORCE_GT(N, 1, "Input tensors count should be > 1."); PADDLE_ENFORCE_GT(N, 1, "Input tensors count should be > 1.");
auto in_dim = x_dims[0];
for (size_t i = 1; i < N; i++) { for (size_t i = 1; i < N; i++) {
auto dim = x_dims[i]; auto dim = x_dims[i];
PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape"); PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
...@@ -54,21 +56,26 @@ or not. But the output only shares the LoD with the first input. ...@@ -54,21 +56,26 @@ or not. But the output only shares the LoD with the first input.
} }
}; };
class SumGradOp : public framework::OperatorWithKernel { class SumGradMaker : public framework::GradOpDescMakerBase {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::GradOpDescMakerBase::GradOpDescMakerBase;
protected: std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
void InferShape(framework::InferShapeContextBase* ctx) const override { const override {
auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto x_grads = InputGrad("X");
auto x_grad_names = ctx->Outputs(framework::GradVarName("X")); std::vector<std::unique_ptr<framework::OpDescBind>> grad_ops;
size_t x_length = x_grad_names.size(); grad_ops.reserve(x_grads.size());
std::vector<framework::DDim> x_grad_dims; auto og = OutputGrad("Out");
x_grad_dims.reserve(x_length); std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
for (size_t i = 0; i < x_length; ++i) { [&og](const std::string& x_grad) {
x_grad_dims.push_back(out_grad_dims); auto* grad_op = new framework::OpDescBind();
} grad_op->SetType("scale");
ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims); grad_op->SetInput("X", og);
grad_op->SetOutput("Out", {x_grad});
grad_op->SetAttr("scale", 1.0f);
return std::unique_ptr<framework::OpDescBind>(grad_op);
});
return grad_ops;
} }
}; };
...@@ -76,7 +83,6 @@ class SumGradOp : public framework::OperatorWithKernel { ...@@ -76,7 +83,6 @@ class SumGradOp : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(sum, ops::SumOp, ops::SumOpMaker, sum_grad, ops::SumGradOp);
REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker);
REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(sum_grad,
ops::SumGradKernel<paddle::platform::CPUPlace, float>);
...@@ -14,5 +14,3 @@ limitations under the License. */ ...@@ -14,5 +14,3 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>); REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(sum_grad,
ops::SumGradKernel<paddle::platform::GPUPlace, float>);
...@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel<T> {
} }
}; };
template <typename Place, typename T>
class SumGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
auto outs = context.MultiOutput<Tensor>(framework::GradVarName("X"));
for (auto out : outs) {
out->mutable_data<T>(context.GetPlace());
}
auto place = context.GetEigenDevice<Place>();
auto in = EigenVector<T>::Flatten(*input);
for (auto out : outs) {
auto result = EigenVector<T>::Flatten(*out);
result.device(place) = in;
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
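SumGradMaker derives from GradOpDescMakerBase rather than SingleGradOpDescMaker because it fans out one descriptor per input: since Out = sum(X_i), each dX_i equals dOut, expressed as a `scale` op with `scale = 1.0`. This is also why the dedicated SumGradKernel and its CPU/GPU registrations are deleted above. The identity in numpy terms:

```python
import numpy as np

xs = [np.random.rand(2, 3).astype("float32") for _ in range(3)]
out = sum(xs)                        # forward: Out = x_0 + x_1 + x_2
d_out = np.ones_like(out)            # upstream gradient

# one scale(scale=1.0) op per input: dX_i = 1.0 * dOut
d_xs = [1.0 * d_out for _ in xs]
assert all(np.array_equal(dx, d_out) for dx in d_xs)
```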
...@@ -35,7 +35,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { ...@@ -35,7 +35,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <> template <>
Eigen::GpuDevice* Eigen::GpuDevice*
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cublas.h"
#include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/cudnn.h"
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
...@@ -61,7 +61,7 @@ class CPUDeviceContext : public DeviceContext { ...@@ -61,7 +61,7 @@ class CPUDeviceContext : public DeviceContext {
std::unique_ptr<Eigen::DefaultDevice> eigen_device_; std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
}; };
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <> template <>
struct EigenDeviceConverter<platform::GPUPlace> { struct EigenDeviceConverter<platform::GPUPlace> {
using EigenDeviceType = Eigen::GpuDevice; using EigenDeviceType = Eigen::GpuDevice;
......
...@@ -20,7 +20,7 @@ TEST(Device, Init) { ...@@ -20,7 +20,7 @@ TEST(Device, Init) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::GPUPlace;
int count = paddle::platform::GetDeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
Eigen::GpuDevice* gpu_device = Eigen::GpuDevice* gpu_device =
...@@ -34,7 +34,7 @@ TEST(Device, CUDADeviceContext) { ...@@ -34,7 +34,7 @@ TEST(Device, CUDADeviceContext) {
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
using paddle::platform::GPUPlace; using paddle::platform::GPUPlace;
int count = paddle::platform::GetDeviceCount(); int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device(); Eigen::GpuDevice* gpu_device = device_context->eigen_device();
......
...@@ -29,7 +29,7 @@ limitations under the License. */ ...@@ -29,7 +29,7 @@ limitations under the License. */
#include <cxxabi.h> // for __cxa_demangle #include <cxxabi.h> // for __cxa_demangle
#endif #endif
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cublas.h"
#include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/cudnn.h"
...@@ -113,7 +113,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -113,7 +113,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
} }
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <typename... Args> template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
...@@ -185,7 +185,7 @@ inline void throw_on_error(T e) { ...@@ -185,7 +185,7 @@ inline void throw_on_error(T e) {
std::make_exception_ptr( \ std::make_exception_ptr( \
std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \ std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} while (0) } while (false)
#define PADDLE_ENFORCE(...) \ #define PADDLE_ENFORCE(...) \
do { \ do { \
...@@ -195,7 +195,7 @@ inline void throw_on_error(T e) { ...@@ -195,7 +195,7 @@ inline void throw_on_error(T e) {
throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
__FILE__, __LINE__); \ __FILE__, __LINE__); \
} \ } \
} while (0) } while (false)
/* /*
* Some enforce helpers here, usage: * Some enforce helpers here, usage:
......
...@@ -26,11 +26,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, ...@@ -26,11 +26,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
namespace paddle { namespace paddle {
namespace platform { namespace platform {
int GetDeviceCount() { int GetCUDADeviceCount() {
int count; int count;
PADDLE_ENFORCE( PADDLE_ENFORCE(
cudaGetDeviceCount(&count), cudaGetDeviceCount(&count),
"cudaGetDeviceCount failed in paddle::platform::GetDeviceCount"); "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount");
return count; return count;
} }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stddef.h> #include <stddef.h>
...@@ -28,7 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse = ...@@ -28,7 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse =
"PADDLE_FRACTION_GPU_MEMORY_TO_USE"; "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
//! Get the total number of GPU devices in the system. //! Get the total number of GPU devices in the system.
int GetDeviceCount(); int GetCUDADeviceCount();
//! Get the current GPU device id in system. //! Get the current GPU device id in system.
int GetCurrentDeviceId(); int GetCurrentDeviceId();
......
...@@ -2,8 +2,10 @@ ...@@ -2,8 +2,10 @@
#ifdef __CUDACC__ #ifdef __CUDACC__
#define HOSTDEVICE __host__ __device__ #define HOSTDEVICE __host__ __device__
#define DEVICE __device__
#define HOST __host__ #define HOST __host__
#else #else
#define HOSTDEVICE #define HOSTDEVICE
#define DEVICE
#define HOST #define HOST
#endif #endif
...@@ -19,5 +19,7 @@ limitations under the License. */ ...@@ -19,5 +19,7 @@ limitations under the License. */
#define DISABLE_COPY_AND_ASSIGN(classname) \ #define DISABLE_COPY_AND_ASSIGN(classname) \
private: \ private: \
classname(const classname&) = delete; \ classname(const classname&) = delete; \
classname& operator=(const classname&) = delete classname(const classname&&) = delete; \
classname& operator=(const classname&) = delete; \
classname& operator=(const classname&&) = delete
#endif #endif
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <boost/config.hpp> #include <boost/config.hpp>
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
// Because boost's variadic templates have a bug on nvcc, boost will disable // Because boost's variadic templates have a bug on nvcc, boost will disable
// variadic template support when GPU is enabled on nvcc. // variadic template support when GPU is enabled on nvcc.
......
...@@ -215,7 +215,7 @@ int main(int argc, char** argv) { ...@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
uint64_t dataSize = FLAGS_dim * sizeof(real); uint64_t dataSize = FLAGS_dim * sizeof(real);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
GpuVector gpuParam(FLAGS_dim); GpuVector gpuParam(FLAGS_dim);
GpuVector gpuGrad(FLAGS_dim); GpuVector gpuGrad(FLAGS_dim);
#else #else
......
...@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) { ...@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
} }
TEST(ProtoServer, extended) { TEST(ProtoServer, extended) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
ProtoClient* client; ProtoClient* client;
if (FLAGS_rdma_tcp == "rdma") if (FLAGS_rdma_tcp == "rdma")
client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA); client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc SRCS pybind.cc exception.cc protobuf.cc
DEPS pybind python backward proto_desc DEPS pybind python backward proto_desc tensor_array
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/framework/backward.h" #include "paddle/framework/backward.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor_array.h"
#include "paddle/operators/cond_op.h" #include "paddle/operators/cond_op.h"
#include "paddle/operators/net_op.h" #include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h" #include "paddle/operators/recurrent_op.h"
...@@ -34,7 +35,7 @@ static size_t UniqueIntegerGenerator() { ...@@ -34,7 +35,7 @@ static size_t UniqueIntegerGenerator() {
} }
bool IsCompileGPU() { bool IsCompileGPU() {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
return false; return false;
#else #else
return true; return true;
...@@ -78,7 +79,7 @@ PYBIND11_PLUGIN(core) { ...@@ -78,7 +79,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<float>) .def("set", PyCPUTensorSetFromArray<float>)
.def("set", PyCPUTensorSetFromArray<int>) .def("set", PyCPUTensorSetFromArray<int>)
.def("set", PyCPUTensorSetFromArray<double>) .def("set", PyCPUTensorSetFromArray<double>)
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>) .def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>) .def("set", PyCUDATensorSetFromArray<int>)
.def("set", PyCUDATensorSetFromArray<double>) .def("set", PyCUDATensorSetFromArray<double>)
...@@ -96,7 +97,7 @@ PYBIND11_PLUGIN(core) { ...@@ -96,7 +97,7 @@ PYBIND11_PLUGIN(core) {
.def( .def(
"__init__", "__init__",
[](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
new (&instance) LoDTensor(lod); new (&instance) LoDTensor(lod);
#else #else
LoD new_lod; LoD new_lod;
...@@ -107,7 +108,7 @@ PYBIND11_PLUGIN(core) { ...@@ -107,7 +108,7 @@ PYBIND11_PLUGIN(core) {
}) })
.def("set_lod", .def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
self.set_lod(lod); self.set_lod(lod);
#else #else
LoD new_lod; LoD new_lod;
...@@ -117,7 +118,7 @@ PYBIND11_PLUGIN(core) { ...@@ -117,7 +118,7 @@ PYBIND11_PLUGIN(core) {
#endif #endif
}) })
.def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> { .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
return self.lod(); return self.lod();
#else #else
auto lod = self.lod(); auto lod = self.lod();
...@@ -143,6 +144,13 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -143,6 +144,13 @@ All parameter, weight, gradient are variables in Paddle.
.def("set_int", .def("set_int",
[](Variable &var, int val) -> void { *var.GetMutable<int>() = val; }) [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
.def("get_int", [](const Variable &var) -> int { return var.Get<int>(); }) .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
.def("is_float", [](const Variable &var) { return var.IsType<float>(); })
.def("set_float",
[](Variable &var, float val) -> void {
*var.GetMutable<float>() = val;
})
.def("get_float",
[](const Variable &var) -> float { return var.Get<float>(); })
.def("get_tensor", .def("get_tensor",
[](Variable &self) -> LoDTensor * { [](Variable &self) -> LoDTensor * {
return self.GetMutable<LoDTensor>(); return self.GetMutable<LoDTensor>();
...@@ -196,7 +204,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -196,7 +204,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create", .def_static("create",
[](paddle::platform::GPUPlace& place) [](paddle::platform::GPUPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("GPUPlace is not supported in CPU device."); PADDLE_THROW("GPUPlace is not supported in CPU device.");
#else #else
return new paddle::platform::CUDADeviceContext(place); return new paddle::platform::CUDADeviceContext(place);
...@@ -223,6 +231,21 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -223,6 +231,21 @@ All parameter, weight, gradient are variables in Paddle.
desc.InitializationErrorString()); desc.InitializationErrorString());
return OpRegistry::CreateOp(desc); return OpRegistry::CreateOp(desc);
}) })
.def_static("infer_shape",
[](OpDescBind &op_desc, BlockDescBind &block) {
auto op = OpRegistry::CreateOp(*op_desc.Proto());
auto *op_with_kernel =
dynamic_cast<OperatorWithKernel *>(op.get());
if (op_with_kernel != nullptr) {
auto ctx = CompileTimeInferShapeContext(op_desc, block);
op_with_kernel->InferShape(&ctx);
} else {
PADDLE_THROW(
"OP(%s) is not type of OperatorWithKernel, "
"should not call this function",
op_desc.Type());
}
})
.def("backward", .def("backward",
[](const OperatorBase &forwardOp, [](const OperatorBase &forwardOp,
const std::unordered_set<std::string> &no_grad_vars) { const std::unordered_set<std::string> &no_grad_vars) {
...@@ -264,6 +287,56 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -264,6 +287,56 @@ All parameter, weight, gradient are variables in Paddle.
self->CompleteAddOp(); self->CompleteAddOp();
}); });
py::class_<framework::TensorArray>(m, "TensorArray")
.def("__init__",
[](TensorArray &instance) { new (&instance) TensorArray(); })
.def("read",
[](TensorArray &self, size_t index) { return self.Read(index); })
.def("write", [](TensorArray &self, size_t index,
LoDTensor &value) { self.Write(index, value); })
.def("write_shared",
[](TensorArray &self, size_t index, const LoDTensor &value) {
self.WriteShared(index, value);
})
.def("size", [](TensorArray &self) { return self.size(); })
.def("pack",
[](TensorArray &self, size_t level,
const std::vector<std::vector<size_t>> &meta_info,
const std::vector<std::vector<size_t>> &lod) {
std::vector<DySeqMeta> meta;
for (auto &info : meta_info) {
PADDLE_ENFORCE_EQ(info.size(), 3UL);
meta.emplace_back(info[0], info[1], info[2]);
}
#ifndef PADDLE_WITH_CUDA
return self.Pack(level, meta, lod);
#else
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return self.Pack(level, meta, new_lod);
#endif
})
.def("unpack",
[](TensorArray &self, const LoDTensor &source, int level,
bool length_descend) {
auto metas = self.Unpack(source, level, length_descend);
std::vector<std::vector<size_t>> meta_info;
for (auto meta : metas) {
meta_info.emplace_back(
std::vector<size_t>({meta.begin, meta.end, meta.ori_idx}));
}
return meta_info;
})
.def("stack", [](TensorArray &self) { return self.Stack(); })
.def("unstack",
[](TensorArray &self, const LoDTensor &source) {
return self.Unstack(source);
})
.def("unstack_shared", [](TensorArray &self, const LoDTensor &source) {
return self.UnstackShared(source);
});
// recurrent_op // recurrent_op
py::class_<operators::RecurrentOp, OperatorBase>(m, "RecurrentOp") py::class_<operators::RecurrentOp, OperatorBase>(m, "RecurrentOp")
.def_static( .def_static(
......
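The new pybind block exposes `framework::TensorArray` to Python. Below is a hedged usage sketch based only on the method names bound above (`write`/`read`/`size` plus `stack` for dense conversion); the `Scope` and tensor helpers follow patterns visible elsewhere in this diff and should be read as assumptions, not a verified API:

```python
import numpy as np
import paddle.v2.framework.core as core

place = core.CPUPlace()
scope = core.Scope()

ta = core.TensorArray()
for step in range(2):
    # new_var/get_tensor/set_dims/set mirror the op_test helpers shown in
    # this diff; treat the exact names as assumptions.
    t = scope.new_var("step%d" % step).get_tensor()
    arr = np.full((3, 4), float(step), dtype="float32")
    t.set_dims(arr.shape)
    t.set(arr, place)
    ta.write(step, t)        # store time step `step`

assert ta.size() == 2
first = ta.read(0)           # fetch one step back out
stacked = ta.stack()         # concatenate all steps into a single LoDTensor
```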
...@@ -106,7 +106,7 @@ void PyCPUTensorSetFromArray( ...@@ -106,7 +106,7 @@ void PyCPUTensorSetFromArray(
std::memcpy(dst, array.data(), sizeof(T) * array.size()); std::memcpy(dst, array.data(), sizeof(T) * array.size());
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
template <typename T> template <typename T>
void PyCUDATensorSetFromArray( void PyCUDATensorSetFromArray(
framework::Tensor &self, framework::Tensor &self,
......
...@@ -29,7 +29,7 @@ int main(int argc, char** argv) { ...@@ -29,7 +29,7 @@ int main(int argc, char** argv) {
initMain(argc, argv); initMain(argc, argv);
initPython(argc, argv); initPython(argc, argv);
string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir); string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir);
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
FLAGS_use_gpu = false; FLAGS_use_gpu = false;
#endif #endif
auto config = std::make_shared<TrainerConfigHelper>(confFile); auto config = std::make_shared<TrainerConfigHelper>(confFile);
......
...@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) { ...@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
exit(0); exit(0);
#endif #endif
paddle::initMain(argc, argv); paddle::initMain(argc, argv);
......
...@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) { ...@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
FLAGS_local = local; FLAGS_local = local;
FLAGS_ports_num_for_sparse = 5; FLAGS_ports_num_for_sparse = 5;
for (bool useGpu : {false, true}) { for (bool useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) continue; if (useGpu) continue;
#endif #endif
FLAGS_parallel_nn = useGpu; FLAGS_parallel_nn = useGpu;
...@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) { ...@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) {
FLAGS_local = local; FLAGS_local = local;
FLAGS_ports_num_for_sparse = 5; FLAGS_ports_num_for_sparse = 5;
for (bool useGpu : {false, true}) { for (bool useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
if (useGpu) continue; if (useGpu) continue;
#endif #endif
FLAGS_parallel_nn = useGpu; FLAGS_parallel_nn = useGpu;
......
...@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile, ...@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile,
TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); } TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); } TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
TEST(checkGradient, multiGpu) { TEST(checkGradient, multiGpu) {
...@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } ...@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
TEST(checkGradient, chunk) { TEST(checkGradient, chunk) {
checkGradientTest(configFile3, false, false); checkGradientTest(configFile3, false, false);
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
checkGradientTest(configFile3, true, true); checkGradientTest(configFile3, true, true);
#endif #endif
} }
......
...@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile, ...@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
// 1. test trainer (cpu, gpu). // 1. test trainer (cpu, gpu).
TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); } TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); } TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); } TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
...@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) { ...@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) {
#endif #endif
// 2. test average_window. // 2. test average_window.
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(average_window, gpu) { TEST(average_window, gpu) {
trainerOnePassTest(configFile1, true, false, 4, 0.01); trainerOnePassTest(configFile1, true, false, 4, 0.01);
} }
...@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) { ...@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true); checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
} }
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
TEST(checkRemoteUpdater, gpuTrainer) { TEST(checkRemoteUpdater, gpuTrainer) {
checkRemoteParameterUpdaterTest(configFile1, true, false); checkRemoteParameterUpdaterTest(configFile1, true, false);
} }
......
...@@ -113,7 +113,7 @@ void testGeneration(const string& configFile, ...@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
TEST(RecurrentGradientMachine, test_generation) { TEST(RecurrentGradientMachine, test_generation) {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
const auto useGpuConfs = {false}; const auto useGpuConfs = {false};
#else #else
const auto useGpuConfs = {true, false}; const auto useGpuConfs = {true, false};
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "Flags.h" #include "Flags.h"
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, false, "Only support CPU training");
#else #else
DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
......
...@@ -218,7 +218,7 @@ protected: ...@@ -218,7 +218,7 @@ protected:
* *d2* is the peer device to which direct access is enabled for the d1 device. * *d2* is the peer device to which direct access is enabled for the d1 device.
*/ */
inline void enablePeerAccess(int d1, int d2) { inline void enablePeerAccess(int d1, int d2) {
#ifndef PADDLE_ONLY_CPU #ifdef PADDLE_WITH_CUDA
if (hl_device_can_access_peer(d1, d2)) { if (hl_device_can_access_peer(d1, d2)) {
SetDevice dev(d1); SetDevice dev(d1);
hl_device_enable_peer_access(d2); hl_device_enable_peer_access(d2);
......
...@@ -48,7 +48,7 @@ void printVersion(std::ostream& os); ...@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
* @return return true if paddle compiled with GPU * @return return true if paddle compiled with GPU
*/ */
constexpr bool isWithGpu() { constexpr bool isWithGpu() {
#ifdef PADDLE_ONLY_CPU #ifndef PADDLE_WITH_CUDA
return false; return false;
#else #else
return true; return true;
......
...@@ -142,6 +142,7 @@ __all__ = [ ...@@ -142,6 +142,7 @@ __all__ = [
'img_pool3d_layer', 'img_pool3d_layer',
'scale_shift_layer', 'scale_shift_layer',
'img_conv3d_layer', 'img_conv3d_layer',
'resize_layer',
] ]
...@@ -250,6 +251,8 @@ class LayerType(object): ...@@ -250,6 +251,8 @@ class LayerType(object):
KMAX_SEQ_SCORE = 'kmax_seq_score' KMAX_SEQ_SCORE = 'kmax_seq_score'
SCALE_SHIFT_LAYER = 'scale_shift' SCALE_SHIFT_LAYER = 'scale_shift'
RESIZE = 'resize'
@staticmethod @staticmethod
def is_layer_type(type_name): def is_layer_type(type_name):
""" """
...@@ -6932,3 +6935,23 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None): ...@@ -6932,3 +6935,23 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
bias=ParamAttr.to_bias(bias_attr)) bias=ParamAttr.to_bias(bias_attr))
return LayerOutput( return LayerOutput(
name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size) name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
@wrap_name_default("resize")
def resize_layer(input, size, name=None):
"""
The resize layer resizes the input matrix with a shape of [Height, Width]
into the output matrix with a shape of [Height x Width / size, size],
where size is the parameter of this layer indicating the output dimension.
:param input: The input to this layer.
:type input: LayerOutput.
:param name: The name of this layer. It is optional.
:type name: basestring
:param size: The resized output dimension of this layer.
:type size: int
:return: A LayerOutput object.
:rtype: LayerOutput
"""
Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
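`resize_layer` reinterprets each [Height, Width] matrix as [Height x Width / size, size]; in numpy terms it is a plain reshape, sketched below (the layer itself does this per sample inside Paddle):

```python
import numpy as np

height, width, size = 4, 300, 150
x = np.random.rand(height, width).astype("float32")

# [Height, Width] -> [Height * Width / size, size]
y = x.reshape(height * width // size, size)
assert y.shape == (8, 150)
assert np.array_equal(y.ravel(), x.ravel())   # same data, new view
```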
...@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la ...@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
test_conv3d_layer test_deconv3d_layer test_BatchNorm3D) test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
export whole_configs=(test_split_datasource) export whole_configs=(test_split_datasource)
type: "nn"
layers {
name: "input"
type: "data"
size: 300
active_type: ""
}
layers {
name: "__resize_0__"
type: "resize"
size: 150
active_type: ""
inputs {
input_layer_name: "input"
}
}
input_layer_names: "input"
output_layer_names: "__resize_0__"
sub_models {
name: "root"
layer_names: "input"
layer_names: "__resize_0__"
input_layer_names: "input"
output_layer_names: "__resize_0__"
is_recurrent_layer_group: false
}
from paddle.trainer_config_helpers import *
data = data_layer(name='input', size=300)
resized = resize_layer(input=data, size=150)
outputs(resized)
...@@ -46,12 +46,17 @@ def create_op(scope, op_type, inputs, outputs, attrs): ...@@ -46,12 +46,17 @@ def create_op(scope, op_type, inputs, outputs, attrs):
def set_input(scope, op, inputs, place): def set_input(scope, op, inputs, place):
def __set_input__(var_name, var): def __set_input__(var_name, var):
if isinstance(var, tuple) or isinstance(var, np.ndarray):
tensor = scope.find_var(var_name).get_tensor() tensor = scope.find_var(var_name).get_tensor()
if isinstance(var, tuple): if isinstance(var, tuple):
tensor.set_lod(var[1]) tensor.set_lod(var[1])
var = var[0] var = var[0]
tensor.set_dims(var.shape) tensor.set_dims(var.shape)
tensor.set(var, place) tensor.set(var, place)
elif isinstance(var, float):
scope.find_var(var_name).set_float(var)
elif isinstance(var, int):
scope.find_var(var_name).set_int(var)
for in_name, in_dup in Operator.get_op_inputs(op.type()): for in_name, in_dup in Operator.get_op_inputs(op.type()):
if in_name in inputs: if in_name in inputs:
......
...@@ -48,6 +48,21 @@ class TestTanh(OpTest): ...@@ -48,6 +48,21 @@ class TestTanh(OpTest):
self.check_grad(['X'], 'Y', max_relative_error=0.007) self.check_grad(['X'], 'Y', max_relative_error=0.007)
class TestTanhShrink(OpTest):
def setUp(self):
self.op_type = "tanh_shrink"
self.inputs = {
'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
}
self.outputs = {'Y': self.inputs['X'] - np.tanh(self.inputs['X'])}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Y', max_relative_error=0.008)
class TestSqrt(OpTest): class TestSqrt(OpTest):
def setUp(self): def setUp(self):
self.op_type = "sqrt" self.op_type = "sqrt"
...@@ -122,6 +137,23 @@ class TestBRelu(OpTest): ...@@ -122,6 +137,23 @@ class TestBRelu(OpTest):
self.check_grad(['X'], 'Y', max_relative_error=0.02) self.check_grad(['X'], 'Y', max_relative_error=0.02)
class TestLeakyRelu(OpTest):
def setUp(self):
self.op_type = "leaky_relu"
alpha = 0.02
self.attrs = {'alpha': alpha}
self.inputs = {'X': np.random.uniform(-3, 3, [4, 4]).astype("float32")}
self.outputs = {
'Y': np.maximum(self.inputs['X'], alpha * self.inputs['X'])
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Y', max_relative_error=0.007)
class TestSoftRelu(OpTest): class TestSoftRelu(OpTest):
def setUp(self): def setUp(self):
self.op_type = "soft_relu" self.op_type = "soft_relu"
......
import unittest
import numpy as np
from op_test import OpTest
class TestAdadeltaOp1(OpTest):
def setUp(self):
self.op_type = "adadelta"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
# The squared gradient is positive
avg_squared_grad = np.random.random((102, 105)).astype("float32")
# The squared update is positive
avg_squared_update = np.random.random((102, 105)).astype("float32")
rho = 0.95
epsilon = 1e-6
self.inputs = {
'Param': param,
'Grad': grad,
'AvgSquaredGrad': avg_squared_grad,
'AvgSquaredUpdate': avg_squared_update
}
self.attrs = {'rho': rho, 'epsilon': epsilon}
avg_squared_grad_out = rho * avg_squared_grad + \
(1 - rho) * np.square(grad)
update = -np.multiply(
np.sqrt(
np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
epsilon)), grad)
avg_squared_update_out = rho * avg_squared_update + \
(1 - rho) * np.square(update)
param_out = param + update
self.outputs = {
'ParamOut': param_out,
'AvgSquaredGradOut': avg_squared_grad_out,
'AvgSquaredUpdateOut': avg_squared_update_out
}
def test_check_output(self):
self.check_output()
class TestAdadeltaOp2(OpTest):
'''Test Adadelta op with default attribute values
'''
def setUp(self):
self.op_type = "adadelta"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
# The squared gradient is positive
avg_squared_grad = np.random.random((102, 105)).astype("float32")
# The squared update is positive
avg_squared_update = np.random.random((102, 105)).astype("float32")
rho = 0.95
epsilon = 1e-6
self.inputs = {
'Param': param,
'Grad': grad,
'AvgSquaredGrad': avg_squared_grad,
'AvgSquaredUpdate': avg_squared_update
}
avg_squared_grad_out = rho * avg_squared_grad + \
(1 - rho) * np.square(grad)
update = -np.multiply(
np.sqrt(
np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
epsilon)), grad)
avg_squared_update_out = rho * avg_squared_update + \
(1 - rho) * np.square(update)
param_out = param + update
self.outputs = {
'ParamOut': param_out,
'AvgSquaredGradOut': avg_squared_grad_out,
'AvgSquaredUpdateOut': avg_squared_update_out
}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
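Both test cases implement the Adadelta rule of Zeiler (2012); written out, with rho the decay and epsilon the stabilizer, the updates the numpy code above computes are:

```latex
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta x_t &= -\sqrt{\frac{E[\Delta x^2]_{t-1} + \epsilon}{E[g^2]_t + \epsilon}}\; g_t \\
E[\Delta x^2]_t &= \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, (\Delta x_t)^2 \\
x_{t+1} &= x_t + \Delta x_t
\end{aligned}
```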
import unittest
import numpy as np
from op_test import OpTest
class TestAdagradOp1(OpTest):
''' Test Adagrad operator with explicit attributes
'''
def setUp(self):
self.op_type = "adagrad"
param = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
lr = 0.01
epsilon = 1e-8
self.inputs = {
'Param': param,
'Grad': grad,
'Moment': moment,
'LearningRate': np.array([lr]).astype("float32")
}
self.attrs = {'epsilon': epsilon}
moment_out = moment + grad * grad
param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
def test_check_output(self):
self.check_output()
class TestAdagradOp2(OpTest):
''' Test Adagrad operator with default attributes
'''
def setUp(self):
self.op_type = "adagrad"
param = np.random.random((123, 321)).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
lr = 0.01
epsilon = 1e-6
self.inputs = {
'Param': param,
'Grad': grad,
'Moment': moment,
'LearningRate': np.array([lr]).astype("float32")
}
self.attrs = {'epsilon': epsilon}
moment_out = moment + grad * grad
param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
class TestAddOp(OpTest):
def setUp(self):
self.op_type = "add"
self.inputs = {
'X': np.random.random((102, 105)).astype("float32"),
'Y': np.random.random((102, 105)).astype("float32")
}
self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
...@@ -15,7 +15,7 @@ class PySimpleCond(object):
         for i in range(1, 10, 2):
             array[i] = 0
         self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1))
+        self.x = np.ones(shape=(10, 1)).astype("float32")

     def forward(self):
         self.index_t = np.where(self.cond == 1)
...@@ -112,7 +112,4 @@ class TestCondOp(unittest.TestCase):

 if __name__ == "__main__":
-    exit(
-        0
-    )  # FIXME(yuyang18): Since infer_shape has been removed, cond op may error
     unittest.main()
import unittest
import numpy as np
import paddle.v2.framework.core as core
from op_test import get_numeric_gradient
from op_test import create_op
class GetNumericGradientTest(unittest.TestCase):
def test_add_op(self):
x = np.random.random((10, 1)).astype("float32")
y = np.random.random((10, 1)).astype("float32")
z = x + y
scope = core.Scope()
add_op = create_op(scope, "add", {'X': x, 'Y': y}, {'Out': z}, dict())
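        # Out = X + Y, so dOut/dX is the identity map; every entry of the
        # finite-difference gradient computed below should be close to 1.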
arr = get_numeric_gradient(scope, add_op, {'X': x,
'Y': y}, 'X', ['Out'])
self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)
def test_softmax_op(self):
def stable_softmax(x):
"""Compute the softmax of vector x in a numerically stable way."""
shiftx = x - np.max(x)
exps = np.exp(shiftx)
return exps / np.sum(exps)
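        # Analytic softmax gradient: with y = softmax(x), computed row by row,
        #   dx_i = y_i * (dy_i - sum_j y_j * dy_j)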
def label_softmax_grad(Y, dY):
dX = Y * 0.0
for i in range(Y.shape[0]):
d = np.dot(Y[i, :], dY[i, :])
dX[i, :] = Y[i, :] * (dY[i, :] - d)
return dX
X = np.random.random((2, 2)).astype("float32")
Y = np.apply_along_axis(stable_softmax, 1, X)
dY = np.ones(Y.shape)
dX = label_softmax_grad(Y, dY)
scope = core.Scope()
softmax_op = create_op(scope, "softmax", {"X": X}, {"Y": Y}, dict())
arr = get_numeric_gradient(scope, softmax_op, {"X": X}, "X", "Y")
        np.testing.assert_almost_equal(arr, dX, decimal=2)
if __name__ == "__main__":
unittest.main()
import unittest
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator
class TestInferShape(unittest.TestCase):
def test_sum_op(self):
prog = core.ProgramDesc.__create_program_desc__()
self.assertIsNotNone(prog)
block = prog.block(0)
self.assertIsNotNone(block)
shape = [10, 20]
# prepare input/output
x1 = block.new_var("x1")
x1.set_shape(shape)
x2 = block.new_var("x2")
x2.set_shape(shape)
out = block.new_var("out")
# prepare the operator
sum_op_desc = block.append_op()
sum_op_desc.set_type("sum")
sum_op_desc.set_input("X", ["x1", "x2"])
sum_op_desc.set_output("Out", ["out"])
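        # sum requires all of its inputs to share one shape, so InferShape
        # should propagate [10, 20] from x1/x2 to out.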
core.Operator.infer_shape(sum_op_desc, block)
self.assertEqual(out.shape(), shape)
def test_mul_op(self):
prog = core.ProgramDesc.__create_program_desc__()
self.assertIsNotNone(prog)
block = prog.block(0)
self.assertIsNotNone(block)
x_shape = [10, 20]
y_shape = [20, 30]
# prepare input/output
x1 = block.new_var("x")
x1.set_shape(x_shape)
x2 = block.new_var("y")
x2.set_shape(y_shape)
out = block.new_var("out")
# prepare the operator
mul_op_desc = block.append_op()
mul_op_desc.set_type("mul")
mul_op_desc.set_input("X", ["x"])
mul_op_desc.set_input("Y", ["y"])
mul_op_desc.set_output("Out", ["out"])
mul_op_desc.set_attr("x_num_col_dims", 1)
mul_op_desc.set_attr("y_num_col_dims", 1)
core.Operator.infer_shape(mul_op_desc, block)
self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
if __name__ == '__main__':
unittest.main()
...@@ -14,8 +14,8 @@ def tanh_np(x):
 class LstmUnitTest(OpTest):
     def setUp(self):
         self.op_type = "lstm_unit"
-        x_np = np.random.normal(size=(5, 16)).astype("float32")
-        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        x_np = np.random.normal(size=(5, 16)).astype("float64")
+        c_np = np.random.normal(size=(5, 4)).astype("float64")
         i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
         forget_bias_np = 0.
         self.attrs = {'forget_bias': 0.}
...@@ -31,7 +31,7 @@ class LstmUnitTest(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+        self.check_grad(['X', 'C_prev'], ['C', 'H'])

 if __name__ == "__main__":
...
...@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add", X="X", Y="Y", Out="Out")
+        op1 = Operator("sum", X=["X", "Y"], Out="Out")
         net.append_op(op1)

         net2 = core.Net.create()
...@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
 Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
 Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
 Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
...
...@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add", X="a", Y="b", Out="z")
+        add_op = op.Operator("sum", X=["a", "b"], Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
                          str(add_op))
...
import unittest
import numpy as np
from op_test import OpTest
def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
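    # Standard pooling output size, using Python 2 integer division:
    #   out_size = (in_size - ksize + 2 * padding) / stride + 1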
H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
c_start = np.max((j * strides[1] - paddings[1], 0))
c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
x_masked = x[:, :, r_start:r_end, c_start:c_end]
out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
return out
def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
c_start = np.max((j * strides[1] - paddings[1], 0))
c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
x_masked = x[:, :, r_start:r_end, c_start:c_end]
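            # Average over the clipped window, so border windows are
            # divided by their actual area rather than ksize[0] * ksize[1].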
out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
(r_end - r_start) * (c_end - c_start))
return out
class TestPool2d_Op(OpTest):
def setUp(self):
self.initTestCase()
input = np.random.random(self.shape).astype("float32")
output = self.pool2D_forward_naive(input, self.ksize, self.strides,
self.paddings, self.global_pool)
self.inputs = {'X': input}
self.attrs = {
'strides': self.strides,
'paddings': self.paddings,
'ksize': self.ksize,
'poolingType': self.pool_type,
'globalPooling': self.global_pool,
}
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
if self.pool_type != "max":
self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
def initTestCase(self):
self.global_pool = True
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 5, 5]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
class TestCase1(TestPool2d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
class TestCase2(TestPool2d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 1]
class TestCase3(TestPool2d_Op):
def initTestCase(self):
self.global_pool = True
self.op_type = "pool2d"
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 5, 5]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
class TestCase4(TestPool2d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
class TestCase5(TestPool2d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 1]
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from op_test import OpTest
def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0, 0], global_pool=0):
N, C, D, H, W = x.shape
if global_pool == 1:
ksize = [D, H, W]
D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in xrange(H_out):
            h_start = np.max((i * strides[1] - paddings[1], 0))
            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
            for j in xrange(W_out):
                w_start = np.max((j * strides[2] - paddings[2], 0))
                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
return out
def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0, 0], global_pool=0):
N, C, D, H, W = x.shape
if global_pool == 1:
ksize = [D, H, W]
D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in xrange(H_out):
            h_start = np.max((i * strides[1] - paddings[1], 0))
            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
            for j in xrange(W_out):
                w_start = np.max((j * strides[2] - paddings[2], 0))
                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
(d_end - d_start) * (h_end - h_start) * (w_end - w_start))
return out
class TestPool3d_Op(OpTest):
def setUp(self):
self.initTestCase()
input = np.random.random(self.shape).astype("float32")
output = self.pool3D_forward_naive(input, self.ksize, self.strides,
self.paddings, self.global_pool)
self.inputs = {'X': input}
self.attrs = {
'strides': self.strides,
'paddings': self.paddings,
'ksize': self.ksize,
'poolingType': self.pool_type,
'globalPooling': self.global_pool,
}
self.outputs = {'Out': output}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
if self.pool_type != "max":
self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
def initTestCase(self):
self.global_pool = True
self.op_type = "pool3d"
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
self.shape = [2, 3, 5, 5, 5]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
class TestCase1(TestPool3d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool3d"
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
self.shape = [2, 3, 7, 7, 7]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
class TestCase2(TestPool3d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool3d"
self.pool_type = "avg"
self.pool3D_forward_naive = avg_pool3D_forward_naive
self.shape = [2, 3, 7, 7, 7]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 1, 1]
class TestCase3(TestPool3d_Op):
def initTestCase(self):
self.global_pool = True
self.op_type = "pool3d"
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
self.shape = [2, 3, 5, 5, 5]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
class TestCase4(TestPool3d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool3d"
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
self.shape = [2, 3, 7, 7, 7]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [0, 0, 0]
class TestCase5(TestPool3d_Op):
def initTestCase(self):
self.global_pool = False
self.op_type = "pool3d"
self.pool_type = "max"
self.pool3D_forward_naive = max_pool3D_forward_naive
self.shape = [2, 3, 7, 7, 7]
self.ksize = [3, 3, 3]
self.strides = [1, 1, 1]
self.paddings = [1, 1, 1]
if __name__ == '__main__':
unittest.main()
...@@ -16,14 +16,17 @@ class PySimpleRNN(object):
     '''

     def __init__(self, input_dim=30, batch_size=50, weight_dim=15, sent_len=11):
-        self.x = np.random.normal(size=(sent_len, batch_size, input_dim))
-        self.W = np.random.normal(size=(input_dim, input_dim))
-        self.U = np.random.normal(size=(input_dim, input_dim))
-        self.h_boot = np.random.normal(size=(batch_size, input_dim))
+        self.x = np.random.normal(size=(sent_len, batch_size,
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")

         # memories
         self.mems = [
-            np.zeros(shape=(batch_size, input_dim)) for i in range(sent_len)
+            np.zeros(shape=(batch_size, input_dim)).astype("float32")
+            for i in range(sent_len)
         ]

     def forward(self):
...@@ -36,7 +39,7 @@ class PySimpleRNN(object):
         return [self.x[i] for i in range(self.x.shape[0])]

     def concat_outputs(self):
-        return np.array(self.mems)
+        return np.array(self.mems).astype("float32")

     def step(self, step_id, x):
         '''
...@@ -47,8 +50,8 @@ class PySimpleRNN(object):
             pre_mem = self.mems[step_id - 1]
         else:
             pre_mem = self.h_boot
-        xW = np.matmul(x, self.W)
-        hU = np.matmul(pre_mem, self.U)
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")

         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
...@@ -102,7 +105,8 @@ class RecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        return np.array(self.scope.find_var("h@mem").get_tensor())
+        return np.array(self.scope.find_var("h@mem").get_tensor()).astype(
+            "float32")

     def create_global_variables(self):
         # create inlink
...@@ -142,7 +146,7 @@ class RecurrentOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@mem")

         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
...@@ -179,7 +183,7 @@ class RecurrentGradientOpTest(unittest.TestCase):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")

         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
...@@ -197,7 +201,4 @@ class RecurrentGradientOpTest(unittest.TestCase):

 if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(yuyang18): InferShape has been removed, this unittest may error
     unittest.main()
import unittest
import numpy as np
from op_test import OpTest
class TestRmspropOp1(OpTest):
''' Test RMSProp with explicit inputs
'''
def setUp(self):
self.op_type = "rmsprop"
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32")
learning_rate = np.array([0.01]).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1e-6
decay = 0.9
momentum = 0.0
self.inputs = {
'Param': param,
'MeanSquare': mean_square,
'LearningRate': learning_rate,
'Grad': grad,
'Moment': moment,
}
self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum}
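        # Reference RMSProp update, mirrored in NumPy:
        #   ms_t    = decay * ms_{t-1} + (1 - decay) * g_t^2
        #   mom_t   = momentum * mom_{t-1} + lr * g_t / sqrt(ms_t + eps)
        #   param_t = param_{t-1} - mom_t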
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out
self.outputs = {
'ParamOut': param_out,
'MomentOut': moment_out,
'MeanSquareOut': ms_out
}
def test_check_output(self):
self.check_output()
class TestRmspropOp2(OpTest):
    '''Test RMSProp with default values for attributes
'''
def setUp(self):
self.op_type = "rmsprop"
param = np.random.random((123, 321)).astype("float32")
mean_square = np.random.random((123, 321)).astype("float32")
learning_rate = np.array([0.01]).astype("float32")
grad = np.random.random((123, 321)).astype("float32")
moment = np.zeros((123, 321)).astype("float32")
epsilon = 1.0e-10
decay = 0.9
momentum = 0.0
self.inputs = {
'Param': param,
'MeanSquare': mean_square,
'LearningRate': learning_rate,
'Grad': grad,
'Moment': moment,
}
ms_out = decay * mean_square + (1 - decay) * grad * grad
moment_out = momentum * moment + \
learning_rate * grad / np.sqrt(ms_out + epsilon)
param_out = param - moment_out
self.outputs = {
'ParamOut': param_out,
'MomentOut': moment_out,
'MeanSquareOut': ms_out
}
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
...@@ -10,7 +10,7 @@ class TestScatterOp(OpTest):
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
-        output_np[index_np] += updates_np
+        output_np[index_np] = updates_np
         self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
...@@ -18,7 +18,7 @@ class TestScatterOp(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(['Updates', 'Ref'], 'Out', in_place=True)
+        self.check_grad(['Updates'], 'Out', in_place=True)

 if __name__ == "__main__":
...
...@@ -8,11 +8,10 @@ class TestSGDOp(OpTest):
         self.op_type = "sgd"
         w = np.random.random((102, 105)).astype("float32")
         g = np.random.random((102, 105)).astype("float32")
-        lr = 0.1
-        self.inputs = {'param': w, 'grad': g}
-        self.attrs = {'learning_rate': lr}
-        self.outputs = {'param_out': w - lr * g}
+        lr = np.array([0.1]).astype("float32")
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.outputs = {'ParamOut': w - lr * g}

     def test_check_output(self):
         self.check_output()
...
import logging
import paddle.v2.framework.core as core
import unittest
import numpy as np
class TestTensorArray(unittest.TestCase):
def setUp(self):
self.ta = core.TensorArray()
self.batch_size = 10
self.dim = 2
# create a LoDTensor
self.scope = core.Scope()
var = self.scope.new_var("test_tensor")
self.place = core.CPUPlace()
tensor = var.get_tensor()
tensor.set_dims([self.batch_size, self.dim])
tensor.alloc_float(self.place)
tensor_array = np.array(tensor)
        for i in range(self.batch_size):
            tensor_array[i, 0] = i
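        # Level-0 LoD: the batch holds three sequences, occupying rows
        # [0, 2), [2, 5) and [5, 10) of the tensor.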
lod_py = [[0, 2, 5, 10]]
lod_tensor = core.LoDTensor(lod_py)
lod_tensor.set(tensor_array, self.place)
self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]]
self.tensor = lod_tensor
def test_unstack(self):
self.ta.unstack(self.tensor)
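        # unstack splits the tensor along axis 0, one TensorArray entry per
        # row, so the array size should equal the batch dimension.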
self.assertEqual(self.tensor.get_dims()[0], self.ta.size())
def test_read(self):
self.ta.unstack(self.tensor)
for i in range(self.batch_size):
tensor = self.ta.read(i)
def test_write(self):
self.ta.unstack(self.tensor)
# create a tensor with shape of [1, self.dim]
var = self.scope.new_var("hell")
tensor = var.get_tensor()
tensor.set_dims([1, self.dim])
tensor.alloc_float(self.place)
tensor_array = np.array(tensor)
for i in range(self.dim):
tensor_array[0, i] = i
tensor.set(tensor_array, self.place)
self.ta.write(2, tensor)
ta_tensor = self.ta.read(2)
ta_tensor_array = np.array(ta_tensor)
self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
self.assertTrue((tensor_array == ta_tensor_array).all())
def test_write_shared(self):
self.ta.unstack(self.tensor)
# create a tensor with shape of [1, self.dim]
var = self.scope.new_var("hell")
tensor = var.get_tensor()
tensor.set_dims([1, self.dim])
tensor.alloc_float(self.place)
tensor_array = np.array(tensor)
for i in range(self.dim):
tensor_array[0, i] = i
tensor.set(tensor_array, self.place)
self.ta.write_shared(2, tensor)
ta_tensor = self.ta.read(2)
ta_tensor_array = np.array(ta_tensor)
self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
self.assertTrue((tensor_array == ta_tensor_array).all())
def test_unpack(self):
meta = self.ta.unpack(self.tensor, 0, True)
self.assertEqual(self.ta.size(), 5)
self.assertEqual(meta, self.py_seq_meta)
def test_pack(self):
meta = self.ta.unpack(self.tensor, 0, True)
print "meta", meta
tensor = self.ta.pack(0, meta, self.tensor.lod())
print np.array(self.tensor)
print np.array(tensor)
self.assertTrue((np.array(self.tensor) == np.array(tensor)).all())
        self.assertEqual(tensor.lod(), self.tensor.lod())
if __name__ == '__main__':
unittest.main()