Commit b4d710ce authored by Liu Yiqun

Merge branch 'develop' into build_ios

......@@ -4,7 +4,6 @@ cache:
- $HOME/.ccache
- $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
- $TRAVIS_BUILD_DIR/build_ios/third_party
sudo: required
dist: trusty
......@@ -13,7 +12,6 @@ os:
env:
- JOB=build_doc
- JOB=check_style
- JOB=build_android
matrix:
include:
- env: JOB=build_ios
......@@ -30,7 +28,6 @@ addons:
- python
- python-pip
- python2.7-dev
- python-numpy
- python-wheel
- libboost-dev
- curl
......
......@@ -65,8 +65,8 @@ if(NOT CMAKE_BUILD_TYPE)
endif()
if(ANDROID OR IOS)
if(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
if(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
endif()
set(WITH_GPU OFF CACHE STRING
......
......@@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \
apt-get install -y \
......@@ -15,12 +21,11 @@ RUN apt-get update && \
apt-get clean -y
# Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tgz && \
RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -xz -C /usr/local && \
mkdir /root/gopath && \
mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \
rm go.tgz
mkdir /root/gopath/src
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be on the same line as the GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
......@@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
rm -rf /opt/android-ndk-tmp && \
rm -rf ${ANDROID_NDK_HOME}
......
......@@ -20,6 +20,7 @@
# The supported variables are listed below:
#
# ANDROID_STANDALONE_TOOLCHAIN
# ANDROID_TOOLCHAIN
# ANDROID_ABI
# ANDROID_NATIVE_API_LEVEL
# ANDROID_ARM_MODE
......@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
ENDIF()
ENDIF()
IF(NOT DEFINED ANDROID_TOOLCHAIN)
SET(ANDROID_TOOLCHAIN clang)
ENDIF()
IF(NOT DEFINED ANDROID_ABI)
SET(ANDROID_ABI "armeabi-v7a")
ENDIF()
......@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
"${CMAKE_VERSION}), when cross-compiling for Android.")
IF(ANDROID_STANDALONE_TOOLCHAIN)
# Use standalone toolchain
SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
IF(NOT CMAKE_SYSTEM_VERSION)
......@@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF()
# Toolchain
SET(ANDROID_TOOLCHAIN "gcc")
SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
ELSE(ANDROID_NDK)
# TODO: use android ndk
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
IF(ANDROID_ABI STREQUAL "armeabi")
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
IF(ANDROID_TOOLCHAIN STREQUAL clang)
SET(ANDROID_C_COMPILER_NAME clang)
SET(ANDROID_CXX_COMPILER_NAME clang++)
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
SET(ANDROID_C_COMPILER_NAME gcc)
SET(ANDROID_CXX_COMPILER_NAME g++)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
ENDIF()
# C compiler
IF(NOT CMAKE_C_COMPILER)
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF()
......@@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
# CXX compiler
IF(NOT CMAKE_CXX_COMPILER)
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
ELSE()
GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF()
......@@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
# Toolchain and ABI specific flags.
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
IF(ANDROID_ABI STREQUAL "armeabi")
......@@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
-march=armv5te
-mtune=xscale
-msoft-float)
ENDIF()
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv7-a
-mfloat-abi=softfp)
......@@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
ENDIF()
LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
......@@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# Disable integrated-as for better compatibility.
LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
ENDIF()
ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# CMake automatically forwards all compiler flags to the linker,
# and clang doesn't like having -Wa flags being used for linking.
# To prevent CMake from doing this would require meddling with
# the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
......
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
......
......@@ -434,9 +434,9 @@ lambda_cost
.. autoclass:: paddle.v2.layer.lambda_cost
:noindex:
mse_cost
square_error_cost
--------
.. autoclass:: paddle.v2.layer.mse_cost
.. autoclass:: paddle.v2.layer.square_error_cost
:noindex:
rank_cost
......
......@@ -86,12 +86,13 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
```
| C++ functions/functors | mul | add | | |
|------------------------|--------------|--------------|-------------|----------|
| C++ operator class | mulOp | addOp | FCOp | |
| Python binding | operator.mul | operator.add | operator.fc | |
| Python function | | | | layer.fc |
```
This is how we differentiate layers and operators in PaddlePaddle:
......
# Design Doc: Computations as a Graph
A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
This document explains the construction of a graph in three steps:
- construct the forward part
- construct the backward part
- construct the optimization part
## The Construction of a Graph
Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
```python
x = layer.data("images")
l = layer.data("label")
y = layer.fc(x)
cost = layer.mse(y, l)
optimize(cost)
train(cost, reader=mnist.train())
```
### Forward Part
The first four lines of the above program build the forward part of the graph.
![](images/graph_construction_example_forward_only.png)
In particular, the first line `x = layer.data("images")` creates a variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and the output variable y, but also two parameters, W and b, and their initialization operators.
Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that it runs at most once. This way, a parameter isn't initialized repeatedly, say, in every minibatch.
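A minimal sketch of such a run-once guard (the class and member names here are illustrative only, not PaddlePaddle's actual initialization operators):
```cpp
// Illustrative sketch: an initialization operator whose Run executes at most once.
class InitOp {
 public:
  void Run() {
    if (run_count_ > 0) return;  // already ran once; skip re-initialization
    ++run_count_;
    // ... fill the parameter tensor with its initial values ...
  }

 private:
  int run_count_ = 0;  // the counter mentioned above
};
```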
In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
### Backward Part
The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
![](images/graph_construction_example_forward_backward.png)
According to the chain rule of gradient computation, `ConstructBackwardGraph` would (see the sketch after this list)
1. create a gradient operator G for each operator F,
1. take all inputs, outputs, and output gradients of F as inputs of G,
1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and
1. make all these gradients outputs of G.
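A schematic sketch of these four rules follows; the `SimpleOpDesc` type and `GradVarName` helper are hypothetical stand-ins for the real `OpDesc` machinery:
```cpp
#include <algorithm>
#include <string>
#include <vector>

// Illustrative-only op description; not PaddlePaddle's real OpDesc.
struct SimpleOpDesc {
  std::string type;
  std::vector<std::string> inputs;
  std::vector<std::string> outputs;
};

std::string GradVarName(const std::string& name) { return name + "@GRAD"; }

SimpleOpDesc MakeGradOp(const SimpleOpDesc& F,
                        const std::vector<std::string>& no_grad_vars) {
  SimpleOpDesc G;
  G.type = F.type + "_grad";  // rule 1: one gradient op G per forward op F
  // rule 2: F's inputs, outputs, and outputs' gradients are inputs of G.
  G.inputs = F.inputs;
  G.inputs.insert(G.inputs.end(), F.outputs.begin(), F.outputs.end());
  for (const auto& out : F.outputs) G.inputs.push_back(GradVarName(out));
  // rules 3 & 4: create gradients for F's inputs (skipping variables without
  // gradients, like x and l) and make them outputs of G.
  for (const auto& in : F.inputs) {
    if (std::find(no_grad_vars.begin(), no_grad_vars.end(), in) ==
        no_grad_vars.end()) {
      G.outputs.push_back(GradVarName(in));
    }
  }
  return G;
}
```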
### Optimization Part
For each parameter, like W and b created by `layer.fc` and marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
![](images/graph_construction_example_all.png)
## Block and Graph
The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
A Block keeps operators in an array `BlockDesc::ops`
```protobuf
message BlockDesc {
  repeated OpDesc ops = 1;
  repeated VarDesc vars = 2;
}
```
in the order in which they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops` we have some forward operators, followed by some gradient operators, and then some optimization operators.
cat ./graph_construction_example.dot | \
sed 's/color=red/color=red, style=invis/g' | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_only.png
cat ./graph_construction_example.dot | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_backward.png
cat ./graph_construction_example.dot | \
dot -Tpng > graph_construction_example_all.png
digraph ImageClassificationGraph {
///////// The forward part /////////
FeedX [label="Feed", color=blue, shape=box];
FeedY [label="Feed", color=blue, shape=box];
InitW [label="Init", color=blue, shape=diamond];
Initb [label="Init", color=blue, shape=diamond];
FC [label="FC", color=blue, shape=box];
MSE [label="MSE", color=blue, shape=box];
x [label="x", color=blue, shape=oval];
l [label="l", color=blue, shape=oval];
y [label="y", color=blue, shape=oval];
W [label="W", color=blue, shape=doublecircle];
b [label="b", color=blue, shape=doublecircle];
cost [label="cost", color=blue, shape=oval];
FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
FeedY -> l [color=blue];
InitW -> W [color=blue];
Initb -> b [color=blue];
W -> FC [color=blue];
b -> FC [color=blue];
l -> MSE [color=blue];
////////// The backward part /////////
MSE_Grad [label="MSE_grad", color=red, shape=box];
FC_Grad [label="FC_grad", color=red, shape=box];
d_cost [label="d cost", color=red, shape=oval];
d_y [label="d y", color=red, shape=oval];
d_b [label="d b", color=red, shape=oval];
d_W [label="d W", color=red, shape=oval];
cost -> MSE_Grad [color=red];
d_cost -> MSE_Grad [color=red];
x -> MSE_Grad [color=red];
l -> MSE_Grad [color=red];
y -> MSE_Grad -> d_y [color=red];
x -> FC_Grad [color=red];
y -> FC_Grad [color=red];
d_y -> FC_Grad [color=red];
W -> FC_Grad -> d_W [color=red];
b -> FC_Grad -> d_b [color=red];
////////// The optimization part //////////
OPT_W [label="SGD", color=green, shape=box];
OPT_b [label="SGD", color=green, shape=box];
W -> OPT_W [color=green];
b -> OPT_b [color=green];
d_W -> OPT_W -> W [color=green];
d_b -> OPT_b -> b [color=green];
////////// Groupings //////////
subgraph clusterMSE {
style=invis;
MSE;
MSE_Grad;
}
subgraph clusterFC {
style=invis;
FC;
FC_Grad;
}
}
# Design Doc: Operation Graph Based Parameter Server
## Abstract
We propose an approach to implement the parameter server. In this
approach, there is no fundamental difference between the trainer and
the parameter server: they both run subgraphs, but subgraphs with
different purposes.
## Background
The previous implementations of the parameter server do not run a
subgraph. Parameter initialization, optimizer computation, network
communication, and checkpointing are implemented twice, on both the
trainer and the parameter server.
It would be great if we could write code once and use it on both the
trainer and the parameter server: this reduces code duplication and
improves extensibility. Given that, after the current refactoring, we
represent everything as a computation graph on the
trainer, representing everything as a computation graph on the parameter
server becomes a natural extension.
## Design
### Graph Converter
The *graph converter* converts the user-defined operation (OP) graph
into subgraphs to be scheduled on different nodes with the following
steps:
1. OP placement: the OPs will be placed on different nodes according
to a heuristic that minimizes the estimated total computation
time. Currently we will use a simple heuristic that puts parameter
variables on parameter server workers and everything else on trainer
workers.
1. Add communication OPs to enable the communication between nodes.
We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
Below is an example of converting the user-defined graph into the
subgraphs for the trainer and the parameter server:
<img src="src/local-graph.png" width="300"/>
After converting:
<img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
1. Operators are added to the subgraphs.
- *Send* sends data to the connected *Recv* operator. The
scheduler on the receiving node will only schedule the *Recv* operator
to run when the *Send* operator has run (the *Send* OP will mark
the *Recv* OP runnable automatically).
- *Enqueue* enqueues the input variable; it can block until space
becomes available in the queue.
- *Dequeue* outputs a configurable number of tensors from the
queue. It will block until the queue has the required number of
tensors (see the sketch after this list).
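To make the blocking semantics of *Enqueue*/*Dequeue* concrete, here is a minimal sketch of a blocking queue in C++. This is only an illustration of the behavior described above, not PaddlePaddle's actual implementation; `min_count` mirrors the attribute mentioned in the Discussion section.
```cpp
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>
#include <vector>

// Illustrative blocking queue matching the described Enqueue/Dequeue semantics.
template <typename T>
class BlockingQueue {
 public:
  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {}

  // Enqueue blocks until space becomes available in the queue.
  void Enqueue(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return queue_.size() < capacity_; });
    queue_.push_back(std::move(value));
    not_empty_.notify_all();
  }

  // Dequeue blocks until the queue holds at least min_count tensors,
  // then outputs that many.
  std::vector<T> Dequeue(size_t min_count) {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock,
                    [this, min_count] { return queue_.size() >= min_count; });
    std::vector<T> out;
    for (size_t i = 0; i < min_count; ++i) {
      out.push_back(std::move(queue_.front()));
      queue_.pop_front();
    }
    not_full_.notify_all();
    return out;
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
  std::deque<T> queue_;
};
```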
### Benefits
- Model parallelism becomes easier to implement: it's an extension to
the trainer - parameter server approach. We already have the
communication OPs, but need to extend the graph converter's
placement functionality.
- A user-defined optimizer is easier to add - the user can now express it as
a subgraph.
- No more duplicated logic inside the trainer and the parameter
server, as mentioned in the background section.
### Challenges
- It might be hard for the graph converter to cut a general graph
(without any hint about which subgraph is the optimizer). We may need
to label which subgraph inside the OP graph is the optimizer.
- It's important to balance the parameter shards across multiple
parameter servers. If a single parameter is very big (e.g., some
word-embedding, fully connected, or softmax layers), we need to
automatically partition the single parameter onto different
parameter servers when possible (only an element-wise optimizer depends
on the parameter variable).
### Discussion
- In the "Aync SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently, what is our locking strategy?
E.g., each variable have a lock cpp method to be invoked by every
OP, or, have a lock OP.
- Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the
`min_count` attribute), does our current design support it? (similar
question for the *Add* OP)
### References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
......@@ -147,7 +147,7 @@ class CosineOp {
struct CosineOpProtoMaker : public OpProtoMaker {
CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
AddInput("input", "input of cosine op");
AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0);
AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
AddType("cos");
AddComment("This is cos op");
}
......
## Background
PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.
PaddlePaddle uses proto messages to describe the compile-time graph because
1. the computation graph should be able to be saved to a file, and
1. in distributed training, the graph will be serialized and sent to multiple workers.
The computation graph is constructed from Data Nodes and Operation Nodes. The concepts representing them are shown in the table below.
| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|
## Definition of VarDesc
A VarDesc should have a name and a value; in PaddlePaddle, the value is always a tensor. Since we use LoDTensor most of the time, we add a LoDTensorDesc to represent it.
```proto
message VarDesc {
  required string name = 1;
  optional LoDTensorDesc lod_tensor = 2;
}
```
## Definition of LoDTensorDesc
```proto
enum DataType {
  BOOL = 0;
  INT16 = 1;
  INT32 = 2;
  INT64 = 3;
  FP16 = 4;
  FP32 = 5;
  FP64 = 6;
}

message LoDTensorDesc {
  required DataType data_type = 1;
  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
  optional int32 lod_level = 3 [default=0];
}
```
## Definition of Variable in Python
In the Python API, a layer takes Variables as input and returns Variables as output. There should be a class `Variable` in Python to help create and manage Variables.
```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```
### What should class `Variable` have
1. `name`. A name of string type is used to mark the value of the Variable.
1. `initializer`. Since our Tensor does not hold a value initially, we always use some Operator to fill it at run time. So we should have an initialize method to help add the init operator.
1. `operator`. A Variable should record which operator produces it. The reason is:
 - we use pd.eval(targets=[var1, var2]) to run the related ops to get the values of var1 and var2. var.op is used to trace the dependencies of the current variable.
In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
```python
import VarDesc
import LoDTensorDesc
import framework

def AddInitialOperator(variable, initializer):
    # add an initialize Operator to the block to init this Variable
    pass

class Variable(object):
    def __init__(self, name, dims, type, initializer):
        self._block = get_default_block()
        self._name = name
        self.op = None

        tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
        _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
        self._var = framework.CreateVar(_var_desc)
        self._block.add_var(self)

        # add initial op according to initializer
        if initializer is not None:
            AddInitialOperator(self, initializer)

    def dims(self):
        return self._var.dims()

    def data_type(self):
        return self._var.data_type()

    def to_proto(self):
        pass
```
Then we can use this Variable to create a fc layer in Python.
```python
import paddle as pd

def flatten_size(X, num_flatten_dims):
    prod = 1  # product of the last num_flatten_dims dimensions
    for i in xrange(num_flatten_dims):
        prod = prod * X.dims[-i - 1]
    return prod

def layer.fc(X, output_size, num_flatten_dims):
    W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
    b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
    out = Variable(type=FP32)
    y = operator.fc(X, W, b, output=out)  # fc will put the fc op's output into out
    pd.InferShape(y)
    return out

x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)

paddle.eval(targets=[z], ...)
print(z)
```
......@@ -55,7 +55,7 @@ PaddlePaddle is a deep learning platform originated at Baidu. This brief introduction
# Linear network layer: ȳ = wx + b
ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
# Compute the cost, i.e. the distance between ȳ and the true y
cost = mse_cost(input= ȳ, label=y)
cost = square_error_cost(input= ȳ, label=y)
outputs(cost)
......@@ -69,7 +69,7 @@ PaddlePaddle is a deep learning platform originated at Baidu. This brief introduction
- **Data layer**: the data layer `data_layer` is the entry point of a neural network; it reads in data and passes it on to the following layers. Here there are two data layers, corresponding to the variables `x` and `y`.
- **Fully connected layer**: the fully connected layer `fc_layer` is a basic computational unit; here we use it to model the linear relationship between variables. Computational units are the core of a neural network; PaddlePaddle supports a large number of computational units and arbitrarily deep network connections, and can thus fit arbitrary functions to learn complex data relationships.
- **Regression cost layer**: the regression cost layer `mse_cost` is one of many cost function layers; they serve as the network's exit during training, compute the model's error, and are the objective function for optimizing the model parameters.
- **Regression cost layer**: the regression cost layer `square_error_cost` is one of many cost function layers; they serve as the network's exit during training, compute the model's error, and are the objective function for optimizing the model parameters.
After defining the network structure and saving it as `trainer_config.py`, run the following training command:
......
......@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = mse_cost(input=y_predict, label=y)
cost = square_error_cost(input=y_predict, label=y)
outputs(cost)
Some of the most fundamental usages of PaddlePaddle are demonstrated:
......
......@@ -8,7 +8,7 @@ paddle.init(use_gpu=False)
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
......
......@@ -81,9 +81,9 @@ PaddlePaddle supports different types of input data, mainly including four types, and
.. code-block:: bash
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
cost = paddle.layer.mse_cost(input=y_predict, label=y)
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
Here, x and y are the input layers described earlier; y_predict takes x as input and is followed by a fully connected layer; cost takes y_predict and y as input and is followed by a squared-error layer.
The last layer, cost, records the entire topology of the neural network; by combining different layers we can complete the construction of the network.
......@@ -147,4 +147,4 @@ PaddlePaddle supports different types of input data, mainly including four types, and
.. literalinclude:: src/train.py
:linenos:
For practical applications of linear regression, see `Chapter 1 <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
\ No newline at end of file
For practical applications of linear regression, see `Chapter 1 <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
## How to use Eigen in Paddle
A neural network is essentially a computation graph: the data needed for computation is stored in `Tensor`s, and the computation itself is described by `Operator`s. At execution time, an `Operator` calls the `Compute` interface of its corresponding `OpKernel` to operate on the `Tensor`s.
### The Eigen Tensor module
The Eigen Tensor module provides strong support for element-wise computation: you write the code once, and it runs on both CPU and GPU. However, Eigen Tensor is a module still under development, so its tests may be incomplete and its documentation sparse.
For a detailed introduction to the Eigen Tensor module, please refer to [document 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [document 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
### paddle::framework::Tensor
Paddle's Tensor is defined in the framework directory; its main interface is as follows:
```cpp
class Tensor {
 public:
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();

  /**
   * @brief Return a pointer to mutable memory block.
   * @note If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(platform::Place place);

  /**
   * @brief Return a pointer to mutable memory block.
   *
   * @param[in] dims The dimensions of the memory block.
   * @param[in] place The place of the memory block.
   *
   * @note If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(DDim dims, platform::Place place);

  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);

  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;

 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<Placeholder> holder_;

  /*! points to dimensions of memory block. */
  DDim dim_;
};
```
The role of `Placeholder` is to defer memory allocation: we can first define a Tensor, then set its size with the Resize interface, and finally call the mutable_data interface to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
paddle::platform::CPUPlace place;
// set size first
t.Resize({2, 3});
// allocate memory on CPU later
t.mutable_data<float>(place);
```
### A usage example of paddle::framework::Tensor
Below we take AddOp as an example to show how Tensor is used:
- InferShape
When running the neural network computation graph, we first call each `Operator`'s `InferShape` interface to set the sizes of the output Tensors based on the sizes of the input Tensors; the `Resize` interface is called in this step.
```cpp
void InferShape(const framework::InferShapeContext &ctx) const override {
  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
                    ctx.Input<Tensor>("Y")->dims(),
                    "Two input of Add Op's dimension must be same.");
  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
}
```
- Run
An `Operator`'s `Run` interface eventually calls the `Compute` interface of the corresponding `OpKernel`; this is where memory is actually allocated, via the `mutable_data` interface.
```cpp
void Compute(const framework::ExecutionContext& context) const override {
  auto* input0 = context.Input<Tensor>("X");
  auto* input1 = context.Input<Tensor>("Y");
  auto* output = context.Output<Tensor>("Out");

  output->mutable_data<T>(context.GetPlace());

  auto x = EigenVector<T>::Flatten(*input0);
  auto y = EigenVector<T>::Flatten(*input1);
  auto z = EigenVector<T>::Flatten(*output);

  auto place = context.GetEigenDevice<Place>();

  z.device(place) = x + y;
}
```
### Converting paddle::framework::Tensor to EigenTensor
As shown in the previous subsection, in a concrete computation we first need to convert the input and output Tensors into a format that Eigen supports. In [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) we provide some global functions that convert a paddle::framework::Tensor into an EigenTensor/EigenMatrix/EigenVector/EigenScalar.
Taking EigenTensor as an example:
```cpp
Tensor t;
float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
for (int i = 0; i < 1 * 2 * 3; i++) {
  p[i] = static_cast<float>(i);
}
EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
```
From is an interface provided by the EigenTensor template; it converts a paddle::framework::Tensor into an EigenTensor. Since the Tensor's rank is a template parameter, it must be specified explicitly in the conversion.
In Eigen, Tensors of different ranks are different types, and a Vector is a Tensor of rank 1. Note in particular that the EigenVector<T>::From method converts a one-dimensional paddle Tensor into a one-dimensional Eigen Tensor (represented here by an EigenVector), while the EigenVector<T>::Flatten method reshapes a paddle Tensor of any rank and flattens it into a one-dimensional Eigen Tensor; the resulting type is still EigenVector.
For more conversion methods, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in eigen_test.cc.
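A small sketch contrasting the two methods, written in the same style as the examples above (the shapes are chosen for illustration):
```cpp
// t3 is a rank-3 Tensor holding 1 * 2 * 3 = 6 elements.
Tensor t3;
t3.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());

// Flatten reshapes the rank-3 Tensor into a one-dimensional Eigen tensor
// of 6 elements; t3's own shape stays {1, 2, 3}.
auto flat = EigenVector<float>::Flatten(t3);

// From only accepts a Tensor that is already one-dimensional.
Tensor t1;
t1.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
auto vec = EigenVector<float>::From(t1);
```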
### Implementing the computation
To perform the computation, the EigenTensor on the left-hand side of the assignment must call the device interface. Note that the operations between EigenTensors here only change the data inside the original Tensors; they do not change the original Tensors' shape information.
```cpp
auto x = EigenVector<T>::Flatten(*input0);
auto y = EigenVector<T>::Flatten(*input1);
auto z = EigenVector<T>::Flatten(*output);
auto place = context.GetEigenDevice<Place>();
z.device(place) = x + y;
```
In this code, input0/input1/output can be Tensors of arbitrary rank. We call EigenVector's Flatten interface to flatten a Tensor of arbitrary rank into a one-dimensional EigenVector; after the computation, the original shape information of input0/input1/output is unchanged. To change the original Tensor's shape, call the Resize interface.
Since the Eigen Tensor module has little documentation, you can refer to the computation code of the relevant `OpKernel`s under TensorFlow's [kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels) module.
......@@ -5,15 +5,13 @@
PaddlePaddle's documentation consists of an English part ``doc`` and a Chinese part ``doc_cn``. Both are built with `sphinx`_ driven by `cmake`_, and the generated documentation is stored in the ``doc`` and ``doc_cn`` subdirectories of the build directory.
How to build PaddlePaddle's documentation
=========================================
How to build the documentation
==============================
There are two ways to build PaddlePaddle's documentation: directly, or with Docker. We provide a build script, build_docs.sh, for this purpose.
The environment required to build PaddlePaddle's documentation is relatively complex, so we recommend building the documentation with Docker.
There are two ways to build PaddlePaddle's documentation.
Building PaddlePaddle's documentation with Docker
-------------------------------------------------
Building with Docker
--------------------
To build PaddlePaddle's documentation with Docker, first install the Docker toolkit on your system; see `Docker's official site <https://docs.docker.com/>`_ for installation instructions. Once Docker is installed, you can build the documentation with the script in the source tree:
......@@ -21,58 +19,46 @@ The environment required to build PaddlePaddle's documentation is relatively complex, so we recommend
cd TO_YOUR_PADDLE_CLONE_PATH
cd paddle/scripts/tools/build_docs
bash build_docs.sh with_docker
After the build finishes, two subdirectories are generated in the current directory:
* doc: the English documentation
* doc_cn: the Chinese documentation
sh build_docs.sh
After the build finishes, two subdirectories are generated in the current directory: doc (English documentation) and doc_cn (Chinese documentation).
Open index.html in the corresponding directory in a browser to view the local documentation.
Building PaddlePaddle's documentation directly
----------------------------------------------
Because generating PaddlePaddle's v2 API documentation depends on the py_paddle Python package, first make sure that py_paddle is installed:
.. code-block:: bash
python -c "import py_paddle"
If this reports an error, you need to build and install PaddlePaddle locally; see the `build from source documentation <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_.
Note: when building and installing PaddlePaddle for the first time, turn the WITH_DOC option off. After a successful build and install, confirm once more that py_paddle is installed, then proceed to the next step.
Building directly
-----------------
If the check succeeds, run the following commands to build the documentation:
.. code-block:: bash
cd TO_YOUR_PADDLE_CLONE_PATH
cd paddle/scripts/tools/build_docs
bash build_docs.sh local
After the build finishes, two subdirectories are generated in the current directory:
* doc: the English documentation
* doc_cn: the Chinese documentation
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
After the build finishes, two subdirectories are generated in the current directory: doc (English documentation) and doc_cn (Chinese documentation).
Open index.html in the corresponding directory in a browser to view the local documentation.
How to write PaddlePaddle's documentation
=========================================
How to write documentation
==========================
PaddlePaddle's documentation is generated automatically with `sphinx`_; you can follow the sphinx tutorial when writing it.
How to update the www.paddlepaddle.org documentation
====================================================
How to update the documentation theme
=====================================
PaddlePaddle's documentation theme lives in the `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` folder and contains all files related to the front-end web design.
Comments that developers add to the PaddlePaddle code are submitted to GitHub as PRs; for how to submit them, see the `contribution documentation <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_.
How to update doc.paddlepaddle.org
==================================
Updated documentation is submitted to GitHub as a PR; for how to submit it, see the `contribution documentation <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_.
Currently the documentation for PaddlePaddle's develop branch is updated automatically; you can browse the latest `Chinese documentation <http://doc.paddlepaddle.org/develop/doc_cn/>`_ and
`English documentation <http://doc.paddlepaddle.org/develop/doc/>`_.
.. _cmake: https://cmake.org/
.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
......@@ -213,7 +213,7 @@ I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions
I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done.
[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal
I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
......
......@@ -18,14 +18,6 @@ limitations under the License. */
#ifndef __NVCC__
#include "paddle/math/MathFunctions.h"
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
#endif
template<class OpResetOutput>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
real *gateValue,
......@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
}
}
template<class OpResetOutput, class OpFinalOutput>
void hl_cpu_gru_forward(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
}
template<class OpStateGrad>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
real *gateValue,
......@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
}
}
template<class OpStateGrad, class OpResetGrad>
void hl_cpu_gru_backward(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
backward_state_grad(opStateGrad, value, grad,
frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(opResetGrad, value, grad,
frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
}
#endif
#endif // HL_CPU_GRU_CUH_
......@@ -9,6 +9,7 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)
......
......@@ -43,6 +43,10 @@ template <>
AttrType AttrTypeID<std::vector<std::string>>() {
return STRINGS;
}
template <>
AttrType AttrTypeID<std::vector<std::pair<int, int>>>() {
return INT_PAIRS;
}
Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
switch (attr_desc.type()) {
......@@ -76,6 +80,14 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
}
return val;
}
case paddle::framework::AttrType::INT_PAIRS: {
std::vector<std::pair<int, int>> val(attr_desc.int_pairs_size());
for (int i = 0; i < attr_desc.int_pairs_size(); ++i) {
val[i].first = attr_desc.int_pairs(i).first();
val[i].second = attr_desc.int_pairs(i).second();
}
return val;
}
}
PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
return boost::blank();
......
......@@ -28,7 +28,8 @@ namespace paddle {
namespace framework {
typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>>
std::vector<float>, std::vector<std::string>,
std::vector<std::pair<int, int>>>
Attribute;
typedef std::unordered_map<std::string, Attribute> AttributeMap;
......@@ -40,11 +41,23 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
// check whether a value(attribute) fit a certain limit
template <typename T>
class LargerThanChecker {
class GreaterThanChecker {
public:
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
}
private:
T lower_bound_;
};
template <typename T>
class EqualGreaterThanChecker {
public:
explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const {
PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
}
private:
......@@ -109,8 +122,13 @@ class TypedAttrChecker {
return *this;
}
TypedAttrChecker& LargerThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
TypedAttrChecker& GreaterThan(const T& lower_bound) {
value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
return *this;
}
TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
return *this;
}
......
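To make the semantics of the two renamed checkers concrete, here is a standalone sketch (using `assert` in place of `PADDLE_ENFORCE`; the attribute value is hypothetical):
```cpp
#include <cassert>

// Simplified stand-ins for the framework's checkers shown above.
template <typename T>
struct GreaterThanCheck {
  T lower_bound;
  void operator()(T& value) const { assert(value > lower_bound); }  // strict >
};

template <typename T>
struct EqualGreaterThanCheck {
  T lower_bound;
  void operator()(T& value) const { assert(value >= lower_bound); }  // >=
};

int main() {
  float scale = 1.0f;
  GreaterThanCheck<float>{0.0f}(scale);       // passes: 1.0 > 0.0
  EqualGreaterThanCheck<float>{1.0f}(scale);  // passes: 1.0 >= 1.0
  return 0;
}
```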
......@@ -182,7 +182,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
});
// process recurrent gradient op as a special operator.
if (forwardOp.Type() == "recurrent_op") {
if (forwardOp.Type() == "recurrent") {
// NOTE clean up cycle call somewhere (RNN's stepnet contains itself), or
// this will result in infinite loop.
const auto& rnnop =
......
......@@ -2,81 +2,99 @@
## Motivation
In a neural network, the backpropagation algorithm follows the chain rule, so we need to compose the fundamental gradient operators/expressions together using the chain rule. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass is generated with respect to the forward pass.
## Backward Operator Registry
In a neural network, most models are currently solved with the backpropagation algorithm (known as BP). Technically, it calculates the gradient of the loss function and then distributes it back through the network. Following the chain rule, we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph; the operator/expression's backward pass is generated with respect to the forward pass.
A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
## Implementation
In this design doc, we export only one API for generating the backward pass.
```c++
std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
```
The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Operator Building**.
### Backward Operator Registry
A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
| | forward operator | backward operator
| ---------------------- | ---------------- |------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients |
In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded in a global hash map (`OpInfoMap`). To follow the philosophy of a minimal core and keep operators pluggable, the registry mechanism is introduced.
In most cases, there is a one-to-one correspondence between the forward and backward operators. These correspondences are recorded in a global hash map (`OpInfoMap`). To follow the philosophy of a minimal core and keep operators pluggable, the registry mechanism is introduced.
For example, if we have a `mul_op`, we can register its information and corresponding backward operator by the following macro:
```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, MulOpGrad);
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
```
`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
## Backward Operator Creating
### Backward Operator Creating
Given a certain forward operator, we can get its corresponding backward operator by calling:
```cpp
OperatorBase* bwd_op = BuildGradOp(fwd_op);  // fwd_op: const OperatorBase*
```
The function `BuildGradOp` will sequentially execute the following processes (sketched in code after this list):
1. Get the `type_` of the given forward operator, and then look up the corresponding backward operator's type in the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` into the map `inputs`, except for those that are not necessary for gradient computation.
3. Add the forward inputs' gradient variables into the map `outputs`, and the forward outputs' gradient variables into the map `inputs`.
4. Build the backward operator with `inputs`, `outputs`, and the forward operator's attributes.
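A schematic sketch of these four steps, using simplified map types rather than the real framework classes (`SimpleOp` and `GradName` are illustrative helpers):
```cpp
#include <map>
#include <string>
#include <vector>

using VarMap = std::map<std::string, std::vector<std::string>>;

std::string GradName(const std::string& n) { return n + "@GRAD"; }

// Illustrative-only op description for the sketch.
struct SimpleOp {
  std::string type;
  VarMap inputs, outputs;
};

SimpleOp BuildGradOpSketch(const SimpleOp& fwd, const std::string& grad_type) {
  SimpleOp grad;
  grad.type = grad_type;     // step 1: backward type looked up in OpInfoMap
  grad.inputs = fwd.inputs;  // step 2: copy forward inputs and outputs
  for (const auto& kv : fwd.outputs) grad.inputs[kv.first] = kv.second;
  // step 3: forward outputs' gradients become inputs, and forward inputs'
  // gradients become outputs.
  for (const auto& kv : fwd.outputs)
    for (const auto& name : kv.second)
      grad.inputs[GradName(kv.first)].push_back(GradName(name));
  for (const auto& kv : fwd.inputs)
    for (const auto& name : kv.second)
      grad.outputs[GradName(kv.first)].push_back(GradName(name));
  return grad;  // step 4: the real code also copies the forward op's attributes
}
```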
## Backward Network Building
### Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and putting them together.
In our design, the network itself is also a kind of operator, so the operators contained by a big network may themselves be small networks.
Given a forward network, it generates the backward network. We only care about the gradients: `OutputGradients` and `InputGradients`.
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them one by one. There are some corner cases that need to be processed specially.
1. Op
When the input forward network is an Op, return its gradient Operator immediately.
When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in the no-gradient set, then return a special `NOP`.
2. NetOp
When the input forward network is a NetOp, it needs to call the sub NetOp/Operators' backward functions recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp.
In our design, the network itself is also a kind of operator (**NetOp**), so the operators contained by a big network may themselves be small networks. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators' backward functions recursively. During the process, we need to collect the `OutputGradients` names according to the forward NetOp.
3. RnnOp
RnnOp is a nested stepnet operator. The backward module needs to recursively call `Backward` for every stepnet.
4. Sharing Variables
**sharing variables**. As illustrated in the pictures, two operators share the same variable name W@GRAD, which will overwrite their shared input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
**shared variable**. As illustrated in the pictures, two operators' output gradients will overwrite their shared input variable.
pic 1. Sharing variables in operators.
<p align="center">
<img src="./images/duplicate_op.png" width="70%" ><br/>
</p>
1. shared variable in two operators.
Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient name recursively and add a generic add operator to replace the overwritten links.
</p>
<p align="center">
<img src="images/duplicate_op2.png" width="40%" ><br/>
Sharing a variable between operators, or using the same input variable in multiple operators, leads to a duplicate gradient variable. As the demo above shows, we need to rename the gradient name recursively and add a generic add operator to replace the overwritten links.
pic 2. Replace the shared variable's gradient with an `Add` operator.
<p align="center">
<img src="images/duplicate_op2.png" width="90%" ><br/>
</p>
2. replace shared variable gradient with `Add` Operator
Because our framework finds variables according to their names, we need to rename the output links. We add a numeric suffix to represent each link's position, counted clockwise.
</p>
5. Part of the Gradient is Zero.
In the whole graph, there are cases where an operator's gradient is not needed, but its input's gradient is a dependency link of another operator; we need to fill a gradient matrix of the same shape in that position. In our implementation, we insert a special `fillZeroLike` operator.
Then collect the subgraph's `OutputGradients`/`InputGradients` as the NetOp's and return it.
Following the rules above, collect the subgraph's `OutputGradients`/`InputGradients` as the NetOp's and return it (a high-level sketch follows).
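A high-level sketch of the recursive traversal described above, with a hypothetical minimal op type (the real implementation is `BackwardRecursive` in `backward.cc`):
```cpp
#include <memory>
#include <vector>

// Hypothetical minimal op type for the sketch.
struct Op {
  bool is_net = false;                        // true for a NetOp
  std::vector<std::unique_ptr<Op>> children;  // sub-ops when is_net is true
};

std::unique_ptr<Op> MakeGradOp(const Op& fwd);  // case 1: plain op -> grad op

// Cases 1-2: build the backward net by visiting sub-ops in inverted order.
std::unique_ptr<Op> BackwardSketch(const Op& fwd) {
  if (!fwd.is_net) return MakeGradOp(fwd);
  auto net = std::make_unique<Op>();
  net->is_net = true;
  for (auto it = fwd.children.rbegin(); it != fwd.children.rend(); ++it)
    net->children.push_back(BackwardSketch(**it));
  return net;
}
```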
......@@ -148,14 +148,16 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, f::NOP);
REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
REGISTER_OP(add, f::NOP, f::AddOpMaker, f::NOP);
REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, f::NOP);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
f::NOP);
TEST(Backward, simple_op_grad) {
auto fwd = f::OpRegistry::CreateOp(
......
......@@ -21,16 +21,16 @@ namespace framework {
/// @cond HIDDEN
template <int i>
Dim<i> make_dim(const int* d) {
Dim<i> make_dim(const int64_t* d) {
return Dim<i>(*d, make_dim<i - 1>(d + 1));
}
template <>
Dim<1> make_dim<1>(const int* d) {
Dim<1> make_dim<1>(const int64_t* d) {
return Dim<1>(*d);
}
void make_ddim(DDim& ddim, const int* dims, int n) {
void make_ddim(DDim& ddim, const int64_t* dims, int n) {
switch (n) {
case 1:
ddim = make_dim<1>(dims);
......@@ -67,13 +67,13 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
/// @endcond
DDim make_ddim(std::initializer_list<int> dims) {
DDim make_ddim(std::initializer_list<int64_t> dims) {
DDim result(make_dim(0));
make_ddim(result, dims.begin(), dims.size());
return result;
}
DDim make_ddim(const std::vector<int>& dims) {
DDim make_ddim(const std::vector<int64_t>& dims) {
DDim result(make_dim(0));
make_ddim(result, &dims[0], dims.size());
return result;
......@@ -81,12 +81,12 @@ DDim make_ddim(const std::vector<int>& dims) {
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor<int&> {
class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
public:
explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
template <int D>
int& operator()(Dim<D>& dim) const {
int64_t& operator()(Dim<D>& dim) const {
return dim[idx_];
}
......@@ -94,12 +94,12 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
int idx_;
};
class DynamicConstIndexer : public boost::static_visitor<int> {
class DynamicConstIndexer : public boost::static_visitor<int64_t> {
public:
explicit DynamicConstIndexer(int idx) : idx_(idx) {}
template <int D>
int operator()(const Dim<D>& dim) const {
int64_t operator()(const Dim<D>& dim) const {
return dim[idx_];
}
......@@ -109,22 +109,22 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
/// @endcond
int& DDim::operator[](int idx) {
int64_t& DDim::operator[](int idx) {
return boost::apply_visitor(DynamicMutableIndexer(idx), var);
}
int DDim::operator[](int idx) const {
int64_t DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
}
ssize_t DDim::size() const { return arity(*this); }
int64_t DDim::size() const { return arity(*this); }
bool DDim::operator==(DDim d) const {
if (var.which() != d.getVar().which()) {
return false;
} else {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
for (unsigned int i = 0; i < v1.size(); i++) {
if (v1[i] != v2[i]) {
......@@ -139,10 +139,10 @@ bool DDim::operator==(DDim d) const {
bool DDim::operator!=(DDim d) const { return !(*this == d); }
DDim DDim::operator+(DDim d) const {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int> v3;
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
......@@ -154,10 +154,10 @@ DDim DDim::operator+(DDim d) const {
}
DDim DDim::operator*(DDim d) const {
std::vector<int> v1 = vectorize(*this);
std::vector<int> v2 = vectorize(d);
std::vector<int64_t> v1 = vectorize(*this);
std::vector<int64_t> v2 = vectorize(d);
std::vector<int> v3;
std::vector<int64_t> v3;
assert(v1.size() == v2.size());
......@@ -168,15 +168,15 @@ DDim DDim::operator*(DDim d) const {
return make_ddim(v3);
}
int get(const DDim& ddim, int idx) { return ddim[idx]; }
int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
/// @cond HIDDEN
struct VectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
std::vector<int64_t>& vector;
explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
template <typename T>
void operator()(const T& t) {
......@@ -188,31 +188,31 @@ struct VectorizeVisitor : public boost::static_visitor<> {
};
/// @endcond
std::vector<int> vectorize(const DDim& ddim) {
std::vector<int> result;
std::vector<int64_t> vectorize(const DDim& ddim) {
std::vector<int64_t> result;
VectorizeVisitor visitor(result);
boost::apply_visitor(visitor, ddim);
return result;
}
struct ProductVisitor : public boost::static_visitor<ssize_t> {
struct ProductVisitor : public boost::static_visitor<int64_t> {
template <int D>
ssize_t operator()(const Dim<D>& dim) {
int64_t operator()(const Dim<D>& dim) {
return product(dim);
}
};
ssize_t product(const DDim& ddim) {
int64_t product(const DDim& ddim) {
ProductVisitor visitor;
return boost::apply_visitor(visitor, ddim);
}
struct SliceVectorizeVisitor : public boost::static_visitor<> {
std::vector<int>& vector;
std::vector<int64_t>& vector;
int begin;
int end;
SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
: vector(v), begin(b), end(e) {
PADDLE_ENFORCE(begin < end,
"Begin index must be less than end index in ddim slice.");
......@@ -240,7 +240,7 @@ struct SliceVectorizeVisitor : public boost::static_visitor<> {
};
DDim slice_ddim(const DDim& dim, int begin, int end) {
std::vector<int> vec;
std::vector<int64_t> vec;
vec.reserve(end - begin);
SliceVectorizeVisitor visitor(vec, begin, end);
boost::apply_visitor(visitor, dim);
......@@ -280,8 +280,17 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
return os;
}
DDim::DDim(std::initializer_list<int> init_list) {
DDim::DDim(std::initializer_list<int64_t> init_list) {
*this = make_ddim(init_list);
}
DDim flatten_to_2d(const DDim& src, int num_col_dims) {
int rank = src.size();
return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
product(slice_ddim(src, num_col_dims, rank))});
}
DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
} // namespace framework
} // namespace paddle
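For instance, flattening a rank-3 shape (a usage sketch based on the functions above):
```cpp
// With src = {2, 3, 4} and num_col_dims = 1:
//   first dim  = product(slice_ddim(src, 0, 1)) = 2
//   second dim = product(slice_ddim(src, 1, 3)) = 3 * 4 = 12
DDim src = make_ddim({2, 3, 4});
DDim mat = flatten_to_2d(src, 1);  // {2, 12}
DDim vec = flatten_to_1d(src);     // {24}
```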
......@@ -40,7 +40,7 @@ struct DDim {
template <int D>
explicit DDim(const Dim<D>& in) : var(in) {}
/*implicit*/ DDim(std::initializer_list<int> init_list);
/*implicit*/ DDim(std::initializer_list<int64_t> init_list);
template <int D>
DDim& operator=(const Dim<D>& in) {
......@@ -48,8 +48,8 @@ struct DDim {
return *this;
}
int& operator[](int idx);
int operator[](int idx) const;
int64_t& operator[](int idx);
int64_t operator[](int idx) const;
template <typename Visitor>
typename Visitor::result_type apply_visitor(Visitor& visitor) {
......@@ -71,15 +71,15 @@ struct DDim {
DDim operator*(DDim d) const;
ssize_t size() const;
int64_t size() const;
};
/**
* \brief Make a DDim from std::vector<int>
* \brief Make a DDim from std::vector<int64_t>
*
* \param dims An vector of ints. Must be sized between [1, 9]
*/
DDim make_ddim(const std::vector<int>& dims);
DDim make_ddim(const std::vector<int64_t>& dims);
/**
* \brief Make a DDim from an initializer list
......@@ -87,14 +87,14 @@ DDim make_ddim(const std::vector<int>& dims);
* \param dims An initializer list of ints. Must be sized between [1, 9]
*
*/
DDim make_ddim(std::initializer_list<int> dims);
DDim make_ddim(std::initializer_list<int64_t> dims);
int get(const DDim& dim, int idx);
int64_t get(const DDim& dim, int idx);
void set(DDim& dim, int idx, int val);
std::vector<int> vectorize(const DDim& ddim);
std::vector<int64_t> vectorize(const DDim& ddim);
ssize_t product(const DDim& ddim);
int64_t product(const DDim& ddim);
/**
* \brief Slice a ddim
......@@ -115,6 +115,12 @@ int arity(const DDim& ddim);
std::ostream& operator<<(std::ostream&, const DDim&);
// Reshape a tensor to a matrix. The matrix's first dimension(column length)
// will be the product of tensor's first `num_col_dims` dimensions.
DDim flatten_to_2d(const DDim& src, int num_col_dims);
DDim flatten_to_1d(const DDim& src);
} // namespace framework
} // namespace paddle
......
......@@ -12,7 +12,7 @@ TEST(DDim, Equality) {
EXPECT_EQ(ddim[2], 5);
// construct a DDim from a vector
std::vector<int> vec({9, 1, 5});
std::vector<int64_t> vec({9, 1, 5});
paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
EXPECT_EQ(ddim[0], 9);
EXPECT_EQ(ddim[1], 1);
......@@ -25,7 +25,7 @@ TEST(DDim, Equality) {
EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
// vectorize a DDim
std::vector<int> res_vec = paddle::framework::vectorize(vddim);
std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
EXPECT_EQ(res_vec[0], 9);
EXPECT_EQ(res_vec[1], 1);
EXPECT_EQ(res_vec[2], 5);
......
......@@ -17,13 +17,13 @@ struct Dim {
static constexpr int dimensions = i;
template <typename... Args>
HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) {
HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
static_assert(sizeof...(_tail) == i - 1,
"Dim initialized with the wrong number of parameters");
}
HOSTDEVICE
Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
HOSTDEVICE
Dim() : head(0), tail() {}
......@@ -31,12 +31,12 @@ struct Dim {
/** Construct a Dim from a linear index and size. Uses Fortran order
* indexing. */
HOSTDEVICE
Dim(int idx, const Dim<i>& size)
Dim(int64_t idx, const Dim<i>& size)
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */
HOSTDEVICE
Dim(int idx) : head(idx), tail(idx) {}
Dim(int64_t idx) : head(idx), tail(idx) {}
HOSTDEVICE
bool operator==(const Dim<i>& o) const {
......@@ -47,13 +47,13 @@ struct Dim {
bool operator!=(const Dim<i>& o) const { return !(*this == o); }
HOSTDEVICE
int& operator[](int idx);
int64_t& operator[](int idx);
HOSTDEVICE
int operator[](int idx) const;
int64_t operator[](int idx) const;
HOST std::string to_string() const;
int head;
int64_t head;
Dim<i - 1> tail;
};
......@@ -63,7 +63,7 @@ struct Dim<1> {
static constexpr int dimensions = 1;
HOSTDEVICE
Dim(int _head) : head(_head) {}
Dim(int64_t _head) : head(_head) {}
HOSTDEVICE
Dim() : head(0) {}
......@@ -86,11 +86,11 @@ struct Dim<1> {
bool operator!=(const Dim<1>& o) const { return !(*this == o); }
HOSTDEVICE
int& operator[](int idx);
int64_t& operator[](int idx);
HOSTDEVICE
int operator[](int idx) const;
int64_t operator[](int idx) const;
int head;
int64_t head;
};
namespace {
......@@ -100,12 +100,12 @@ template <int i>
struct DimGetter {
// Return a copy if Dim is const
template <typename D>
HOSTDEVICE static int impl(const D& d) {
HOSTDEVICE static int64_t impl(const D& d) {
return DimGetter<i - 1>::impl(d.tail);
}
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int& impl(D& d) {
HOSTDEVICE static int64_t& impl(D& d) {
return DimGetter<i - 1>::impl(d.tail);
}
};
......@@ -115,18 +115,18 @@ template <>
struct DimGetter<0> {
// Return a copy if Dim is const
template <typename D>
HOSTDEVICE static int impl(const D& d) {
HOSTDEVICE static int64_t impl(const D& d) {
return d.head;
}
// Return a reference if Dim is mutable
template <typename D>
HOSTDEVICE static int& impl(D& d) {
HOSTDEVICE static int64_t& impl(D& d) {
return d.head;
}
};
template <int D>
HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx < 0) {
throw std::invalid_argument("Tried to access a negative dimension");
......@@ -141,7 +141,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
......@@ -153,7 +153,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
}
template <int D>
HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx < 0) {
throw std::invalid_argument("Tried to access a negative dimension");
......@@ -168,7 +168,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
}
template <>
HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
#ifndef __CUDA_ARCH__
if (idx != 0) {
throw std::invalid_argument("Invalid index");
......@@ -182,73 +182,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
} // namespace
// Static access to constant Dim
template <int i, int l>
HOSTDEVICE int get(const Dim<l>& d) {
HOSTDEVICE int64_t get(const Dim<l>& d) {
return DimGetter<i>::impl(d);
}
// Static access to mutable Dim
template <int i, int l>
HOSTDEVICE int& get(Dim<l>& d) {
HOSTDEVICE int64_t& get(Dim<l>& d) {
return DimGetter<i>::impl(d);
}
// Dynamic access to constant Dim
template <int l>
HOSTDEVICE int Dim<l>::operator[](int i) const {
HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
template <int l>
HOSTDEVICE int& Dim<l>::operator[](int i) {
HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
return indexer(*this, i);
}
// Dynamic access to constant Dim
inline HOSTDEVICE int Dim<1>::operator[](int i) const {
inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
return indexer(*this, i);
}
// Dynamic access to mutable Dim
inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); }
inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
return indexer(*this, i);
}
// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d,
int i) {
HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
int i) {
return d[i];
}
// Dynamic access to mutable Dim
template <int l>
HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) {
HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d,
int i) {
return d[i];
}
// Dot product of two dims
template <int i>
HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) {
HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
return a.head * b.head + linearize(a.tail, b.tail);
}
// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) {
HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
return a.head * b.head;
}
// Product of a Dim
template <int i>
HOSTDEVICE int product(const Dim<i>& a, int prod = 1) {
HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
return prod * a.head * product(a.tail);
}
// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
HOSTDEVICE inline int product(const Dim<1>& a, int prod) {
HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
return prod * a.head;
}
......
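The `Dim(idx, size)` constructor above decomposes a linear index in Fortran (column-major) order: the head takes `idx % size.head` and the quotient recurses into the tail. A standalone sketch of the same arithmetic:

```c++
#include <array>
#include <cstdint>
#include <iostream>

// Column-major decomposition of a linear index, mirroring the recursive
// Dim(idx, size) constructor: out[d] = idx % size[d], then idx /= size[d].
std::array<int64_t, 3> decompose(int64_t idx,
                                 const std::array<int64_t, 3>& size) {
  std::array<int64_t, 3> out{};
  for (int d = 0; d < 3; ++d) {
    out[d] = idx % size[d];
    idx /= size[d];
  }
  return out;
}

int main() {
  // 7 == 1 + 2 * (0 + 3 * 1) for sizes {2, 3, 4}, so we expect {1, 0, 1}.
  auto c = decompose(7, {2, 3, 4});
  std::cout << c[0] << " " << c[1] << " " << c[2] << std::endl;
  return 0;
}
```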
......@@ -8,7 +8,7 @@ __global__ void test(paddle::framework::Dim<2>* o) {
o[0] = paddle::framework::make_dim(5, 6);
}
__global__ void dyn_idx_gpu(int* o) {
__global__ void dyn_idx_gpu(int64_t* o) {
auto d = paddle::framework::make_dim(5, 6);
o[0] = d[1];
}
......@@ -47,9 +47,9 @@ TEST(Dim, Equality) {
EXPECT_EQ(b[1], 11);
// dynamic access on GPU
thrust::device_vector<int> r(1);
thrust::device_vector<int64_t> r(1);
dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
int res = r[0];
int64_t res = r[0];
EXPECT_EQ(res, 6);
// ex_prefix_mul
......
......@@ -28,7 +28,7 @@ struct EigenDim {
static Type From(const DDim& dims) {
PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
Type ret;
for (int d = 0; d < arity(dims); d++) {
for (int64_t d = 0; d < arity(dims); d++) {
ret[d] = dims[d];
}
return ret;
......@@ -63,20 +63,35 @@ struct EigenTensor {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) {
int rank = tensor.dims_.size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
return EigenMatrix::From(tensor,
flatten_to_2d(tensor.dims(), num_col_dims));
}
static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
int num_col_dims) {
int rank = tensor.dims_.size();
PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
"`num_col_dims` must be between (0, rank_of_tensor).");
return EigenMatrix::From(tensor,
flatten_to_2d(tensor.dims(), num_col_dims));
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
return EigenVector::From(tensor, {product(tensor.dims_)});
}
static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
return EigenVector::From(tensor, {product(tensor.dims_)});
}
};
......
......@@ -108,5 +108,24 @@ TEST(Eigen, Matrix) {
}
}
TEST(Eigen, MatrixReshape) {
Tensor t;
float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
for (int i = 0; i < 2 * 3 * 6 * 4; ++i) {
p[i] = static_cast<float>(i);
}
EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
ASSERT_EQ(2 * 3, em.dimension(0));
ASSERT_EQ(6 * 4, em.dimension(1));
for (int i = 0; i < 2 * 3; i++) {
for (int j = 0; j < 6 * 4; j++) {
ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f);
}
}
}
} // namespace framework
} // namespace paddle
......@@ -22,8 +22,14 @@ enum AttrType {
INTS = 3;
FLOATS = 4;
STRINGS = 5;
INT_PAIRS = 6;
}
message IntPair {
required int32 first = 1;
required int32 second = 2;
};
// OpDesc describes an instance of a C++ framework::OperatorBase
// derived class type.
message OpDesc {
......@@ -37,6 +43,7 @@ message OpDesc {
repeated int32 ints = 6;
repeated float floats = 7;
repeated string strings = 8;
repeated IntPair int_pairs = 9;
};
message Var {
......@@ -80,3 +87,24 @@ message OpProto {
repeated Attr attrs = 4;
required string comment = 5;
}
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [ default = 0 ];
}
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}
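For illustration, this is how the new `VarDesc`/`LoDTensorDesc` messages might be filled from C++ through the generated protobuf API; the header path and the `paddle.framework` package are assumptions here, not part of the patch:

```c++
#include "paddle/framework/framework.pb.h"  // assumed generated header

int main() {
  paddle::framework::VarDesc var;
  var.set_name("image");
  auto* desc = var.mutable_lod_tensor();
  desc->set_data_type(paddle::framework::FP32);
  // [UNK, 640, 480] is stored as [-1, 640, 480], as the comment above notes.
  desc->add_dims(-1);
  desc->add_dims(640);
  desc->add_dims(480);
  desc->set_lod_level(0);
  return 0;
}
```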
......@@ -3,7 +3,7 @@
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
USE_OP(add_two);
USE_OP(add);
namespace paddle {
namespace framework {
......@@ -41,7 +41,7 @@ namespace f = paddle::framework;
TEST(GradOpBuilder, AddTwo) {
std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
"add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
"add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_add_op =
f::OpRegistry::CreateGradOp(*add_op);
EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
......@@ -54,8 +54,8 @@ TEST(GradOpBuilder, AddTwo) {
EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
}
REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, f::NOP);
REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, f::NOP);
REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
TEST(GradOpBuilder, MutiInOut) {
std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
......
......@@ -19,8 +19,8 @@
namespace paddle {
namespace framework {
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
LOD new_lod;
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(in.at(i));
......@@ -28,10 +28,10 @@ LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
return new_lod;
}
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end) {
// slice the lod.
LOD new_lod;
LoD new_lod;
new_lod.reserve(in.size() - level);
auto start = in.at(level)[elem_begin];
auto end = in.at(level)[elem_end];
......@@ -46,13 +46,13 @@ LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
std::transform(new_lod.back().begin(), new_lod.back().end(),
new_lod.back().begin(),
[start](size_t v) { return v - start; });
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD");
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
}
PADDLE_ENFORCE_LE(new_lod.size(), in.size());
return new_lod;
}
bool operator==(const LOD& a, const LOD& b) {
bool operator==(const LoD& a, const LoD& b) {
if (a.size() != b.size()) {
return false;
}
......@@ -72,12 +72,12 @@ bool operator==(const LOD& a, const LOD& b) {
return true;
}
void LODTensor::SliceLevels(size_t level_begin, size_t level_end) {
void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) {
auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
lod_ = new_lod;
}
void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
NumLevels());
PADDLE_ENFORCE(elem_begin < NumElements(level),
......
......@@ -18,8 +18,10 @@
#ifndef PADDLE_ONLY_CPU
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif
#include <glog/logging.h>
#include "paddle/framework/ddim.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/enforce.h"
......@@ -32,37 +34,38 @@ template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<T>;
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
using LOD = std::vector<Vector<size_t>>;
using LoD = std::vector<Vector<size_t>>;
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end);
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end);
bool operator==(const LOD& a, const LOD& b);
bool operator==(const LoD& a, const LoD& b);
/*
* LODTensor (Level of details Tensor)
* LoDTensor (Level of details Tensor)
* see https://en.wikipedia.org/wiki/Level_of_details for reference.
*/
class LODTensor {
class LoDTensor {
public:
LODTensor() {}
LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
LoDTensor() {}
LoDTensor(const LoD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
void set_lod(const LOD& lod) { lod_ = lod; }
void set_lod(const LoD& lod) { lod_ = lod; }
void set_tensor(Tensor* tensor) { tensor_ = tensor; }
Tensor& tensor() { return *tensor_; }
LOD lod() { return lod_; }
LoD lod() { return lod_; }
/*
* Get a element from LOD.
 * Get an element from LoD.
*/
size_t lod_element(size_t level, size_t elem) const {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
......@@ -74,7 +77,7 @@ class LODTensor {
}
/*
* Number of LODTensor's levels, each level has units of data, for example,
* Number of LoDTensor's levels, each level has units of data, for example,
* in the sentence's view, article, paragraph, sentence are 3 levels.
*/
size_t NumLevels() const { return lod_.size(); }
......@@ -100,7 +103,7 @@ class LODTensor {
void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
private:
LOD lod_;
LoD lod_;
Tensor* tensor_; // not owned
};
} // namespace framework
......
......@@ -94,7 +94,7 @@ Let's go on slicing this slice. Its <1,1>-slice is
|||
```
### The General Slicing Algorithm
### The Slicing Algorithm
The algorithm, with over-simplified data structure, is defined as
......@@ -106,17 +106,41 @@ struct LoDTensor {
float* tensor_;
};
LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) {
LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
```
Let us revisit the example above
}
```
3
3 1 2
3 2 4 1 2 3
||| || |||| | || |||
```
### Slicing the Top Level
Suppose that we want to retrieve the <1,2>-slice
Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs.
```
2
2 3
|| |||
```
```c++
LoDTensor Slice(const LoDTensor& lodt, int sequence) {
We will need to find the starting position of this slice by summing the lengths of all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
To avoid traversing the LoD tree at slicing time, we can do this work at construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offsets of the next level. For example, the above LoD Tensor can be transformed into
```
0
0 9 10
0 3 5 9 10 12
||| || |||| | || |||
```
We don't really need the 0 on top, so the LoD Tensor could be
}
```
0 9 10
0 3 5 9 10 12
||| || |||| | || |||
```
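The length-to-offset transformation described above is a plain exclusive prefix sum; a minimal sketch:

```c++
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Leaf-level lengths from the example: {3, 2, 4, 1, 2, 3}.
  std::vector<int> lengths = {3, 2, 4, 1, 2, 3};
  // Exclusive prefix sum gives the starting offsets {0, 3, 5, 9, 10, 12}.
  std::vector<int> offsets(lengths.size(), 0);
  std::partial_sum(lengths.begin(), lengths.end() - 1, offsets.begin() + 1);
  for (int o : offsets) std::cout << o << " ";  // 0 3 5 9 10 12
  return 0;
}
```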
......@@ -21,7 +21,7 @@
namespace paddle {
namespace framework {
class LODTensorTester : public ::testing::Test {
class LoDTensorTester : public ::testing::Test {
public:
virtual void SetUp() override {
// tensor's batch_size: 30
......@@ -29,7 +29,7 @@ class LODTensorTester : public ::testing::Test {
// 0 10 20
// 0 5 10 15 20
// 0 2 5 7 10 12 15 17 20
LOD lod;
LoD lod;
lod.push_back(std::vector<size_t>{0, 10, 20});
lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
......@@ -47,21 +47,21 @@ class LODTensorTester : public ::testing::Test {
protected:
platform::CPUPlace place;
Tensor tensor;
LODTensor lod_tensor;
LoDTensor lod_tensor;
};
TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
TEST_F(LODTensorTester, NumElements) {
TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
}
TEST_F(LODTensorTester, SliceLevels) {
TEST_F(LoDTensorTester, SliceLevels) {
// slice 1 level
for (size_t level = 0; level < 3UL; ++level) {
LODTensor new_lod_tensor = lod_tensor;
LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
......@@ -70,7 +70,7 @@ TEST_F(LODTensorTester, SliceLevels) {
}
// slice 2 level
for (size_t level = 0; level < 2UL; ++level) {
LODTensor new_lod_tensor = lod_tensor;
LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
......@@ -80,9 +80,9 @@ TEST_F(LODTensorTester, SliceLevels) {
}
}
TEST_F(LODTensorTester, SliceInLevel) {
TEST_F(LoDTensorTester, SliceInLevel) {
size_t level = 0;
LODTensor new_lod_tensor = lod_tensor;
LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceInLevel(level, 0, 2);
EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
......
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h"
#include <gtest/gtest.h>
__global__ void test(size_t* a, int size) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
i += blockDim.x * gridDim.x) {
a[i] *= 2;
}
}
TEST(LoDTensor, LoDInGPU) {
paddle::framework::Tensor tensor;
paddle::framework::LoDTensor lod_tensor;
paddle::platform::GPUPlace place(0);
paddle::framework::LoD src_lod;
src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
tensor.Resize({14, 16});
tensor.mutable_data<float>(place);
lod_tensor.set_lod(src_lod);
lod_tensor.set_tensor(&tensor);
CHECK_EQ(lod_tensor.lod_element(0, 2), 4);
CHECK_EQ(lod_tensor.lod_element(0, 4), 8);
auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size());
cudaDeviceSynchronize();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
CHECK_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
}
}
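The kernel above can read `lod[0].data()` directly because the `Vector` alias now uses thrust's pinned allocator: the LoD arrays live in page-locked host memory, which (under CUDA's unified virtual addressing) is addressable from device code and supports asynchronous copies. A minimal sketch of the allocator, independent of Paddle (requires the CUDA toolkit):

```c++
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

// Host vector backed by page-locked (pinned) memory.
template <typename T>
using PinnedVector = thrust::host_vector<
    T, thrust::system::cuda::experimental::pinned_allocator<T>>;

int main() {
  PinnedVector<size_t> level(4);
  level[0] = 0; level[1] = 2; level[2] = 4; level[3] = 6;
  size_t* raw = level.data();  // pinned host pointer, device-visible with UVA
  (void)raw;
  return 0;
}
```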
......@@ -33,7 +33,8 @@ namespace framework {
class OpRegistry {
public:
template <typename OpType, typename ProtoMakerType, typename GradOpType>
static void RegisterOp(const std::string& op_type) {
static void RegisterOp(const std::string& op_type,
const std::string& grad_op_type) {
PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
"'%s' is registered more than once.", op_type);
OpInfo op_info;
......@@ -42,9 +43,9 @@ class OpRegistry {
const VariableNameMap& outputs, const AttributeMap& attrs) {
return new OpType(type, inputs, outputs, attrs);
};
op_info.grad_op_type_ = grad_op_type;
if (std::type_index(typeid(ProtoMakerType)) !=
std::type_index(typeid(NOPMaker))) {
op_info.grad_op_type_ = op_type + "_grad";
op_info.proto_ = new OpProto;
op_info.checker_ = new OpAttrChecker;
auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
......@@ -54,14 +55,15 @@ class OpRegistry {
op_info.proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, op_info.proto_->InitializationErrorString());
// register gradient op
RegisterOp<GradOpType, NOPMaker, NOP>(op_info.grad_op_type_);
} else {
op_info.grad_op_type_ = "";
op_info.proto_ = nullptr;
op_info.checker_ = nullptr;
}
OpInfoMap::Instance().Insert(op_type, op_info);
// register gradient op
if (!grad_op_type.empty()) {
RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
}
}
static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
......@@ -90,8 +92,10 @@ class Registrar {
template <typename OpType, typename ProtoMakerType, typename GradOpType>
class OpRegistrar : public Registrar {
public:
explicit OpRegistrar(const char* op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type);
explicit OpRegistrar(const char* op_type) : OpRegistrar(op_type, "") {}
OpRegistrar(const char* op_type, const char* grad_op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type,
grad_op_type);
}
};
......@@ -117,7 +121,8 @@ class OpKernelRegistrar : public Registrar {
/**
* Macro to register Operator.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_class) \
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
class _OpClass_##op_type##_ : public op_class { \
......@@ -132,14 +137,14 @@ class OpKernelRegistrar : public Registrar {
}; \
static ::paddle::framework::OpRegistrar< \
_OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_> \
__op_registrar_##op_type##__(#op_type); \
__op_registrar_##op_type##__(#op_type, #grad_op_type); \
int TouchOpRegistrar_##op_type() { \
__op_registrar_##op_type##__.Touch(); \
return 0; \
}
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OP(op_type, op_class, op_maker_class, ::paddle::framework::NOP)
REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
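The gradient op name is no longer derived as `op_type + "_grad"` inside `RegisterOp`; the macro now takes it explicitly. A hedged usage sketch -- `MyOp`, `MyOpMaker`, and `MyGradOp` are hypothetical classes standing in for real `OperatorBase`/`OpProtoAndCheckerMaker` subclasses:

```c++
// Registers "my_op" and, in the same call, its gradient op "my_op_grad".
REGISTER_OP(my_op, MyOp, MyOpMaker, my_op_grad, MyGradOp);

// Ops without a gradient keep the three-argument convenience macro, which
// forwards an empty grad_op_type so no gradient op is registered.
REGISTER_OP_WITHOUT_GRADIENT(my_nograd_op, MyOp, MyOpMaker);
```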
/**
* Macro to register OperatorKernel.
......@@ -194,6 +199,8 @@ class OpKernelRegistrar : public Registrar {
USE_OP_DEVICE_KERNEL(op_type, GPU)
#endif
#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
#define USE_CPU_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
......
......@@ -21,7 +21,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddOutput("output", "output of cosine op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
.GreaterThan(0.0);
AddComment("This is cos op");
}
};
......@@ -80,7 +80,7 @@ TEST(OpRegistry, CreateOp) {
paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
float scale_get = op->GetAttr<float>("scale");
float scale_get = op->Attr<float>("scale");
ASSERT_EQ(scale_get, scale);
}
......@@ -121,7 +121,7 @@ TEST(OpRegistry, DefaultValue) {
paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx);
ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
ASSERT_EQ(op->Attr<float>("scale"), 1.0);
}
TEST(OpRegistry, CustomChecker) {
......@@ -172,38 +172,6 @@ TEST(OpRegistry, CustomChecker) {
paddle::platform::CPUDeviceContext dev_ctx;
paddle::framework::Scope scope;
op->Run(scope, dev_ctx);
int test_attr = op->GetAttr<int>("test_attr");
int test_attr = op->Attr<int>("test_attr");
ASSERT_EQ(test_attr, 4);
}
class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker {
public:
TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<float>("scale", "scale of test op");
AddAttr<float>("scale", "scale of test op");
}
};
TEST(ProtoMaker, DuplicatedAttr) {
pd::OpProto op_proto;
pd::OpAttrChecker op_checker;
auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
public:
TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op");
AddInput("input", "input of test op");
}
};
TEST(ProtoMaker, DuplicatedInOut) {
pd::OpProto op_proto;
pd::OpAttrChecker op_checker;
auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
}
\ No newline at end of file
......@@ -123,6 +123,15 @@ OperatorBase::OperatorBase(const std::string& type,
CheckAllInputOutputSet();
}
std::vector<std::string> OperatorBase::InputVars() const {
std::vector<std::string> ret_val;
for (auto& o : inputs_) {
ret_val.reserve(ret_val.size() + o.second.size());
ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
}
return ret_val;
}
std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
std::vector<std::string> ret_val;
if (has_intermediate) {
......
......@@ -69,7 +69,7 @@ class OperatorBase {
virtual ~OperatorBase() {}
template <typename T>
inline const T& GetAttr(const std::string& name) const {
inline const T& Attr(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name);
return boost::get<T>(attrs_.at(name));
......@@ -94,11 +94,14 @@ class OperatorBase {
const VariableNameMap& Inputs() const { return inputs_; }
const VariableNameMap& Outputs() const { return outputs_; }
//! Get a input with argument's name described in `op_proto`
std::string Input(const std::string& name) const;
//! Get a input which has multiple variables.
const std::vector<std::string>& Inputs(const std::string& name) const;
std::vector<std::string> InputVars() const;
//! Get a output with argument's name described in `op_proto`
std::string Output(const std::string& name) const;
//! Get an output which has multiple variables.
......@@ -233,6 +236,15 @@ class InferShapeContext {
InferShapeContext(const OperatorBase& op, const Scope& scope)
: op_(op), scope_(scope) {}
const OperatorBase& op() const { return op_; }
const Scope& scope() const { return scope_; }
template <typename T>
inline const T& Attr(const std::string& name) const {
return op_.Attr<T>(name);
}
size_t InputSize(const std::string& name) const {
return op_.Inputs(name).size();
}
......@@ -302,9 +314,9 @@ class InferShapeContext {
}
template <typename T>
std::vector<const T*> MultiOutput(const std::string& name) const {
std::vector<T*> MultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<const T*> res;
std::vector<T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) {
......@@ -314,6 +326,7 @@ class InferShapeContext {
return res;
}
private:
const OperatorBase& op_;
const Scope& scope_;
};
......
......@@ -102,7 +102,7 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddOutput("y", "output of test op");
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
.GreaterThan(0.0);
AddComment("This is test op");
}
};
......@@ -122,10 +122,10 @@ class CPUKernelTest : public OpKernel {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << "this is cpu kernel" << std::endl;
std::cout << ctx.op_.DebugString() << std::endl;
std::cout << ctx.op().DebugString() << std::endl;
cpu_kernel_run_num++;
ASSERT_EQ(ctx.op_.Input("x"), "IN1");
ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
ASSERT_EQ(ctx.op().Input("x"), "IN1");
ASSERT_EQ(ctx.op().Output("y"), "OUT1");
}
};
......@@ -140,7 +140,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
AddOutput("ys", "outputs of test op").AsDuplicable();
AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0)
.LargerThan(0.0);
.GreaterThan(0.0);
AddComment("This is test op");
}
};
......@@ -148,7 +148,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
class CPUKernalMultiInputsTest : public OpKernel {
public:
void Compute(const ExecutionContext& ctx) const {
auto xs = ctx.op_.Inputs("xs");
auto xs = ctx.op().Inputs("xs");
ASSERT_EQ(xs.size(), 3UL);
ASSERT_EQ(xs[0], "x0");
ASSERT_EQ(xs[1], "x1");
......@@ -172,10 +172,10 @@ class CPUKernalMultiInputsTest : public OpKernel {
auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
ASSERT_EQ(outTensor0.size(), 2U);
auto k = ctx.op_.Input("k");
auto k = ctx.op().Input("k");
ASSERT_EQ(k, "k0");
auto ys = ctx.op_.Outputs("ys");
auto ys = ctx.op().Outputs("ys");
ASSERT_EQ(ys.size(), 2UL);
ASSERT_EQ(ys[0], "y0");
ASSERT_EQ(ys[1], "y1");
......@@ -263,4 +263,38 @@ TEST(Operator, Clone) {
OperatorClone a("ABC", {}, {}, {});
auto b = a.Clone();
ASSERT_EQ(a.Type(), b->Type());
}
class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
TestAttrProtoMaker(paddle::framework::OpProto* proto,
paddle::framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddAttr<float>("scale", "scale of test op");
AddAttr<float>("scale", "scale of test op");
}
};
TEST(ProtoMaker, DuplicatedAttr) {
paddle::framework::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
TestInOutProtoMaker(paddle::framework::OpProto* proto,
paddle::framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input of test op");
AddInput("input", "input of test op");
}
};
TEST(ProtoMaker, DuplicatedInOut) {
paddle::framework::OpProto op_proto;
paddle::framework::OpAttrChecker op_checker;
auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
}
\ No newline at end of file
......@@ -43,6 +43,9 @@ class Tensor {
template <typename T, size_t D, int MajorType, typename IndexType>
friend struct EigenTensor;
template <typename T, int MajorType, typename IndexType>
friend struct EigenMatrix;
template <typename T, int MajorType, typename IndexType>
friend struct EigenVector;
......@@ -78,6 +81,9 @@ class Tensor {
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
/*! Return the number of elements (numel) of the tensor. */
inline int64_t numel() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
......@@ -159,6 +165,12 @@ class Tensor {
/*! points to dimensions of memory block. */
DDim dims_;
/**
* A cache of the number of elements in a tensor.
* Would be 0 for an uninitialized tensor.
*/
int64_t numel_{0};
/**
* @brief A PlaceHolder may be shared by more than one tensor.
*
......
......@@ -24,7 +24,7 @@ inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
holder_->size(), product(dims_) * sizeof(T) + offset_,
holder_->size(), numel() * sizeof(T) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
......@@ -54,11 +54,11 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
PADDLE_ENFORCE_GT(product(dims_), 0,
PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
/* some versions of boost::variant don't have operator!= */
size_t size = product(dims_) * sizeof(T);
int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
......@@ -97,7 +97,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto size = product(src.dims_) * sizeof(T);
auto size = src.numel() * sizeof(T);
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
......@@ -131,7 +131,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
PADDLE_ENFORCE_LT(begin_idx, end_idx,
"Begin index must be less than end index.");
PADDLE_ENFORCE_NE(dims_[0], 1, "Can not slice a tensor with dims_[0] = 1.");
int base = product(dims_) / dims_[0];
size_t base = numel() / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
......@@ -143,10 +143,21 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
inline Tensor& Tensor::Resize(const DDim& dims) {
dims_ = dims;
numel_ = product(dims_);
return *this;
}
inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return numel_; }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith<T>(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}
} // namespace framework
} // namespace paddle
......@@ -262,3 +262,16 @@ TEST(Tensor, CopyFrom) {
}
#endif
}
TEST(Tensor, ReshapeToMatrix) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor src;
int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, CPUPlace());
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i;
}
Tensor res = ReshapeToMatrix<int>(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9);
}
\ No newline at end of file
......@@ -44,6 +44,7 @@ if(WITH_GPU)
add_simple_unittest(RowConvOpTest)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
endif()
add_simple_unittest(Im2ColTest)
......
......@@ -83,9 +83,9 @@ struct EigenBlasGemm {
};
#ifdef PADDLE_TYPE_DOUBLE
template class EigenBlasGemm<double>;
template struct EigenBlasGemm<double>;
#else
template class EigenBlasGemm<float>;
template struct EigenBlasGemm<float>;
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "GemmFunctor.h"
#include "hl_cpu_gru.cuh"
namespace paddle {
template <DeviceType Device, class T>
struct GruFunctor {
template <class OpResetOutput, class OpFinalOutput>
static void compute(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
#ifndef __NVCC__
if (value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
false,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(
opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
false,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(
opFinalOutput, value, frameSize, batchSize, active_node);
#endif
}
};
template <DeviceType Device, class T>
struct GruGradFunctor {
template <class OpStateGrad, class OpResetGrad>
static void compute(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
#ifndef __NVCC__
backward_state_grad(
opStateGrad, value, grad, frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
BlasGemm<Device, T>::compute(false,
true,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
BlasGemm<Device, T>::compute(true,
false,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(
opResetGrad, value, grad, frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
true,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
BlasGemm<Device, T>::compute(true,
false,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
#endif
}
};
} // namespace paddle
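The first `BlasGemm` call above computes the update/reset gate pre-activations: a (batchSize x frameSize) previous-output matrix times a (frameSize x 2*frameSize) weight matrix, accumulated (beta = 1) into the first 2*frameSize columns of the (batchSize x 3*frameSize) gate buffer. A plain-loop sketch of that bookkeeping, not the Paddle implementation:

```c++
#include <vector>

// Accumulate prevOut[batch x frame] * gateWeight[frame x 2*frame] into the
// first 2*frame columns of gate, whose rows are 3*frame wide.
void gru_gate_gemm(const std::vector<float>& prevOut,
                   const std::vector<float>& gateWeight,
                   std::vector<float>* gate,
                   int batchSize, int frameSize) {
  for (int b = 0; b < batchSize; ++b) {
    for (int j = 0; j < 2 * frameSize; ++j) {
      float acc = 0.f;
      for (int k = 0; k < frameSize; ++k) {
        acc += prevOut[b * frameSize + k] *
               gateWeight[k * 2 * frameSize + j];
      }
      (*gate)[b * 3 * frameSize + j] += acc;  // beta == 1: accumulate
    }
  }
}
```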
......@@ -94,95 +94,4 @@ public:
int paddingWidth);
};
template <class T>
struct Padding {
static void run(const T* src,
T* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
memcpy(dest, src, inputWidth * sizeof(T));
dest += inputWidth;
src += inputWidth;
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
}
}
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
struct Padding<float> {
static void run(const float* src,
float* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(src);
vst1q_f32(dest, s0);
src += 4;
dest += 4;
}
for (int r = 0; r < remain; r++) {
*dest++ = *src++;
}
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
}
}
};
#endif
} // namespace paddle
......@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "MulOp.h"
/// todo(tianbing), delete it
#include <iostream>
#include "paddle/math/MathFunctions.h"
#include "GemmFunctor.h"
#include "paddle/math/SIMDFunctions.h"
#include "paddle/utils/ThreadLocal.h"
#ifndef PADDLE_TYPE_DOUBLE
#define GEMM paddle::gemm<float>
#else
#define GEMM paddle::gemm<double>
#endif
namespace {
inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
......@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
real scaleT,
bool aTrans,
bool bTrans) {
GEMM(aTrans ? CblasTrans : CblasNoTrans,
bTrans ? CblasTrans : CblasNoTrans,
out.getHeight(),
out.getWidth(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
a.getData(),
a.getStride(),
b.getData(),
b.getStride(),
scaleT,
out.getData(),
out.getStride());
BlasGemm<DEVICE_TYPE_CPU, real>::compute(
aTrans,
bTrans,
out.getHeight(),
out.getWidth(),
!aTrans ? a.getWidth() : a.getHeight(),
scaleAB,
a.getData(),
a.getStride(),
b.getData(),
b.getStride(),
scaleT,
out.getData(),
out.getStride());
}
/// dense matrix (+)= sparse matrix * dense matrix
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "SwitchOp.h"
#include "paddle/math/Vector.h"
namespace paddle {
template <>
void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argType) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < inC; ++c) {
for (int h = 0; h < inH; ++h) {
for (int w = 0; w < inW; ++w) {
if (argType == ADD_TO) {
outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
} else {
outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
}
}
}
}
}
}
template <>
void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
const real* inputs,
const int num,
const int inH,
const int inW,
const int inC,
const int argType) {
for (int n = 0; n < num; ++n) {
for (int h = 0; h < inH; ++h) {
for (int w = 0; w < inW; ++w) {
for (int c = 0; c < inC; ++c) {
if (argType == ADD_TO) {
outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
} else {
outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
}
}
}
}
}
}
/**
 * \brief Switch the dimension order of an image input.
 *        The input and output are 4D tensors. Switches the order
 *        'batch_size, channels, height, width' to
 *        'batch_size, height, width, channels'.
 *
 * Arguments in this Function:
 * \param inputs input data with order 'batch_size, channels, height, width'.
 * \param outputs output data with order 'batch_size, height, width, channels'.
*/
template <DeviceType Device>
class NCHW2NHWCFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
CHECK_EQ(1UL, outputs.size());
size_t num = inputs[0].shape()[0];
size_t inC = inputs[0].shape()[1];
size_t inH = inputs[0].shape()[2];
size_t inW = inputs[0].shape()[3];
NCHW2NHWC<Device>(outputs[0].data<real>(),
inputs[0].data<real>(),
num,
inC,
inH,
inW,
outputs[0].getArgType());
}
};
/**
 * \brief Switch the dimension order of an image input.
 *        The input and output are 4D tensors. Switches the order
 *        'batch_size, height, width, channels' to
 *        'batch_size, channels, height, width'.
 *
 * Arguments in this Function:
 * \param inputs input data with order 'batch_size, height, width, channels'.
 * \param outputs output data with order 'batch_size, channels, height, width'.
*/
template <DeviceType Device>
class NHWC2NCHWFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
CHECK_EQ(1UL, outputs.size());
size_t num = inputs[0].shape()[0];
size_t inH = inputs[0].shape()[1];
size_t inW = inputs[0].shape()[2];
size_t inC = inputs[0].shape()[3];
NHWC2NCHW<Device>(outputs[0].data<real>(),
inputs[0].data<real>(),
num,
inH,
inW,
inC,
outputs[0].getArgType());
}
};
REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
#endif
} // namespace paddle
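The index arithmetic of the CPU specialization above boils down to one mapping: input offset `((n*C + c)*H + h)*W + w` goes to output offset `((n*H + h)*W + w)*C + c`. A standalone sketch of the ASSIGN_TO case:

```c++
#include <vector>

// NCHW -> NHWC relayout (assignment case of NCHW2NHWC above).
std::vector<float> nchw_to_nhwc(const std::vector<float>& in,
                                int N, int C, int H, int W) {
  std::vector<float> out(in.size());
  int idx = 0;  // walks the NCHW input linearly, as the CPU loop does
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          out[((n * H + h) * W + w) * C + c] = in[idx++];
  return out;
}
```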
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
 * \brief This function switches the dimension order of an image input.
 *        The input and output are 4D tensors. Switches the order
 *        'batch_size, channels, height, width' to
 *        'batch_size, height, width, channels'.
 *
 * \param[out] outputs save results.
 * \param[in]  inputs  input data.
 * \param[in]  num     batch size of input data.
 * \param[in]  inC     channel number of input data.
 * \param[in]  inH     height of input data.
 * \param[in]  inW     width of input data.
 * \param[in]  argType type of output argument.
*/
template <DeviceType Device>
void NCHW2NHWC(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argType);
/**
 * \brief This function switches the dimension order of an image input.
 *        The input and output are 4D tensors. Switches the order
 *        'batch_size, height, width, channels' to
 *        'batch_size, channels, height, width'.
 *
 * \param[out] inGrad  gradients of previous layer.
 * \param[in]  outGrad output gradients.
 * \param[in]  num     batch size of input data.
 * \param[in]  inH     height of input data.
 * \param[in]  inW     width of input data.
* \param[in] inC channel number of input data.
* \param[in] argType type of output argument.
*/
template <DeviceType Device>
void NHWC2NCHW(real* inGrad,
const real* outGrad,
const int num,
const int inH,
const int inW,
const int inC,
const int argType);
} // namespace paddle
/* Copyright (c) 2016 Paddle
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "SwitchOp.h"
#include "hl_base.h"
namespace paddle {
__global__ void KeNCHW2NHWC(real* outputs,
const real* inputs,
int inC,
int inH,
int inW,
int nthreads,
int argType) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) {
const int w = idx % inW;
const int h = (idx / inW) % inH;
const int c = (idx / inW / inH) % inC;
const int n = idx / inW / inH / inC;
const int off = ((n * inH + h) * inW + w) * inC + c;
if (argType == ADD_TO) {
outputs[off] += inputs[idx];
} else {
outputs[off] = inputs[idx];
}
}
}
template <>
void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argType) {
size_t nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
outputs, inputs, inC, inH, inW, nth, argType);
CHECK_SYNC("NCHW2NHWC");
}
__global__ void KeNHWC2NCHW(real* outputs,
const real* inputs,
int inH,
int inW,
int inC,
int nthreads,
int argType) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) {
const int c = idx % inC;
const int w = (idx / inC) % inW;
const int h = (idx / inC / inW) % inH;
const int n = idx / inW / inH / inC;
const int off = ((n * inC + c) * inH + h) * inW + w;
if (argType == ADD_TO) {
outputs[off] += inputs[idx];
} else {
outputs[off] = inputs[idx];
}
}
}
template <>
void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
const real* inputs,
const int num,
const int inH,
const int inW,
const int inC,
const int argType) {
int nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
outputs, inputs, inH, inW, inC, nth, argType);
CHECK_SYNC("NHWC2NCHW");
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
namespace paddle {
TEST(Switch, real) {
for (size_t numSamples : {1, 4, 8, 16}) {
for (size_t channels : {1, 4, 8, 16}) {
for (size_t imgSizeH : {1, 4, 8, 16}) {
for (size_t imgSizeW : {1, 4, 8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
for (bool test_grad : {true, false}) {
CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
FuncConfig());
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
compare.addInputs(
BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
compare.addOutputs(BufferArg(
VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
compare.run();
}
}
}
}
}
}
} // namespace paddle
(This file's diff is collapsed.)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "NeonDepthwiseConv.h"
#include "paddle/function/ConvOp.h"
namespace paddle {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <DeviceType Device>
class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
int batchSize = input[0];
int inputChannels = input[1];
int inputHeight = input[2];
int inputWidth = input[3];
int filterHeight = getFilterHeight(filter);
int filterWidth = getFilterWidth(filter);
int outputChannels = output[1];
int outputHeight = output[2];
int outputWidth = output[3];
int filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
// only support strideH() == strideW() and filterHeight == filterWidth.
CHECK_EQ(strideH(), strideW());
CHECK_EQ(paddingH(), paddingW());
CHECK_EQ(filterHeight, filterWidth);
float* inputData = inputs[0].data<float>();
float* filterData = inputs[1].data<float>();
float* outputData = outputs[0].data<float>();
// padding the input, input -> inputPadding
float* inputPadding = inputData;
int padInputHeight =
(inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
int padInputWidth =
(inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
resizeBuffer<Device>(newSize);
inputPadding = reinterpret_cast<float*>(memory_->getBuf());
if (strideH() == 1) {
neon::Padding<float>::run(inputData,
inputPadding,
batchSize * inputChannels,
inputHeight,
inputWidth,
padInputHeight,
padInputWidth);
} else if (strideH() == 2) {
neon::StridePadding::run(inputData,
inputPadding,
batchSize * inputChannels,
inputHeight,
inputWidth,
padInputHeight,
padInputWidth);
} else {
LOG(FATAL) << "Not supported";
}
}
std::function<void(
const float*, const float*, int, int, int, int, int, int, float*)>
DepthWiseConv;
if (filterWidth == 3) {
DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
} else if (filterWidth == 4) {
DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
} else {
LOG(FATAL) << "Not supported";
}
for (int i = 0; i < batchSize; i++) {
DepthWiseConv(inputPadding,
filterData,
padInputHeight,
padInputWidth,
outputChannels,
outputHeight,
outputWidth,
filterMultiplier,
outputData);
inputPadding += inputChannels * padInputHeight * padInputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
}
};
#ifndef PADDLE_TYPE_DOUBLE
REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
CPU,
NeonDepthwiseConvTransposeFunction);
#endif
#endif
} // namespace paddle
......@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
return vget_lane_f32(vpadd_f32(v, v), 0);
}
inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
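// NOTE: a macro rather than an inline function, so that `lane` reaches
// vgetq_lane_f32 as a compile-time constant; some toolchains reject the
// function form because the intrinsic requires an immediate lane index.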
#define vmlaq_laneq_f32(a, b, v, lane) \
vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
#endif
} // namespace neon
......
......@@ -14,18 +14,20 @@ limitations under the License. */
#include "Evaluator.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/utils/StringUtil.h"
namespace paddle {
/**
* calculate sequence-to-sequence edit distance
*/
class CTCErrorEvaluator : public NotGetableEvaluator {
class CTCErrorEvaluator : public Evaluator {
private:
MatrixPtr outActivations_;
int numTimes_, numClasses_, numSequences_, blank_;
real deletions_, insertions_, substitutions_;
int seqClassficationError_;
mutable std::unordered_map<std::string, real> evalResults_;
std::vector<int> path2String(const std::vector<int>& path) {
std::vector<int> str;
......@@ -183,6 +185,18 @@ private:
return stringAlignment(gtStr, recogStr);
}
void storeLocalValues() const {
evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
evalResults_["deletion_error"] =
numSequences_ ? deletions_ / numSequences_ : 0;
evalResults_["insertion_error"] =
numSequences_ ? insertions_ / numSequences_ : 0;
evalResults_["substitution_error"] =
numSequences_ ? substitutions_ / numSequences_ : 0;
evalResults_["sequence_error"] =
(real)seqClassficationError_ / numSequences_;
}
public:
CTCErrorEvaluator()
: numTimes_(0),
......@@ -245,16 +259,12 @@ public:
}
virtual void printStats(std::ostream& os) const {
os << config_.name() << "="
<< (numSequences_ ? totalScore_ / numSequences_ : 0);
os << " deletions error"
<< "=" << (numSequences_ ? deletions_ / numSequences_ : 0);
os << " insertions error"
<< "=" << (numSequences_ ? insertions_ / numSequences_ : 0);
os << " substitutions error"
<< "=" << (numSequences_ ? substitutions_ / numSequences_ : 0);
os << " sequences error"
<< "=" << (real)seqClassficationError_ / numSequences_;
storeLocalValues();
os << config_.name() << " error = " << evalResults_["error"];
os << " deletions error = " << evalResults_["deletion_error"];
os << " insertions error = " << evalResults_["insertion_error"];
os << " substitution error = " << evalResults_["substitution_error"];
os << " sequence error = " << evalResults_["sequence_error"];
}
virtual void distributeEval(ParameterClient2* client) {
......@@ -272,6 +282,37 @@ public:
seqClassficationError_ = (int)buf[4];
numSequences_ = (int)buf[5];
}
void getNames(std::vector<std::string>* names) {
storeLocalValues();
names->reserve(names->size() + evalResults_.size());
for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
names->push_back(config_.name() + "." + it->first);
}
}
real getValue(const std::string& name, Error* err) const {
storeLocalValues();
std::vector<std::string> buffers;
paddle::str::split(name, '.', &buffers);
auto it = evalResults_.find(buffers[buffers.size() - 1]);
if (it == evalResults_.end()) {
*err = Error("Evaluator does not have the key %s", name.c_str());
return 0.0f;
}
return it->second;
}
std::string getType(const std::string& name, Error* err) const {
this->getValue(name, err);
if (!err->isOK()) {
return "";
}
return "ctc_edit_distance";
}
};
REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
......
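Switching the base class from NotGetableEvaluator to Evaluator makes the CTC metrics queryable by name instead of print-only: storeLocalValues() caches them in a map, getNames() exports them as "<evaluator>.<metric>", and getValue() looks up the suffix after the last dot. A self-contained sketch of that pattern (TinyEvaluator and its numbers are hypothetical):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct TinyEvaluator {
  std::string name = "ctc";
  // Cached by a storeLocalValues()-style step before each query.
  std::unordered_map<std::string, double> results{
      {"error", 0.25}, {"sequence_error", 0.5}};

  void getNames(std::vector<std::string>* names) const {
    for (const auto& kv : results) names->push_back(name + "." + kv.first);
  }
  double getValue(const std::string& fullName) const {
    // Keep only the metric key after the last '.', as the diff does via split.
    const std::string key = fullName.substr(fullName.rfind('.') + 1);
    auto it = results.find(key);
    return it == results.end() ? 0.0 : it->second;
  }
};

int main() {
  TinyEvaluator eval;
  std::vector<std::string> names;
  eval.getNames(&names);
  for (const auto& n : names)
    std::cout << n << " = " << eval.getValue(n) << "\n";
}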
......@@ -268,7 +268,13 @@ public:
}
// get type of evaluator
std::string getTypeImpl() const { return "chunk"; }
std::string getType(const std::string& name, Error* err) const {
this->getValue(name, err);
if (!err->isOK()) {
return "";
}
return "chunk";
}
private:
void storeLocalValues() const {
......
......@@ -211,6 +211,7 @@ public:
*err = Error("Not implemented");
return .0f;
}
std::string getType(const std::string& name, Error* err) const {
*err = Error("Not implemented");
return "";
......@@ -331,6 +332,7 @@ private:
protected:
std::string getTypeImpl() const;
};
/**
* @brief precision, recall and f1 score Evaluator
* \f[
......@@ -358,6 +360,12 @@ public:
virtual void distributeEval(ParameterClient2* client);
void getNames(std::vector<std::string>* names);
real getValue(const std::string& name, Error* err) const;
std::string getType(const std::string& name, Error* err) const;
struct StatsInfo {
/// numbers of true positives
double TP;
......@@ -428,11 +436,6 @@ private:
mutable std::unordered_map<std::string, real> values_;
void storeLocalValues() const;
// Evaluator interface
public:
void getNames(std::vector<std::string>* names);
real getValue(const std::string& name, Error* err) const;
std::string getType(const std::string& name, Error* err) const;
};
/*
......
......@@ -62,14 +62,18 @@ void BatchNormBaseLayer::calFeatureMapSize() {
const ImageConfig& conf = config_.inputs(0).image_conf();
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
if (0 == imageD_) imageD_ = conf.img_size_z();
if (imageH_ == 0 && imageW_ == 0) {
imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
imageW_ = conf.img_size();
} else {
getOutput().setFrameHeight(imageH_);
getOutput().setFrameWidth(imageW_);
getOutput().setFrameDepth(imageD_);
}
imgPixels_ = imageH_ * imageW_;
imgPixels_ = imageH_ * imageW_ * imageD_;
}
} // namespace paddle
......@@ -80,6 +80,7 @@ protected:
/// Height or width of input image feature.
/// Both of them are 1 if the input is fully-connected layer.
int imageD_;
int imageH_;
int imageW_;
/// Height * Width.
......
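The sizing change extends batch normalization from H * W to H * W * D: depth is read from the previous layer's frame depth, and a depth of 0 (a plain 2D feature map) falls back to the configured img_size_z, which is 1 for 2D data, so the old behavior is preserved. A small worked check with hypothetical values:

#include <iostream>

int main() {
  int imageH = 4, imageW = 4;
  int frameDepth = 0;  // previous layer emitted a 2D feature map
  int imgSizeZ = 1;    // conf.img_size_z() default for 2D inputs
  int imageD = (frameDepth == 0) ? imgSizeZ : frameDepth;
  std::cout << "imgPixels = " << imageH * imageW * imageD << "\n";  // 16
}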
......@@ -42,10 +42,10 @@ bool Conv3DLayer::init(const LayerMap &layerMap,
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
}
}
return true;
......@@ -83,8 +83,8 @@ void Conv3DLayer::forward(PassType passType) {
int outWidth = getSize();
resetOutput(batchSize, outWidth);
REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
const MatrixPtr &inMat = getInputValue(i);
const MatrixPtr &outMat = getOutputValue();
int M = M_[i];
......@@ -120,7 +120,6 @@ void Conv3DLayer::forward(PassType passType) {
}
}
if (nullptr != this->biasParameter_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
this->addBias();
}
forwardActivation();
......@@ -134,15 +133,14 @@ void Conv3DLayer::backward(const UpdateCallback &callback) {
biases_->getParameterPtr()->incUpdate(callback);
}
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
if (weights_[i]->getWGrad()) {
bpropWeights(i);
}
if (getInputGrad(i)) {
bpropData(i);
}
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
weights_[i]->getParameterPtr()->incUpdate(callback);
}
}
......@@ -224,20 +222,31 @@ void Conv3DLayer::bpropData(int i) {
}
void Conv3DLayer::bpropBiases() {
MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
1,
biases_->getWGrad()->getElementCnt(),
false,
useGpu_);
MatrixPtr outGradMat = getOutputGrad();
if (this->sharedBiases_) {
biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
biases->collectSharedBias(*outGradMat, 1.0f);
} else {
biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
biases->collectBias(*outGradMat, 1.0f);
}
}
void Conv3DLayer::addBias() {
MatrixPtr outMat = getOutputValue();
MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
1,
biases_->getW()->getElementCnt(),
false,
useGpu_);
if (this->sharedBiases_) {
outMat->addSharedBias(*(biases_->getW()), 1.0f);
outMat->addSharedBias(*(bias), 1.0f);
} else {
outMat->addBias(*(biases_->getW()), 1.0f);
outMat->addBias(*(bias), 1.0f);
}
}
......
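With the bias Weight now allocated as (numFilters x 1), addBias/collectBias still expect a (1 x numFilters) row vector, so the diff wraps the same buffer in a fresh Matrix::create(...) view rather than copying. A minimal sketch of that non-owning reshape (the View struct is hypothetical; it only models "same storage, different shape"):

#include <cassert>
#include <iostream>
#include <vector>

// Hypothetical non-owning matrix view: same storage, different shape.
struct View {
  float* data;
  int h, w;
};

int main() {
  std::vector<float> bias(3, 0.f);  // stored as a (3 x 1) column, like the Weight
  View column{bias.data(), 3, 1};
  View row{column.data, 1, 3};  // reshaped view over the same buffer, no copy
  row.data[2] = 7.f;            // writes through to the shared storage
  assert(bias[2] == 7.f);
  std::cout << "shapes (" << column.h << "x" << column.w << ") and ("
            << row.h << "x" << row.w << ") share one buffer\n";
}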
......@@ -37,7 +37,7 @@ bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
}
void CudnnBatchNormLayer::reshape(int batchSize) {
hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_, imageW_);
hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
}
void CudnnBatchNormLayer::forward(PassType passType) {
......@@ -104,7 +104,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
EPS,
batchSize,
channels_,
imageH_,
imageH_ * imageD_,
imageW_);
}
}
......
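Folding depth into height lets the existing 4D cuDNN tensor descriptor carry 3D inputs: batch norm statistics reduce over every spatial position of a channel, so only the per-channel element count matters, and (N, C, D*H, W) preserves it. A quick check of that invariant:

#include <cassert>

int main() {
  // Hypothetical 5-D shape (N, C, D, H, W) presented to cuDNN as 4-D.
  long N = 2, D = 4, H = 5, W = 6;
  long perChannel5d = N * D * H * W;    // elements reduced per channel, 5-D view
  long perChannel4d = N * (D * H) * W;  // same elements in the folded 4-D view
  assert(perChannel5d == perChannel4d);
  return 0;
}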
......@@ -42,10 +42,10 @@ bool DeConv3DLayer::init(const LayerMap &layerMap,
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
}
}
return true;
......@@ -53,27 +53,27 @@ bool DeConv3DLayer::init(const LayerMap &layerMap,
size_t DeConv3DLayer::getSize() {
CHECK_NE(inputLayers_.size(), 0UL);
outputH_.clear();
outputW_.clear();
outputD_.clear();
imgSizeW_.clear();
imgSizeH_.clear();
imgSizeD_.clear();
N_.clear();
NOut_.clear();
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); ++i) {
outputW_.push_back(
imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
outputH_.push_back(imageSize(
imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
outputD_.push_back(imageSize(
imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
imgSizeW_.push_back(
imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
imgSizeH_.push_back(imageSize(
outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
imgSizeD_.push_back(imageSize(
outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
layerSize += NOut_[i] * numFilters_;
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
getOutput().setFrameDepth(outputD_[0]);
getOutput().setFrameHeight(imgSizeH_[0]);
getOutput().setFrameWidth(imgSizeW_[0]);
getOutput().setFrameDepth(imgSizeD_[0]);
return layerSize;
}
......@@ -84,8 +84,8 @@ void DeConv3DLayer::forward(PassType passType) {
resetOutput(batchSize, outWidth);
const MatrixPtr outMat = getOutputValue();
REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
const MatrixPtr &inMat = getInputValue(i);
int M = M_[i];
int N = N_[i];
......@@ -103,9 +103,9 @@ void DeConv3DLayer::forward(PassType passType) {
}
colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
numFilters_,
outputD_[i],
outputH_[i],
outputW_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
......@@ -120,7 +120,6 @@ void DeConv3DLayer::forward(PassType passType) {
}
}
if (nullptr != this->biasParameter_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
this->addBias();
}
forwardActivation();
......@@ -133,21 +132,21 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
bpropBiases();
biases_->getParameterPtr()->incUpdate(callback);
}
REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
for (size_t i = 0; i < inputLayers_.size(); ++i) {
if (weights_[i]->getWGrad() || this->needGradient_) {
int M = M_[i];
int N = N_[i];
int K = K_[i];
REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
const MatrixPtr &inMat = getInputValue(i);
for (int n = 0; n < batchSize; ++n) {
colBuf_->vol2Col(
getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
numFilters_,
outputD_[i],
outputH_[i],
outputW_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
......@@ -182,7 +181,6 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
}
}
}
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
weights_[i]->getParameterPtr()->incUpdate(callback);
}
}
......@@ -191,21 +189,31 @@ void DeConv3DLayer::bpropWeights(int i) {}
void DeConv3DLayer::bpropData(int i) {}
void DeConv3DLayer::bpropBiases() {
MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
1,
biases_->getWGrad()->getElementCnt(),
false,
useGpu_);
const MatrixPtr &outGradMat = getOutputGrad();
if (this->sharedBiases_) {
biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
biases->collectSharedBias(*outGradMat, 1.0f);
} else {
biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
biases->collectBias(*outGradMat, 1.0f);
}
}
void DeConv3DLayer::addBias() {
MatrixPtr outMat = getOutputValue();
MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
1,
biases_->getW()->getElementCnt(),
false,
useGpu_);
if (this->sharedBiases_) {
outMat->addSharedBias(*(biases_->getW()), 1.0f);
outMat->addSharedBias(*(bias), 1.0f);
} else {
outMat->addBias(*(biases_->getW()), 1.0f);
outMat->addBias(*(bias), 1.0f);
}
}
......
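The wholesale swap of outputX_ and imgSizeX_ reflects that a transposed convolution runs the convolution shape math in reverse: if a convolution maps size i to o = (i - f + 2p) / s + 1, its transpose maps i to s * (i - 1) + f - 2p (caffe mode), so the roles of "image" and "output" exchange. A round-trip check of those two formulas, sketched under that assumption (they may differ from Paddle's exact imageSize/outputSize helpers in non-caffe mode):

#include <cassert>

int convOut(int in, int f, int p, int s) { return (in - f + 2 * p) / s + 1; }
int deconvOut(int in, int f, int p, int s) { return (in - 1) * s + f - 2 * p; }

int main() {
  int f = 3, p = 1, s = 2;
  int deconvIn = 5;
  int out = deconvOut(deconvIn, f, p, s);     // 9
  assert(convOut(out, f, p, s) == deconvIn);  // round-trips through the conv
  return 0;
}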
......@@ -139,7 +139,13 @@ void DetectionOutputLayer::forward(PassType passType) {
allDecodedBBoxes,
&allIndices);
resetOutput(numKept, 7);
if (numKept > 0) {
resetOutput(numKept, 7);
} else {
MatrixPtr outV = getOutputValue();
outV = NULL;
return;
}
MatrixPtr outV = getOutputValue();
getDetectionOutput(confBuffer_->getData(),
numKept,
......
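The guard avoids allocating a 0 x 7 detection matrix when non-max suppression keeps nothing: the layer leaves its output unset and returns early. The general shape of that early-out, sketched with hypothetical stand-in types:

#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in: an output buffer that is only (re)allocated when at
// least one detection survives NMS.
using MatrixPtr = std::shared_ptr<std::vector<float>>;

MatrixPtr makeOutput(size_t numKept, size_t width) {
  if (numKept == 0) return nullptr;  // early out: nothing was kept
  return std::make_shared<std::vector<float>>(numKept * width, 0.f);
}

int main() {
  MatrixPtr out = makeOutput(0, 7);
  std::cout << (out ? "filled" : "empty output, skipped") << "\n";
}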
......@@ -469,7 +469,7 @@ size_t getDetectionIndices(
const size_t numClasses,
const size_t backgroundId,
const size_t batchSize,
const size_t confThreshold,
const real confThreshold,
const size_t nmsTopK,
const real nmsThreshold,
const size_t keepTopK,
......
......@@ -275,7 +275,7 @@ size_t getDetectionIndices(
const size_t numClasses,
const size_t backgroundId,
const size_t batchSize,
const size_t confThreshold,
const real confThreshold,
const size_t nmsTopK,
const real nmsThreshold,
const size_t keepTopK,
......
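Declaring the confidence threshold as size_t silently truncates any fractional value to 0 at the call boundary, so the filter passes every candidate box; real preserves the fraction. A small demonstration of the truncation (the helper functions are hypothetical):

#include <iostream>

int countKept(float conf, size_t threshold) { return conf > threshold ? 1 : 0; }
int countKeptFixed(float conf, float threshold) { return conf > threshold ? 1 : 0; }

int main() {
  float conf = 0.3f;
  std::cout << countKept(conf, 0.5) << "\n";       // 1: the 0.5 became 0
  std::cout << countKeptFixed(conf, 0.5f) << "\n"; // 0: 0.3 is below 0.5
}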