提交 5f905248 编写于 作者: D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into lod_tensor_py

...@@ -4,7 +4,6 @@ cache: ...@@ -4,7 +4,6 @@ cache:
- $HOME/.ccache - $HOME/.ccache
- $HOME/.cache/pip - $HOME/.cache/pip
- $TRAVIS_BUILD_DIR/build/third_party - $TRAVIS_BUILD_DIR/build/third_party
- $TRAVIS_BUILD_DIR/build_android/third_party
sudo: required sudo: required
dist: trusty dist: trusty
os: os:
...@@ -12,7 +11,6 @@ os: ...@@ -12,7 +11,6 @@ os:
env: env:
- JOB=build_doc - JOB=build_doc
- JOB=check_style - JOB=check_style
- JOB=build_android
addons: addons:
apt: apt:
packages: packages:
......
...@@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> ...@@ -4,9 +4,15 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ARG UBUNTU_MIRROR ARG UBUNTU_MIRROR
RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
# ENV variables
ARG ANDROID_ABI
ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \ ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \ ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y \ apt-get install -y \
...@@ -15,12 +21,11 @@ RUN apt-get update && \ ...@@ -15,12 +21,11 @@ RUN apt-get update && \
apt-get clean -y apt-get clean -y
# Install Go and glide # Install Go and glide
RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
tar -C /usr/local -xzf go.tgz && \ tar -xz -C /usr/local && \
mkdir /root/gopath && \ mkdir /root/gopath && \
mkdir /root/gopath/bin && \ mkdir /root/gopath/bin && \
mkdir /root/gopath/src && \ mkdir /root/gopath/src
rm go.tgz
ENV GOROOT=/usr/local/go GOPATH=/root/gopath ENV GOROOT=/usr/local/go GOPATH=/root/gopath
# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
...@@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \ ...@@ -42,7 +47,8 @@ RUN mkdir /opt/android-ndk-tmp && \
wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \ wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
unzip -q android-ndk-r14b-linux-x86_64.zip && \ unzip -q android-ndk-r14b-linux-x86_64.zip && \
mv android-ndk-r14b ${ANDROID_NDK_HOME} && \ mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \ ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-23 --install-dir=${ANDROID_ARM_STANDALONE_TOOLCHAIN} && \
${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm64 --platform=android-23 --install-dir=${ANDROID_ARM64_STANDALONE_TOOLCHAIN} && \
rm -rf /opt/android-ndk-tmp && \ rm -rf /opt/android-ndk-tmp && \
rm -rf ${ANDROID_NDK_HOME} rm -rf ${ANDROID_NDK_HOME}
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
# The supported variables are listed belows: # The supported variables are listed belows:
# #
# ANDROID_STANDALONE_TOOLCHAIN # ANDROID_STANDALONE_TOOLCHAIN
# ANDROID_TOOLCHAIN
# ANDROID_ABI # ANDROID_ABI
# ANDROID_NATIVE_API_LEVEL # ANDROID_NATIVE_API_LEVEL
# ANDROID_ARM_MODE # ANDROID_ARM_MODE
...@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL) ...@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
ENDIF() ENDIF()
ENDIF() ENDIF()
IF(NOT DEFINED ANDROID_TOOLCHAIN)
SET(ANDROID_TOOLCHAIN clang)
ENDIF()
IF(NOT DEFINED ANDROID_ABI) IF(NOT DEFINED ANDROID_ABI)
SET(ANDROID_ABI "armeabi-v7a") SET(ANDROID_ABI "armeabi-v7a")
ENDIF() ENDIF()
...@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
"${CMAKE_VERSION}), when cross-compiling for Android.") "${CMAKE_VERSION}), when cross-compiling for Android.")
IF(ANDROID_STANDALONE_TOOLCHAIN) IF(ANDROID_STANDALONE_TOOLCHAIN)
# Use standalone toolchain
SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot") SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
IF(NOT CMAKE_SYSTEM_VERSION) IF(NOT CMAKE_SYSTEM_VERSION)
...@@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -96,26 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ENDIF() ENDIF()
# Toolchain # Toolchain
SET(ANDROID_TOOLCHAIN "gcc")
SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN}) SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") ELSE(ANDROID_NDK)
SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi) # TODO: use android ndk
IF(ANDROID_ABI STREQUAL "armeabi") ENDIF()
SET(CMAKE_SYSTEM_PROCESSOR armv5te)
ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
SET(CMAKE_SYSTEM_PROCESSOR armv7-a) SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
ENDIF() IF(ANDROID_ABI STREQUAL "armeabi")
ENDIF() SET(CMAKE_SYSTEM_PROCESSOR armv5te)
IF(ANDROID_ABI STREQUAL "arm64-v8a") SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android) ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
SET(CMAKE_SYSTEM_PROCESSOR aarch64) SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
ENDIF() ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-") ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
SET(CMAKE_SYSTEM_PROCESSOR aarch64)
SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
ENDIF()
SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
IF(ANDROID_TOOLCHAIN STREQUAL clang)
SET(ANDROID_C_COMPILER_NAME clang)
SET(ANDROID_CXX_COMPILER_NAME clang++)
SET(CMAKE_C_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
SET(ANDROID_C_COMPILER_NAME gcc)
SET(ANDROID_CXX_COMPILER_NAME g++)
ELSE()
MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
ENDIF() ENDIF()
# C compiler # C compiler
IF(NOT CMAKE_C_COMPILER) IF(NOT CMAKE_C_COMPILER)
SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc") SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
ELSE() ELSE()
GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM) GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
ENDIF() ENDIF()
...@@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -125,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
# CXX compiler # CXX compiler
IF(NOT CMAKE_CXX_COMPILER) IF(NOT CMAKE_CXX_COMPILER)
SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++") SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
ELSE() ELSE()
GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM) GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
ENDIF() ENDIF()
...@@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -137,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE) SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
# Toolchain and ABI specific flags. # Toolchain and ABI specific flags.
SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64") SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections") SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
IF(ANDROID_ABI STREQUAL "armeabi") IF(ANDROID_ABI STREQUAL "armeabi")
...@@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -145,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
-march=armv5te -march=armv5te
-mtune=xscale -mtune=xscale
-msoft-float) -msoft-float)
ENDIF() ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
IF(ANDROID_ABI STREQUAL "armeabi-v7a")
LIST(APPEND ANDROID_COMPILER_FLAGS LIST(APPEND ANDROID_COMPILER_FLAGS
-march=armv7-a -march=armv7-a
-mfloat-abi=softfp) -mfloat-abi=softfp)
...@@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -156,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16) LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
ENDIF() ENDIF()
LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8) LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
ENDIF() ENDIF()
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$") IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
...@@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0") ...@@ -164,10 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
ELSE() ELSE()
LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb) LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
ENDIF() ENDIF()
IF(ANDROID_TOOLCHAIN STREQUAL clang)
# Disable integrated-as for better compatibility.
LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
ENDIF()
ENDIF() ENDIF()
IF(ANDROID_ABI STREQUAL "arm64-v8a") IF(ANDROID_TOOLCHAIN STREQUAL clang)
LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a) # CMake automatically forwards all compiler flags to the linker,
# and clang doesn't like having -Wa flags being used for linking.
# To prevent CMake from doing this would require meddling with
# the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
ENDIF() ENDIF()
STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}") STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
......
...@@ -12,6 +12,10 @@ ...@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
......
# Design Doc: Computations as Graphs # Design Doc: Computations as a Graph
A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before. A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
...@@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps: ...@@ -8,6 +8,8 @@ This document explains that the construction of a graph as three steps:
- construct the backward part - construct the backward part
- construct the optimization part - construct the optimization part
## The Construction of a Graph
Let us take the problem of image classification as a simple example. The application program that trains the model looks like: Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
```python ```python
...@@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph. ...@@ -25,7 +27,9 @@ The first four lines of above program build the forward part of the graph.
![](images/graph_construction_example_forward_only.png) ![](images/graph_construction_example_forward_only.png)
In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b. In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
Initialization operators are kind of "run-once" operators -- the `Run` method increments a class data member counter so to run at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message. In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
...@@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo ...@@ -49,3 +53,18 @@ According to the chain rule of gradient computation, `ConstructBackwardGraph` wo
For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. Here results in the complete graph: For each parameter, like W and b created by `layer.fc`, marked as double circles in above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. Here results in the complete graph:
![](images/graph_construction_example_all.png) ![](images/graph_construction_example_all.png)
## Block and Graph
The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block[(https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
A Block keeps operators in an array `BlockDesc::ops`
```protobuf
message BlockDesc {
repeated OpDesc ops = 1;
repeated VarDesc vars = 2;
}
```
in the order that there appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
...@@ -2,6 +2,8 @@ digraph ImageClassificationGraph { ...@@ -2,6 +2,8 @@ digraph ImageClassificationGraph {
///////// The forward part ///////// ///////// The forward part /////////
FeedX [label="Feed", color=blue, shape=box]; FeedX [label="Feed", color=blue, shape=box];
FeedY [label="Feed", color=blue, shape=box]; FeedY [label="Feed", color=blue, shape=box];
InitW [label="Init", color=blue, shape=diamond];
Initb [label="Init", color=blue, shape=diamond];
FC [label="FC", color=blue, shape=box]; FC [label="FC", color=blue, shape=box];
MSE [label="MSE", color=blue, shape=box]; MSE [label="MSE", color=blue, shape=box];
...@@ -14,6 +16,8 @@ digraph ImageClassificationGraph { ...@@ -14,6 +16,8 @@ digraph ImageClassificationGraph {
FeedX -> x -> FC -> y -> MSE -> cost [color=blue]; FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
FeedY -> l [color=blue]; FeedY -> l [color=blue];
InitW -> W [color=blue];
Initb -> b [color=blue];
W -> FC [color=blue]; W -> FC [color=blue];
b -> FC [color=blue]; b -> FC [color=blue];
l -> MSE [color=blue]; l -> MSE [color=blue];
......
...@@ -147,7 +147,7 @@ class CosineOp { ...@@ -147,7 +147,7 @@ class CosineOp {
struct CosineOpProtoMaker : public OpProtoMaker { struct CosineOpProtoMaker : public OpProtoMaker {
CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) { CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
AddInput("input", "input of cosine op"); AddInput("input", "input of cosine op");
AddAttr("scale", "scale of cosine op", float).Default(1.0).LargerThan(0.0); AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
AddType("cos"); AddType("cos");
AddComment("This is cos op"); AddComment("This is cos op");
} }
......
## Background
PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
PaddlePaddle use proto message to describe compile time graph for
1. Computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and send to multiple workers.
The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below.
| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|
## Definition of VarDesc
A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it.
```proto
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}
```
## Definition of LodTensorDesc
```proto
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [default=0];
}
```
## Definition of Variable in Python
In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable.
```python
image = Variable(dims=[-1, 640, 480])
# fc1 and fc2 are both Variable
fc1 = layer.fc(input=image, output_size=10)
fc2 = layer.fc(input=fc1, output_size=20)
```
### what should class `Variable` Have
1. `name`.a name of string type is used to mark the value of the Variable.
1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator.
1. `operator`. Variable should record which operator produce itself. The reaon is:
- we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
```python
import VarDesc
import LoDTensorDesc
import framework
def AddInitialOperator(variable, initializer):
# add an initialize Operator to block to init this Variable
class Variable(object):
def __init__(self, name, dims, type, initializer):
self._block = get_default_block()
self._name = name
self.op = None
tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
_var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
self._var = framework.CreateVar(_var_desc)
self._block.add_var(self)
# add initial op according to initializer
if initializer is not None:
AddInitialOperator(self, initializer)
def dims(self):
return self._var.dims()
def data_type(self):
return self._var.data_type()
def to_proto(self):
pass
```
Then we can use this Variable to create a fc layer in Python.
```python
import paddle as pd
def flatten_size(X, num_flatten_dims):
prod = 1 # of last num_flatten_dims
for i in xrange(num_flatten_dims):
prod = prod * X.dims[-i-1]
return prod
def layer.fc(X, output_size, num_flatten_dims):
W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
out = Variable(type=FP32)
y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
pd.InferShape(y)
return out
x = Variable(dims=[-1, 640, 480])
y = layer.fc(x, output_size=100)
z = layer.fc(y, output_size=200)
paddle.eval(targets=[z], ...)
print(z)
```
...@@ -23,17 +23,20 @@ ...@@ -23,17 +23,20 @@
- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。 - `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成 - `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
依据是否包含kernel,将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下: 依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
内容 | 定义位置 内容 | 定义位置
-------------- | :---------------------- -------------- | :----------------------
OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
Op定义 | `.cc`文件 Op定义 | `.cc`文件
Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`.cc`文件,GPU可在`.cu`文件。 Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。
注册Op | Op注册在`.cc`文件;Kernel注册CPU在`.cc`文件,GPU在`.cu`文件 注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc``*_op.cu`(如有)结尾。
下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。 下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
...@@ -42,9 +45,11 @@ Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在` ...@@ -42,9 +45,11 @@ Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`
### 1. 定义ProtoMaker类 ### 1. 定义ProtoMaker类
矩阵乘的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释: 矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
``` 首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
```cpp
class MulOpMaker : public framework::OpProtoAndCheckerMaker { class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
...@@ -59,20 +64,20 @@ The equation is: Out = X * Y ...@@ -59,20 +64,20 @@ The equation is: Out = X * Y
} }
}; };
``` ```
[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数包括2个 [`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数
- `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。 - `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。
- `framework::OpAttrChecker` :后者用于检查参数属性的合法性。 - `framework::OpAttrChecker` :后者用于检查参数属性的合法性。
构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加该Op的注释,这些函数会将对应内容添加到`OpProto`中。
`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,该命名尽可能的规范 构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`
上面的代码在`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守命名规范。
再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子:
``` 再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例:
```cpp
template <typename AttrType> template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
...@@ -87,17 +92,19 @@ The equation is: Out = scale*X ...@@ -87,17 +92,19 @@ The equation is: Out = scale*X
} }
}; };
``` ```
在这个例子里,两处不同: 这个例子有两处不同:
- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。 - `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显示地调用`.NotInGradient()`进行设置。
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
### 2. 定义Operator类 ### 2. 定义Operator类
下面的点实现了MulOp的定义:
```c++ ```cpp
class MulOp : public framework::OperatorWithKernel { class MulOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -121,33 +128,46 @@ class MulOp : public framework::OperatorWithKernel { ...@@ -121,33 +128,46 @@ class MulOp : public framework::OperatorWithKernel {
``` ```
[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel``public`成员: [`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel``public`成员:
```c++ ```cpp
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
``` ```
这句表示使用基类`OperatorWithKernel`的构造函数,也可写成: 这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
```c++ ```cpp
MulOp(const std::string &type, const framework::VariableNameMap &inputs, MulOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
``` ```
还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是: 还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
- 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。 - 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
- 2). 设置输出Tensor的形状。 - 2). 设置输出Tensor的形状。
通常`OpProtoMaker``Op`类的定义写在`.cc`文件中,和要讲到的注册函数一起放在`.cc` 通常`OpProtoMaker``Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`
### 3. 定义OpKernel类 ### 3. 定义OpKernel类
```C++ `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数:
template <typename Place, typename T>
class MulKernel : public framework::OpKernel { - `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
public:
- `typename T` : 表示数据类型,如`float`, `double`等。
需要为`MulKernel`类重写`Compute`接口。
- `Compute`接受一个输入参数:`const framework::ExecutionContext& context`
-`InferShapeContext`相比,`ExecutionContext`增加了设备类型,同样可获取到输入输出和属性参数。
- `Compute`函数里实现`OpKernel`的具体计算逻辑。
下面是 `MulKernel` `Compute`的实现:
```cpp
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X"); auto* X = context.Input<Tensor>("X");
auto* Y = context.Input<Tensor>("Y"); auto* Y = context.Input<Tensor>("Y");
...@@ -157,141 +177,136 @@ class MulKernel : public framework::OpKernel { ...@@ -157,141 +177,136 @@ class MulKernel : public framework::OpKernel {
const_cast<platform::DeviceContext*>(context.device_context_); const_cast<platform::DeviceContext*>(context.device_context_);
math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context); math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
} }
}; };
``` ```
`MulKernel`继承自`framework::OpKernel`,带有模板参数: 需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。**
- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43) `MulOp`的CPU、GPU实现共享同一个`Kernel``OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
- `typename T` : 表示数据类型,如`float`, `double` 为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)
`MulKernel`需要重写`Compute`接口,该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型,同样可获取到输入输出和属性参数,`Compute`函数里写具体实现时。
到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。
注意,不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel``OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43) 反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**
为了使得`OpKernel`的计算过程书写较为简单,CPU、GPU的代码可以复用,我们通常借助Eigen unsupported Tensor模块来实现。关于在paddle中如何使用Eigen库,请参考对应的使用[文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)
到此前向Op实现完成,需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似,这里不再重复。但注意,反向Op没有`ProtoMaker`
### 4. 注册Operator ### 4. 注册Operator
`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 -`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
```cpp
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```
在上面的代码中:
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`。
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulKernel`类。
```c++
namespace ops = paddle::operators; -`.cu`文件中注册GPU Kernel。
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下:
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad, ```cpp
ops::MulGradKernel<paddle::platform::CPUPlace, float>); // if use Eigen unsupported module before include head files
``` #define EIGEN_USE_GPU
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker``ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad` namespace ops = paddle::operators;
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace``float`类型,同理,注册`ops::MulKernel`类。 REGISTER_OP_GPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::GPUPlace, float>);
`.cu`文件中注册GPU Kernel。请注意,如果GPU Kernel的实现是基于Eigen unsupported模块,那么在 `.cu`的最前面请加上宏定义 `#define EIGEN_USE_GPU` ```
```c++
// if use Eigen unsupported module before include head files
#define EIGEN_USE_GPU
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::GPUPlace, float>);
```
### 5. 编译 ### 5. 编译
[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。 - 简单**无特殊依赖**的OP无需修改CMakeList.txt文件。[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt) 会自动将 `paddle/operators` 目录下新增的 `*_op.cc` 文件加入编译。
- 较为复杂、**有额外依赖** 的operator仍需要修改[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)。如,`mul_op` 依赖 `math_function`,需要在`CMakeLists.txt`中添加如下内容:
```
op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) ```
``` op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function) +
```
下面命令可以编译:
- 运行下面命令可以进行编译:
```
make mul_op ```
``` make mul_op
```
## 绑定Python ## 绑定Python
- 绑定Python - 绑定Python
[`paddle/pybind/pybind.cc 在 [`paddle/pybind/pybind.cc
`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类: `](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) 使用`USE_OP`告知编译器需要链接的Op,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
``` ```
USE_OP(mul); USE_OP(mul);
``` ```
如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`: 如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`:
``` ```
USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(gather);
``` ```
如果OP不带Kernel,则使用`USE_NO_KENREL_OP`: 如果OP不带Kernel,则使用`USE_NO_KENREL_OP`:
``` ```
USE_NO_KENREL_OP(recurrent); USE_NO_KENREL_OP(recurrent);
``` ```
使用`USE_OP`告知编译器需要链接该Op的目标文件,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
- 生成库 - 生成库
[`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中,使得该Op可以链接到生成的lib库中。 无需修改 [`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件,`paddle/operators` 目录下新增的 `*_op.cc` 文件会被自动添加链接到生成的lib库中。
```
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
mul_op
minus_op)
endif(WITH_PYTHON)
```
## 实现单元测试 ## 实现单元测试
单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py) 单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
### 前向Operator单 ### 前向Operator单元测试
前向Op单测继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`,具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数,以及Python对比的输出值。 前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要:
``` 1. 在`setUp`函数定义输入、输出,以及相关的属性参数。
import unittest 2. 生成随机的输入数据。
import numpy as np 3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta
class TestMulOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self): ```python
self.type = "mul" import unittest
self.inputs = { import numpy as np
'X': np.random.random((32, 84)).astype("float32"), from gradient_checker import GradientChecker, create_op
'Y': np.random.random((84, 100)).astype("float32") from op_test_util import OpTestMeta
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```
首先需要`import`必要的包,下面详细解释其他值:
- `self.type = "mul" ` : 定义类型,和注册的类型一致。
- `self.inputs` : 定义输入,类型为Numpy.array,并初始化。
- `self.outputs` : 定义输出,并得到Python结算结果。
class TestMulOp(unittest.TestCase):
### 反向Operator单测 __metaclass__ = OpTestMeta
反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。 def setUp(self):
self.type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```
``` 上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释:
- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。
- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。
- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。
### 反向Operator单元测试
反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。
```python
class TestMulGradOp(GradientChecker): class TestMulGradOp(GradientChecker):
def setUp(self): def setUp(self):
self.op = create_op("mul") self.op = create_op("mul")
...@@ -325,33 +340,34 @@ class TestMulGradOp(GradientChecker): ...@@ -325,33 +340,34 @@ class TestMulGradOp(GradientChecker):
no_grad_set={"Y"}) no_grad_set={"Y"})
``` ```
下面解释一些关键的地方: 下面解释代码中一些关键的地方:
- 调用`create_op("mul")`创建反向Op对应的前向Op。 - 调用`create_op("mul")`创建反向Op对应的前向Op。
- 调用`compare_grad`函数对比CPU、GPU计算结果。 - 调用`compare_grad`函数对比CPU、GPU计算结果。
- `test_normal`中调用`check_grad`检查梯度稳定性,这里采用数值法检测梯度正确性。 - `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
- 第一个参数`self.op` : 前向Op。 - 第一个参数`self.op` : 前向Op。
- 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。 - 第二个参数`self.inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。
- 第三个参数`["X", "Y"]` : 指定对输入变量`X``Y`做梯度检测。 - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
- 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out` - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
- `test_ignore_x``test_ignore_y`分支测试只需要计算一个输入梯度的情况。 - `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
### 编译和执行 ### 编译和执行单元测试
测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译 元测试编写完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)中添加以下内容,将单元测试加入工程
``` ```
py_test(test_mul_op SRCS test_mul_op.py) py_test(test_mul_op SRCS test_mul_op.py)
``` ```
编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`,编译成功之后执行单测命令为 请注意,**不同于Op的编译测试,运行单元测试测时需要编译整个工程**,并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后,执行下面的命令来运行单元测试
``` ```bash
make test ARGS="-R test_mul_op -V" make test ARGS="-R test_mul_op -V"
``` ```
或者: 或者:
``` ```bash
ctest -R test_mul_op ctest -R test_mul_op
``` ```
...@@ -18,14 +18,6 @@ limitations under the License. */ ...@@ -18,14 +18,6 @@ limitations under the License. */
#ifndef __NVCC__ #ifndef __NVCC__
#include "paddle/math/MathFunctions.h"
#ifndef PADDLE_TYPE_DOUBLE
#define CBLAS_GEMM paddle::gemm<float>
#else
#define CBLAS_GEMM paddle::gemm<double>
#endif
template<class OpResetOutput> template<class OpResetOutput>
void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
real *gateValue, real *gateValue,
...@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput, ...@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
} }
} }
template<class OpResetOutput, class OpFinalOutput>
void hl_cpu_gru_forward(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasNoTrans,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
}
template<class OpStateGrad> template<class OpStateGrad>
void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
real *gateValue, real *gateValue,
...@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad, ...@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
} }
} }
template<class OpStateGrad, class OpResetGrad>
void hl_cpu_gru_backward(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
backward_state_grad(opStateGrad, value, grad,
frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(opResetGrad, value, grad,
frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
CBLAS_GEMM(CblasNoTrans,
CblasTrans,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
CBLAS_GEMM(CblasTrans,
CblasNoTrans,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
}
#endif #endif
#endif // HL_CPU_GRU_CUH_ #endif // HL_CPU_GRU_CUH_
...@@ -41,9 +41,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc); ...@@ -41,9 +41,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
// check whether a value(attribute) fit a certain limit // check whether a value(attribute) fit a certain limit
template <typename T> template <typename T>
class LargerThanChecker { class GreaterThanChecker {
public: public:
explicit LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {} explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
void operator()(T& value) const { void operator()(T& value) const {
PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail"); PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
} }
...@@ -110,8 +110,8 @@ class TypedAttrChecker { ...@@ -110,8 +110,8 @@ class TypedAttrChecker {
return *this; return *this;
} }
TypedAttrChecker& LargerThan(const T& lower_bound) { TypedAttrChecker& GreaterThan(const T& lower_bound) {
value_checkers_.push_back(LargerThanChecker<T>(lower_bound)); value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
return *this; return *this;
} }
......
...@@ -87,3 +87,24 @@ message OpProto { ...@@ -87,3 +87,24 @@ message OpProto {
repeated Attr attrs = 4; repeated Attr attrs = 4;
required string comment = 5; required string comment = 5;
} }
enum DataType {
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LoDTensorDesc {
required DataType data_type = 1;
repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
optional int32 lod_level = 3 [ default = 0 ];
}
message VarDesc {
required string name = 1;
optional LoDTensorDesc lod_tensor = 2;
}
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
USE_OP(add_two); USE_OP(add);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -41,7 +41,7 @@ namespace f = paddle::framework; ...@@ -41,7 +41,7 @@ namespace f = paddle::framework;
TEST(GradOpBuilder, AddTwo) { TEST(GradOpBuilder, AddTwo) {
std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp( std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
"add_two", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {})); "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
std::shared_ptr<f::OperatorBase> grad_add_op = std::shared_ptr<f::OperatorBase> grad_add_op =
f::OpRegistry::CreateGradOp(*add_op); f::OpRegistry::CreateGradOp(*add_op);
EXPECT_EQ(grad_add_op->Inputs().size(), 4UL); EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) { LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LOD new_lod; LoD new_lod;
new_lod.reserve(level_end - level_begin); new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) { for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(in.at(i)); new_lod.emplace_back(in.at(i));
...@@ -28,10 +28,10 @@ LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) { ...@@ -28,10 +28,10 @@ LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
return new_lod; return new_lod;
} }
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end) { size_t elem_end) {
// slice the lod. // slice the lod.
LOD new_lod; LoD new_lod;
new_lod.reserve(in.size() - level); new_lod.reserve(in.size() - level);
auto start = in.at(level)[elem_begin]; auto start = in.at(level)[elem_begin];
auto end = in.at(level)[elem_end]; auto end = in.at(level)[elem_end];
...@@ -46,13 +46,13 @@ LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, ...@@ -46,13 +46,13 @@ LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
std::transform(new_lod.back().begin(), new_lod.back().end(), std::transform(new_lod.back().begin(), new_lod.back().end(),
new_lod.back().begin(), new_lod.back().begin(),
[start](int v) { return v - start; }); [start](int v) { return v - start; });
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD"); PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
} }
PADDLE_ENFORCE_LE(new_lod.size(), in.size()); PADDLE_ENFORCE_LE(new_lod.size(), in.size());
return new_lod; return new_lod;
} }
bool operator==(const LOD& a, const LOD& b) { bool operator==(const LoD& a, const LoD& b) {
if (a.size() != b.size()) { if (a.size() != b.size()) {
return false; return false;
} }
...@@ -72,12 +72,12 @@ bool operator==(const LOD& a, const LOD& b) { ...@@ -72,12 +72,12 @@ bool operator==(const LOD& a, const LOD& b) {
return true; return true;
} }
void LODTensor::SliceLevels(size_t level_begin, size_t level_end) { void LoDTensor::SliceLevels(size_t level_begin, size_t level_end) {
auto new_lod = framework::SliceLevels(lod_, level_begin, level_end); auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
lod_ = new_lod; lod_ = new_lod;
} }
void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) { void LoDTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
NumLevels()); NumLevels());
PADDLE_ENFORCE(elem_begin < NumElements(level), PADDLE_ENFORCE(elem_begin < NumElements(level),
......
...@@ -35,34 +35,34 @@ template <typename T> ...@@ -35,34 +35,34 @@ template <typename T>
using Vector = thrust::host_vector<T>; using Vector = thrust::host_vector<T>;
#endif #endif
using LOD = std::vector<Vector<size_t>>; using LoD = std::vector<Vector<size_t>>;
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end); LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end); size_t elem_end);
bool operator==(const LOD& a, const LOD& b); bool operator==(const LoD& a, const LoD& b);
/* /*
* LODTensor (Level of details Tensor) * LoDTensor (Level of details Tensor)
* see https://en.wikipedia.org/wiki/Level_of_details for reference. * see https://en.wikipedia.org/wiki/Level_of_details for reference.
*/ */
class LODTensor { class LoDTensor {
public: public:
LODTensor() {} LoDTensor() {}
LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {} LoDTensor(const LoD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
void set_lod(const LOD& lod) { lod_ = lod; } void set_lod(const LoD& lod) { lod_ = lod; }
void set_tensor(Tensor* tensor) { tensor_ = tensor; } void set_tensor(Tensor* tensor) { tensor_ = tensor; }
Tensor& tensor() { return *tensor_; } Tensor& tensor() { return *tensor_; }
LOD lod() { return lod_; } LoD lod() { return lod_; }
/* /*
* Get a element from LOD. * Get a element from LoD.
*/ */
size_t lod_element(size_t level, size_t elem) const { size_t lod_element(size_t level, size_t elem) const {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level, PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
...@@ -74,7 +74,7 @@ class LODTensor { ...@@ -74,7 +74,7 @@ class LODTensor {
} }
/* /*
* Number of LODTensor's levels, each level has units of data, for example, * Number of LoDTensor's levels, each level has units of data, for example,
* in the sentence's view, article, paragraph, sentence are 3 levels. * in the sentence's view, article, paragraph, sentence are 3 levels.
*/ */
size_t NumLevels() const { return lod_.size(); } size_t NumLevels() const { return lod_.size(); }
...@@ -100,7 +100,7 @@ class LODTensor { ...@@ -100,7 +100,7 @@ class LODTensor {
void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end); void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
private: private:
LOD lod_; LoD lod_;
Tensor* tensor_; // not owned Tensor* tensor_; // not owned
}; };
} // namespace framework } // namespace framework
......
...@@ -94,7 +94,7 @@ Let's go on slicing this slice. Its <1,1>-slice is ...@@ -94,7 +94,7 @@ Let's go on slicing this slice. Its <1,1>-slice is
||| |||
``` ```
### The General Slicing Algorithm ### The Slicing Algorithm
The algorithm, with over-simplified data structure, is defined as The algorithm, with over-simplified data structure, is defined as
...@@ -106,17 +106,41 @@ struct LoDTensor { ...@@ -106,17 +106,41 @@ struct LoDTensor {
float* tensor_; float* tensor_;
}; };
LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) { LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
```
Let us revisit the example above
} ```
3
3 1 2
3 2 4 1 2 3
||| || |||| | || |||
``` ```
### Slicing the Top Level Suppose that we want to retrieve the <1,2>-slice
Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs. ```
2
2 3
|| |||
```
```c++ we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
LoDTensor Slice(const LoDTensor& lodt, int sequence) {
To avoid the traversal of the LoD tree at slcing time, we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level. For example, above LoD Tensor can be transformed into
```
0
0 9 10
0 3 5 9 10 12
||| || |||| | || |||
```
We don't really need the 0 on top, so the LoD Tensor could be
} ```
0 9 10
0 3 5 9 10 12
||| || |||| | || |||
``` ```
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class LODTensorTester : public ::testing::Test { class LoDTensorTester : public ::testing::Test {
public: public:
virtual void SetUp() override { virtual void SetUp() override {
// tensor's batch_size: 30 // tensor's batch_size: 30
...@@ -29,7 +29,7 @@ class LODTensorTester : public ::testing::Test { ...@@ -29,7 +29,7 @@ class LODTensorTester : public ::testing::Test {
// 0 10 20 // 0 10 20
// 0 5 10 15 20 // 0 5 10 15 20
// 0 2 5 7 10 12 15 20 // 0 2 5 7 10 12 15 20
LOD lod; LoD lod;
lod.push_back(std::vector<size_t>{0, 10, 20}); lod.push_back(std::vector<size_t>{0, 10, 20});
lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20}); lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20}); lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
...@@ -47,21 +47,21 @@ class LODTensorTester : public ::testing::Test { ...@@ -47,21 +47,21 @@ class LODTensorTester : public ::testing::Test {
protected: protected:
platform::CPUPlace place; platform::CPUPlace place;
Tensor tensor; Tensor tensor;
LODTensor lod_tensor; LoDTensor lod_tensor;
}; };
TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); } TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
TEST_F(LODTensorTester, NumElements) { TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor.NumElements(0), 2UL); ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor.NumElements(1), 4UL); ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(lod_tensor.NumElements(2), 8UL); ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
} }
TEST_F(LODTensorTester, SliceLevels) { TEST_F(LoDTensorTester, SliceLevels) {
// slice 1 level // slice 1 level
for (size_t level = 0; level < 3UL; ++level) { for (size_t level = 0; level < 3UL; ++level) {
LODTensor new_lod_tensor = lod_tensor; LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 1); new_lod_tensor.SliceLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL); ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level)); ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
...@@ -70,7 +70,7 @@ TEST_F(LODTensorTester, SliceLevels) { ...@@ -70,7 +70,7 @@ TEST_F(LODTensorTester, SliceLevels) {
} }
// slice 2 level // slice 2 level
for (size_t level = 0; level < 2UL; ++level) { for (size_t level = 0; level < 2UL; ++level) {
LODTensor new_lod_tensor = lod_tensor; LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 2); new_lod_tensor.SliceLevels(level, level + 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL); ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level)); ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
...@@ -80,9 +80,9 @@ TEST_F(LODTensorTester, SliceLevels) { ...@@ -80,9 +80,9 @@ TEST_F(LODTensorTester, SliceLevels) {
} }
} }
TEST_F(LODTensorTester, SliceInLevel) { TEST_F(LoDTensorTester, SliceInLevel) {
size_t level = 0; size_t level = 0;
LODTensor new_lod_tensor = lod_tensor; LoDTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceInLevel(level, 0, 2); new_lod_tensor.SliceInLevel(level, 0, 2);
EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL); EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL); EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
......
...@@ -21,7 +21,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -21,7 +21,7 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddOutput("output", "output of cosine op"); AddOutput("output", "output of cosine op");
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.LargerThan(0.0); .GreaterThan(0.0);
AddComment("This is cos op"); AddComment("This is cos op");
} }
}; };
...@@ -80,7 +80,7 @@ TEST(OpRegistry, CreateOp) { ...@@ -80,7 +80,7 @@ TEST(OpRegistry, CreateOp) {
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx); op->Run(scope, dev_ctx);
float scale_get = op->GetAttr<float>("scale"); float scale_get = op->Attr<float>("scale");
ASSERT_EQ(scale_get, scale); ASSERT_EQ(scale_get, scale);
} }
...@@ -121,7 +121,7 @@ TEST(OpRegistry, DefaultValue) { ...@@ -121,7 +121,7 @@ TEST(OpRegistry, DefaultValue) {
paddle::framework::Scope scope; paddle::framework::Scope scope;
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUDeviceContext dev_ctx;
op->Run(scope, dev_ctx); op->Run(scope, dev_ctx);
ASSERT_EQ(op->GetAttr<float>("scale"), 1.0); ASSERT_EQ(op->Attr<float>("scale"), 1.0);
} }
TEST(OpRegistry, CustomChecker) { TEST(OpRegistry, CustomChecker) {
...@@ -172,6 +172,6 @@ TEST(OpRegistry, CustomChecker) { ...@@ -172,6 +172,6 @@ TEST(OpRegistry, CustomChecker) {
paddle::platform::CPUDeviceContext dev_ctx; paddle::platform::CPUDeviceContext dev_ctx;
paddle::framework::Scope scope; paddle::framework::Scope scope;
op->Run(scope, dev_ctx); op->Run(scope, dev_ctx);
int test_attr = op->GetAttr<int>("test_attr"); int test_attr = op->Attr<int>("test_attr");
ASSERT_EQ(test_attr, 4); ASSERT_EQ(test_attr, 4);
} }
\ No newline at end of file
...@@ -69,7 +69,7 @@ class OperatorBase { ...@@ -69,7 +69,7 @@ class OperatorBase {
virtual ~OperatorBase() {} virtual ~OperatorBase() {}
template <typename T> template <typename T>
inline const T& GetAttr(const std::string& name) const { inline const T& Attr(const std::string& name) const {
PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
name); name);
return boost::get<T>(attrs_.at(name)); return boost::get<T>(attrs_.at(name));
...@@ -238,8 +238,8 @@ class InferShapeContext { ...@@ -238,8 +238,8 @@ class InferShapeContext {
const Scope& scope() const { return scope_; } const Scope& scope() const { return scope_; }
template <typename T> template <typename T>
inline const T& GetAttr(const std::string& name) const { inline const T& Attr(const std::string& name) const {
return op_.GetAttr<T>(name); return op_.Attr<T>(name);
} }
size_t InputSize(const std::string& name) const { size_t InputSize(const std::string& name) const {
......
...@@ -102,7 +102,7 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -102,7 +102,7 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddOutput("y", "output of test op"); AddOutput("y", "output of test op");
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.LargerThan(0.0); .GreaterThan(0.0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
...@@ -140,7 +140,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker ...@@ -140,7 +140,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
AddOutput("ys", "outputs of test op").AsDuplicable(); AddOutput("ys", "outputs of test op").AsDuplicable();
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.LargerThan(0.0); .GreaterThan(0.0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
......
...@@ -44,6 +44,7 @@ if(WITH_GPU) ...@@ -44,6 +44,7 @@ if(WITH_GPU)
add_simple_unittest(RowConvOpTest) add_simple_unittest(RowConvOpTest)
add_simple_unittest(BlockExpandOpTest) add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest) add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
endif() endif()
add_simple_unittest(Im2ColTest) add_simple_unittest(Im2ColTest)
......
...@@ -83,9 +83,9 @@ struct EigenBlasGemm { ...@@ -83,9 +83,9 @@ struct EigenBlasGemm {
}; };
#ifdef PADDLE_TYPE_DOUBLE #ifdef PADDLE_TYPE_DOUBLE
template class EigenBlasGemm<double>; template struct EigenBlasGemm<double>;
#else #else
template class EigenBlasGemm<float>; template struct EigenBlasGemm<float>;
#endif #endif
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "GemmFunctor.h"
#include "hl_cpu_gru.cuh"
namespace paddle {
template <DeviceType Device, class T>
struct GruFunctor {
template <class OpResetOutput, class OpFinalOutput>
static void compute(OpResetOutput opResetOutput,
OpFinalOutput opFinalOutput,
hl_gru_value value,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
#ifndef __NVCC__
if (value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
false,
batchSize,
2 * frameSize,
frameSize,
1,
value.prevOutValue,
frameSize,
value.gateWeight,
frameSize * 2,
1,
value.gateValue,
frameSize * 3);
}
forward_reset_output(
opResetOutput, value, frameSize, batchSize, active_gate);
if (value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
false,
batchSize,
frameSize,
frameSize,
1,
value.resetOutputValue,
frameSize,
value.stateWeight,
frameSize,
1,
value.gateValue + frameSize * 2,
frameSize * 3);
}
forward_final_output(
opFinalOutput, value, frameSize, batchSize, active_node);
#endif
}
};
template <DeviceType Device, class T>
struct GruGradFunctor {
template <class OpStateGrad, class OpResetGrad>
static void compute(OpStateGrad opStateGrad,
OpResetGrad opResetGrad,
hl_gru_value value,
hl_gru_grad grad,
int frameSize,
int batchSize,
hl_activation_mode_t active_node,
hl_activation_mode_t active_gate) {
#ifndef __NVCC__
backward_state_grad(
opStateGrad, value, grad, frameSize, batchSize, active_node);
if (value.prevOutValue && grad.prevOutGrad) {
BlasGemm<Device, T>::compute(false,
true,
batchSize,
frameSize,
frameSize,
1,
grad.gateGrad + frameSize * 2,
frameSize * 3,
value.stateWeight,
frameSize,
0,
grad.resetOutputGrad,
frameSize);
if (grad.stateWeightGrad) {
BlasGemm<Device, T>::compute(true,
false,
frameSize,
frameSize,
batchSize,
1,
value.resetOutputValue,
frameSize,
grad.gateGrad + frameSize * 2,
frameSize * 3,
1,
grad.stateWeightGrad,
frameSize);
}
}
backward_reset_grad(
opResetGrad, value, grad, frameSize, batchSize, active_gate);
if (grad.prevOutGrad && value.prevOutValue) {
BlasGemm<Device, T>::compute(false,
true,
batchSize,
frameSize,
frameSize * 2,
1,
grad.gateGrad,
frameSize * 3,
value.gateWeight,
frameSize * 2,
1,
grad.prevOutGrad,
frameSize);
if (grad.gateWeightGrad) {
BlasGemm<Device, T>::compute(true,
false,
frameSize,
frameSize * 2,
batchSize,
1,
value.prevOutValue,
frameSize,
grad.gateGrad,
frameSize * 3,
1,
grad.gateWeightGrad,
frameSize * 2);
}
}
#endif
}
};
} // namespace paddle
...@@ -94,95 +94,4 @@ public: ...@@ -94,95 +94,4 @@ public:
int paddingWidth); int paddingWidth);
}; };
template <class T>
struct Padding {
static void run(const T* src,
T* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
memcpy(dest, src, inputWidth * sizeof(T));
dest += inputWidth;
src += inputWidth;
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
}
}
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
struct Padding<float> {
static void run(const float* src,
float* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(src);
vst1q_f32(dest, s0);
src += 4;
dest += 4;
}
for (int r = 0; r < remain; r++) {
*dest++ = *src++;
}
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
}
}
};
#endif
} // namespace paddle } // namespace paddle
...@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "MulOp.h" #include "MulOp.h"
/// todo(tianbing), delete it #include "GemmFunctor.h"
#include <iostream>
#include "paddle/math/MathFunctions.h"
#include "paddle/math/SIMDFunctions.h" #include "paddle/math/SIMDFunctions.h"
#include "paddle/utils/ThreadLocal.h" #include "paddle/utils/ThreadLocal.h"
#ifndef PADDLE_TYPE_DOUBLE
#define GEMM paddle::gemm<float>
#else
#define GEMM paddle::gemm<double>
#endif
namespace { namespace {
inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
...@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out, ...@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
real scaleT, real scaleT,
bool aTrans, bool aTrans,
bool bTrans) { bool bTrans) {
GEMM(aTrans ? CblasTrans : CblasNoTrans, BlasGemm<DEVICE_TYPE_CPU, real>::compute(
bTrans ? CblasTrans : CblasNoTrans, aTrans,
out.getHeight(), bTrans,
out.getWidth(), out.getHeight(),
!aTrans ? a.getWidth() : a.getHeight(), out.getWidth(),
scaleAB, !aTrans ? a.getWidth() : a.getHeight(),
a.getData(), scaleAB,
a.getStride(), a.getData(),
b.getData(), a.getStride(),
b.getStride(), b.getData(),
scaleT, b.getStride(),
out.getData(), scaleT,
out.getStride()); out.getData(),
out.getStride());
} }
/// dense matrix (+)= sparse matrix * dense matrix /// dense matrix (+)= sparse matrix * dense matrix
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "SwitchOp.h"
#include "paddle/math/Vector.h"
namespace paddle {
template <>
void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argType) {
for (int n = 0; n < num; ++n) {
for (int c = 0; c < inC; ++c) {
for (int h = 0; h < inH; ++h) {
for (int w = 0; w < inW; ++w) {
if (argType == ADD_TO) {
outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
} else {
outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
}
}
}
}
}
}
template <>
void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
const real* inputs,
const int num,
const int inH,
const int inW,
const int inC,
const int argType) {
for (int n = 0; n < num; ++n) {
for (int h = 0; h < inH; ++h) {
for (int w = 0; w < inW; ++w) {
for (int c = 0; c < inC; ++c) {
if (argType == ADD_TO) {
outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
} else {
outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
}
}
}
}
}
}
/**
* \brief Switch dimension order of image input.
* The input and output is a 4D tensor. Switch order
* 'batch_size,channels, height, width' to
* order 'batch_size, height, width, channels'.
*
* Argument in this Function:
* \param inputs input data with order 'batch_size,channels, height, width'.
* \param outputs output data with order 'batch_size, height, width, channels'.
*/
template <DeviceType Device>
class NCHW2NHWCFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
CHECK_EQ(1UL, outputs.size());
size_t num = inputs[0].shape()[0];
size_t inC = inputs[0].shape()[1];
size_t inH = inputs[0].shape()[2];
size_t inW = inputs[0].shape()[3];
NCHW2NHWC<Device>(outputs[0].data<real>(),
inputs[0].data<real>(),
num,
inC,
inH,
inW,
outputs[0].getArgType());
}
};
/**
* \brief Switch dimension order of image input.
* The input and output is a 4D tensor. Switch order
* 'batch_size, height, width, channels' to
* order 'batch_size, channels, height, width'.
*
* Argument in this Function:
* \param inputs input data with order 'batch_size, height, width, channels'.
* \param outputs output data with order 'batch_size, channels, height, width'.
*/
template <DeviceType Device>
class NHWC2NCHWFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(1UL, inputs.size());
CHECK_EQ(1UL, outputs.size());
size_t num = inputs[0].shape()[0];
size_t inH = inputs[0].shape()[1];
size_t inW = inputs[0].shape()[2];
size_t inC = inputs[0].shape()[3];
NHWC2NCHW<Device>(outputs[0].data<real>(),
inputs[0].data<real>(),
num,
inH,
inW,
inC,
outputs[0].getArgType());
}
};
REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
* \brief This funtion switch dimension order of image input.
* The input and output is a 4D tensor. Switch order 'batch_size,
*channels, height, width' to
* order 'batch_size, height, width, channels'.
*
* \param[out] outputs save results.
* \param[in] inputs input data.
* \param[in] num batch size of input data.
* \param[in] inC channel number of input data.
* \param[in] inH height of input data.
* \param[in] inH with of input data.
* \param[in] argType type of output argument.
*/
template <DeviceType Device>
void NCHW2NHWC(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argtype);
/**
* \brief This funtion switch dimension order of image input.
* The input and output is a 4D tensor. Switch order 'batch_size,
*height, width, channels' to
* order 'batch_size, channels, height, width'.
*
* \param[out] inGrad gradients of previous layer.
* \param[in] outGrad output gradients.
* \param[in] num batch size of input data.
* \param[in] inH height of input data.
* \param[in] inW with of input data.
* \param[in] inC channel number of input data.
* \param[in] argType type of output argument.
*/
template <DeviceType Device>
void NHWC2NCHW(real* inGrad,
const real* outGrad,
const int num,
const int inH,
const int inW,
const int inC,
const int argType);
} // namespace paddle
/* Copyright (c) 2016 Paddle
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "SwitchOp.h"
#include "hl_base.h"
namespace paddle {
__global__ void KeNCHW2NHWC(real* outputs,
const real* inputs,
int inC,
int inH,
int inW,
int nthreads,
int argType) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) {
const int w = idx % inW;
const int h = (idx / inW) % inH;
const int c = (idx / inW / inH) % inC;
const int n = idx / inW / inH / inC;
const int off = ((n * inH + h) * inW + w) * inC + c;
if (argType == ADD_TO) {
outputs[off] += inputs[idx];
} else {
outputs[off] = inputs[idx];
}
}
}
template <>
void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
const real* inputs,
const int num,
const int inC,
const int inH,
const int inW,
const int argType) {
size_t nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
outputs, inputs, inC, inH, inW, nth, argType);
CHECK_SYNC("NCHW2NHWC");
}
__global__ void KeNHWC2NCHW(real* outputs,
const real* inputs,
int inH,
int inW,
int inC,
int nthreads,
int argType) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < nthreads) {
const int c = idx % inC;
const int w = (idx / inC) % inW;
const int h = (idx / inC / inW) % inH;
const int n = idx / inW / inH / inC;
const int off = ((n * inC + c) * inH + h) * inW + w;
if (argType == ADD_TO) {
outputs[off] += inputs[idx];
} else {
outputs[off] = inputs[idx];
}
}
}
template <>
void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
const real* inputs,
const int num,
const int inH,
const int inW,
const int inC,
const int argType) {
int nth = num * inC * inH * inW;
int blockSize = 1024;
int gridSize = (nth + 1024 - 1) / 1024;
KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
outputs, inputs, inH, inW, inC, nth, argType);
CHECK_SYNC("NHWC2NCHW");
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
namespace paddle {
TEST(Pad, real) {
for (size_t numSamples : {1, 4, 8, 16}) {
for (size_t channels : {1, 4, 8, 16}) {
for (size_t imgSizeH : {1, 4, 8, 16}) {
for (size_t imgSizeW : {1, 4, 8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
for (bool test_grad : {true, false}) {
CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
FuncConfig());
TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
compare.addInputs(
BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
compare.addOutputs(BufferArg(
VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
compare.run();
}
}
}
}
}
}
} // namespace paddle
...@@ -12,468 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,468 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "neon_util.h" #include "NeonDepthwiseConv.h"
#include "paddle/function/ConvOp.h" #include "paddle/function/ConvOp.h"
#include "paddle/function/Im2Col.h"
namespace paddle { namespace paddle {
namespace neon {
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <int filterSize, int stride>
struct DepthwiseConvKernel {};
inline float32_t conv3x3(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
return vaddvq_f32(tmp);
}
inline float32_t conv4x4(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t r3,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2,
float32x4_t k3) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
tmp = vmlaq_f32(tmp, r3, k3);
return vaddvq_f32(tmp);
}
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 1, 2, 3...] * K[0][0]
* R0[1, 2, 3, 4...] * K[0][1]
* R0[2, 3, 4, 5...] * K[0][2]
* R1[0, 1, 2, 3...] * K[1][0]
* R1[1, 2, 3, 4...] * K[1][1]
* R1[2, 3, 4, 5...] * K[1][2]
* R2[0, 1, 2, 3...] * K[2][0]
* R2[1, 2, 3, 4...] * K[2][1]
* + R2[2, 3, 4, 5...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0++;
r1++;
r2++;
outputData++;
}
r0 += 2;
r1 += 2;
r2 += 2;
}
}
}
};
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 2, 4, 6...] * K[0][0]
* R0[1, 3, 5, 7...] * K[0][1]
* R0[2, 4, 6, 8...] * K[0][2]
* R1[0, 2, 4, 6...] * K[1][0]
* R1[1, 3, 5, 7...] * K[1][1]
* R1[2, 4, 6, 8...] * K[1][2]
* R2[0, 2, 4, 6...] * K[2][0]
* R2[1, 3, 5, 7...] * K[2][1]
* R2[2, 4, 6, 8...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t data1;
float32x4x2_t data2;
data2 = vld2q_f32(r0);
input[0][0] = data2.val[0];
input[0][1] = data2.val[1];
data1 = vld1q_f32(r0 + 8);
input[0][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r1);
input[1][0] = data2.val[0];
input[1][1] = data2.val[1];
data1 = vld1q_f32(r1 + 8);
input[1][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r2);
input[2][0] = data2.val[0];
input[2][1] = data2.val[1];
data1 = vld1q_f32(r2 + 8);
input[2][2] = vextq_f32(data2.val[0], data1, 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0 += 2;
r1 += 2;
r2 += 2;
outputData++;
}
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
const float* r3 = r0 + inputWidth * 3;
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[0][3] = vextq_f32(input[0][0], tmp, 3);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[1][3] = vextq_f32(input[1][0], tmp, 3);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
input[2][3] = vextq_f32(input[2][0], tmp, 3);
input[3][0] = vld1q_f32(r3);
tmp = vld1q_f32(r3 + 4);
input[3][1] = vextq_f32(input[3][0], tmp, 1);
input[3][2] = vextq_f32(input[3][0], tmp, 2);
input[3][3] = vextq_f32(input[3][0], tmp, 3);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0++;
r1++;
r2++;
r3++;
outputData++;
}
r0 += 3;
r1 += 3;
r2 += 3;
r3 += 3;
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
const float* r3 = start + (2 * h + 3) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4x2_t data1;
float32x4x2_t data2;
data1 = vld2q_f32(r0);
data2 = vld2q_f32(r0 + 8);
input[0][0] = data1.val[0];
input[0][1] = data1.val[1];
input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r1);
data2 = vld2q_f32(r1 + 8);
input[1][0] = data1.val[0];
input[1][1] = data1.val[1];
input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r2);
data2 = vld2q_f32(r2 + 8);
input[2][0] = data1.val[0];
input[2][1] = data1.val[1];
input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r3);
data2 = vld2q_f32(r3 + 8);
input[3][0] = data1.val[0];
input[3][1] = data1.val[1];
input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;
outputData++;
}
}
}
}
};
template <DeviceType Device> template <DeviceType Device>
class NeonDepthwiseConvFunction : public ConvFunctionBase { class NeonDepthwiseConvFunction : public ConvFunctionBase {
public: public:
...@@ -497,16 +42,16 @@ public: ...@@ -497,16 +42,16 @@ public:
const TensorShape& filter = inputs[1].shape(); const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape(); const TensorShape& output = outputs[0].shape();
size_t batchSize = input[0]; int batchSize = input[0];
size_t inputChannels = input[1]; int inputChannels = input[1];
size_t inputHeight = input[2]; int inputHeight = input[2];
size_t inputWidth = input[3]; int inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter); int filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter); int filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1]; int outputChannels = output[1];
size_t outputHeight = output[2]; int outputHeight = output[2];
size_t outputWidth = output[3]; int outputWidth = output[3];
size_t filterMultiplier = outputChannels / groups_; int filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_); CHECK_EQ(inputChannels, groups_);
// only support strideH() == strideW() and filterHeight == filterWidth. // only support strideH() == strideW() and filterHeight == filterWidth.
...@@ -519,22 +64,19 @@ public: ...@@ -519,22 +64,19 @@ public:
// padding the input // padding the input
float* inputPadding = inputData; float* inputPadding = inputData;
int padInputHeight = inputHeight + 2 * paddingH();
int padInputWidth = inputWidth + 2 * paddingW();
if (paddingH() > 0 || paddingW() > 0) { if (paddingH() > 0 || paddingW() > 0) {
int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) * int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
(inputWidth + 2 * paddingW());
resizeBuffer<Device>(newSize); resizeBuffer<Device>(newSize);
inputPadding = reinterpret_cast<float*>(memory_->getBuf()); inputPadding = reinterpret_cast<float*>(memory_->getBuf());
Padding<float>::run(inputData, neon::Padding<float>::run(inputData,
inputPadding, inputPadding,
batchSize * inputChannels, batchSize * inputChannels,
inputHeight, inputHeight,
inputWidth, inputWidth,
paddingH(), padInputHeight,
paddingW()); padInputWidth);
// height and width of padding data
inputHeight += 2 * paddingH();
inputWidth += 2 * paddingW();
} }
std::function<void( std::function<void(
...@@ -542,36 +84,37 @@ public: ...@@ -542,36 +84,37 @@ public:
DepthWiseConv; DepthWiseConv;
if (filterWidth == 3 && strideW() == 1) { if (filterWidth == 3 && strideW() == 1) {
DepthWiseConv = DepthwiseConvKernel<3, 1>::run; DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
} else if (filterWidth == 3 && strideW() == 2) { } else if (filterWidth == 3 && strideW() == 2) {
DepthWiseConv = DepthwiseConvKernel<3, 2>::run; DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
} else if (filterWidth == 4 && strideW() == 1) { } else if (filterWidth == 4 && strideW() == 1) {
DepthWiseConv = DepthwiseConvKernel<4, 1>::run; DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
} else if (filterWidth == 4 && strideW() == 2) { } else if (filterWidth == 4 && strideW() == 2) {
DepthWiseConv = DepthwiseConvKernel<4, 2>::run; DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
} else { } else {
LOG(FATAL) << "Not supported"; LOG(FATAL) << "Not supported";
} }
for (size_t i = 0; i < batchSize; i++) { for (int i = 0; i < batchSize; i++) {
DepthWiseConv(inputPadding, DepthWiseConv(inputPadding,
filterData, filterData,
inputHeight, padInputHeight,
inputWidth, padInputWidth,
outputChannels, outputChannels,
outputHeight, outputHeight,
outputWidth, outputWidth,
filterMultiplier, filterMultiplier,
outputData); outputData);
inputPadding += inputChannels * inputHeight * inputWidth; inputPadding += inputChannels * padInputHeight * padInputWidth;
outputData += outputChannels * outputHeight * outputWidth; outputData += outputChannels * outputHeight * outputWidth;
} }
} }
}; };
#ifndef PADDLE_TYPE_DOUBLE
REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction); REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
#endif
#endif #endif
} // namespace neon
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string.h>
#include "neon_util.h"
namespace paddle {
namespace neon {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <int filterSize, int stride>
struct DepthwiseConvKernel {};
inline float32_t conv3x3(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
return vaddvq_f32(tmp);
}
inline float32_t conv4x4(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t r3,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2,
float32x4_t k3) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
tmp = vmlaq_f32(tmp, r3, k3);
return vaddvq_f32(tmp);
}
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 1, 2, 3...] * K[0][0]
* R0[1, 2, 3, 4...] * K[0][1]
* R0[2, 3, 4, 5...] * K[0][2]
* R1[0, 1, 2, 3...] * K[1][0]
* R1[1, 2, 3, 4...] * K[1][1]
* R1[2, 3, 4, 5...] * K[1][2]
* R2[0, 1, 2, 3...] * K[2][0]
* R2[1, 2, 3, 4...] * K[2][1]
* + R2[2, 3, 4, 5...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0++;
r1++;
r2++;
outputData++;
}
r0 += 2;
r1 += 2;
r2 += 2;
}
}
}
};
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 2, 4, 6...] * K[0][0]
* R0[1, 3, 5, 7...] * K[0][1]
* R0[2, 4, 6, 8...] * K[0][2]
* R1[0, 2, 4, 6...] * K[1][0]
* R1[1, 3, 5, 7...] * K[1][1]
* R1[2, 4, 6, 8...] * K[1][2]
* R2[0, 2, 4, 6...] * K[2][0]
* R2[1, 3, 5, 7...] * K[2][1]
* R2[2, 4, 6, 8...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t data1;
float32x4x2_t data2;
data2 = vld2q_f32(r0);
input[0][0] = data2.val[0];
input[0][1] = data2.val[1];
data1 = vld1q_f32(r0 + 8);
input[0][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r1);
input[1][0] = data2.val[0];
input[1][1] = data2.val[1];
data1 = vld1q_f32(r1 + 8);
input[1][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r2);
input[2][0] = data2.val[0];
input[2][1] = data2.val[1];
data1 = vld1q_f32(r2 + 8);
input[2][2] = vextq_f32(data2.val[0], data1, 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0 += 2;
r1 += 2;
r2 += 2;
outputData++;
}
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
const float* r3 = r0 + inputWidth * 3;
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[0][3] = vextq_f32(input[0][0], tmp, 3);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[1][3] = vextq_f32(input[1][0], tmp, 3);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
input[2][3] = vextq_f32(input[2][0], tmp, 3);
input[3][0] = vld1q_f32(r3);
tmp = vld1q_f32(r3 + 4);
input[3][1] = vextq_f32(input[3][0], tmp, 1);
input[3][2] = vextq_f32(input[3][0], tmp, 2);
input[3][3] = vextq_f32(input[3][0], tmp, 3);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0++;
r1++;
r2++;
r3++;
outputData++;
}
r0 += 3;
r1 += 3;
r2 += 3;
r3 += 3;
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
const float* r3 = start + (2 * h + 3) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4x2_t data1;
float32x4x2_t data2;
data1 = vld2q_f32(r0);
data2 = vld2q_f32(r0 + 8);
input[0][0] = data1.val[0];
input[0][1] = data1.val[1];
input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r1);
data2 = vld2q_f32(r1 + 8);
input[1][0] = data1.val[0];
input[1][1] = data1.val[1];
input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r2);
data2 = vld2q_f32(r2 + 8);
input[2][0] = data1.val[0];
input[2][1] = data1.val[1];
input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r3);
data2 = vld2q_f32(r3 + 8);
input[3][0] = data1.val[0];
input[3][1] = data1.val[1];
input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;
outputData++;
}
}
}
}
};
template <class T>
struct Padding {
static void run(const T* input,
T* inputPadding,
int channels,
int inputHeight,
int inputWidth,
int padInputHeight,
int padInputWidth) {
const int paddingHeight = (padInputHeight - inputHeight) / 2;
const int paddingWidth = (padInputWidth - inputWidth) / 2;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
inputPadding += padInputWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = T(0);
}
memcpy(inputPadding, input, inputWidth * sizeof(T));
inputPadding += inputWidth;
input += inputWidth;
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = T(0);
}
}
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
inputPadding += padInputWidth * paddingHeight;
}
}
}
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
struct Padding<float> {
static void run(const float* input,
float* inputPadding,
int channels,
int inputHeight,
int inputWidth,
int padInputHeight,
int padInputWidth) {
const int paddingHeight = (padInputHeight - inputHeight) / 2;
const int paddingWidth = (padInputWidth - inputWidth) / 2;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
inputPadding += padInputWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(input);
vst1q_f32(inputPadding, s0);
input += 4;
inputPadding += 4;
}
for (int r = 0; r < remain; r++) {
*inputPadding++ = *input++;
}
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = float(0);
}
}
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
inputPadding += padInputWidth * paddingHeight;
}
}
}
};
// for stride is 2
struct StridePadding {
static void run(const float* input,
float* inputPadding,
int channels,
int inputHeight,
int inputWidth,
int padInputHeight,
int padInputWidth) {
const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2;
const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
inputPadding += padInputWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
float32x4_t s1 = vdupq_n_f32(0.f);
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(input);
float32x4x2_t v = {s0, s1};
vst2q_f32(inputPadding, v);
input += 4;
inputPadding += 8;
}
for (int r = 0; r < remain; r++) {
*inputPadding++ = *input++;
*inputPadding++ = float(0);
}
inputPadding--;
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*inputPadding++ = float(0);
}
if (i != inputHeight - 1) {
memset(inputPadding, 0, padInputWidth * sizeof(float));
inputPadding += padInputWidth;
}
}
if (paddingHeight > 0) {
memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
inputPadding += padInputWidth * paddingHeight;
}
}
}
};
#endif
#endif
} // namespace neon
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "NeonDepthwiseConv.h"
#include "paddle/function/ConvOp.h"
namespace paddle {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <DeviceType Device>
class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
int batchSize = input[0];
int inputChannels = input[1];
int inputHeight = input[2];
int inputWidth = input[3];
int filterHeight = getFilterHeight(filter);
int filterWidth = getFilterWidth(filter);
int outputChannels = output[1];
int outputHeight = output[2];
int outputWidth = output[3];
int filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
// only support strideH() == strideW() and filterHeight == filterWidth.
CHECK_EQ(strideH(), strideW());
CHECK_EQ(paddingH(), paddingW());
CHECK_EQ(filterHeight, filterWidth);
float* inputData = inputs[0].data<float>();
float* filterData = inputs[1].data<float>();
float* outputData = outputs[0].data<float>();
// padding the input, input -> inputPadding
float* inputPadding = inputData;
int padInputHeight =
(inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
int padInputWidth =
(inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
resizeBuffer<Device>(newSize);
inputPadding = reinterpret_cast<float*>(memory_->getBuf());
if (strideH() == 1) {
neon::Padding<float>::run(inputData,
inputPadding,
batchSize * inputChannels,
inputHeight,
inputWidth,
padInputHeight,
padInputWidth);
} else if (strideH() == 2) {
neon::StridePadding::run(inputData,
inputPadding,
batchSize * inputChannels,
inputHeight,
inputWidth,
padInputHeight,
padInputWidth);
} else {
LOG(FATAL) << "Not supported";
}
}
std::function<void(
const float*, const float*, int, int, int, int, int, int, float*)>
DepthWiseConv;
if (filterWidth == 3) {
DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
} else if (filterWidth == 4) {
DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
} else {
LOG(FATAL) << "Not supported";
}
for (int i = 0; i < batchSize; i++) {
DepthWiseConv(inputPadding,
filterData,
padInputHeight,
padInputWidth,
outputChannels,
outputHeight,
outputWidth,
filterMultiplier,
outputData);
inputPadding += inputChannels * padInputHeight * padInputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
}
};
#ifndef PADDLE_TYPE_DOUBLE
REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
CPU,
NeonDepthwiseConvTransposeFunction);
#endif
#endif
} // namespace paddle
...@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) { ...@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
return vget_lane_f32(vpadd_f32(v, v), 0); return vget_lane_f32(vpadd_f32(v, v), 0);
} }
inline float32x4_t vmlaq_laneq_f32(float32x4_t a, #define vmlaq_laneq_f32(a, b, v, lane) \
float32x4_t b, vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
float32x4_t v,
const int lane) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
#endif #endif
} // namespace neon } // namespace neon
......
...@@ -83,8 +83,8 @@ void Conv3DLayer::forward(PassType passType) { ...@@ -83,8 +83,8 @@ void Conv3DLayer::forward(PassType passType) {
int outWidth = getSize(); int outWidth = getSize();
resetOutput(batchSize, outWidth); resetOutput(batchSize, outWidth);
REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) { for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
const MatrixPtr &inMat = getInputValue(i); const MatrixPtr &inMat = getInputValue(i);
const MatrixPtr &outMat = getOutputValue(); const MatrixPtr &outMat = getOutputValue();
int M = M_[i]; int M = M_[i];
...@@ -120,7 +120,6 @@ void Conv3DLayer::forward(PassType passType) { ...@@ -120,7 +120,6 @@ void Conv3DLayer::forward(PassType passType) {
} }
} }
if (nullptr != this->biasParameter_) { if (nullptr != this->biasParameter_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
this->addBias(); this->addBias();
} }
forwardActivation(); forwardActivation();
...@@ -134,15 +133,14 @@ void Conv3DLayer::backward(const UpdateCallback &callback) { ...@@ -134,15 +133,14 @@ void Conv3DLayer::backward(const UpdateCallback &callback) {
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
} }
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) { for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
if (weights_[i]->getWGrad()) { if (weights_[i]->getWGrad()) {
bpropWeights(i); bpropWeights(i);
} }
if (getInputGrad(i)) { if (getInputGrad(i)) {
bpropData(i); bpropData(i);
} }
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
weights_[i]->getParameterPtr()->incUpdate(callback); weights_[i]->getParameterPtr()->incUpdate(callback);
} }
} }
......
...@@ -84,8 +84,8 @@ void DeConv3DLayer::forward(PassType passType) { ...@@ -84,8 +84,8 @@ void DeConv3DLayer::forward(PassType passType) {
resetOutput(batchSize, outWidth); resetOutput(batchSize, outWidth);
const MatrixPtr outMat = getOutputValue(); const MatrixPtr outMat = getOutputValue();
REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
for (size_t i = 0; i != inputLayers_.size(); ++i) { for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
const MatrixPtr &inMat = getInputValue(i); const MatrixPtr &inMat = getInputValue(i);
int M = M_[i]; int M = M_[i];
int N = N_[i]; int N = N_[i];
...@@ -120,7 +120,6 @@ void DeConv3DLayer::forward(PassType passType) { ...@@ -120,7 +120,6 @@ void DeConv3DLayer::forward(PassType passType) {
} }
} }
if (nullptr != this->biasParameter_) { if (nullptr != this->biasParameter_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
this->addBias(); this->addBias();
} }
forwardActivation(); forwardActivation();
...@@ -133,12 +132,12 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) { ...@@ -133,12 +132,12 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
bpropBiases(); bpropBiases();
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
} }
REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
for (size_t i = 0; i < inputLayers_.size(); ++i) { for (size_t i = 0; i < inputLayers_.size(); ++i) {
if (weights_[i]->getWGrad() || this->needGradient_) { if (weights_[i]->getWGrad() || this->needGradient_) {
int M = M_[i]; int M = M_[i];
int N = N_[i]; int N = N_[i];
int K = K_[i]; int K = K_[i];
REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_); Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
const MatrixPtr &inMat = getInputValue(i); const MatrixPtr &inMat = getInputValue(i);
for (int n = 0; n < batchSize; ++n) { for (int n = 0; n < batchSize; ++n) {
...@@ -182,7 +181,6 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) { ...@@ -182,7 +181,6 @@ void DeConv3DLayer::backward(const UpdateCallback &callback) {
} }
} }
} }
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
weights_[i]->getParameterPtr()->incUpdate(callback); weights_[i]->getParameterPtr()->incUpdate(callback);
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "GruCompute.h" #include "GruCompute.h"
#include "hl_recurrent_apply.cuh" #include "hl_recurrent_apply.cuh"
#include "paddle/function/GruFunctor.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
namespace paddle { namespace paddle {
...@@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) { ...@@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) {
template <> template <>
void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
hl_cpu_gru_forward(hppl::forward::gru_resetOutput(), GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
hppl::forward::gru_finalOutput(), hppl::forward::gru_finalOutput(),
value, value,
frameSize, frameSize,
batchSize, batchSize,
activeNode_, activeNode_,
activeGate_); activeGate_);
} }
template <> template <>
...@@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value, ...@@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value,
hl_gru_grad grad, hl_gru_grad grad,
int frameSize, int frameSize,
int batchSize) { int batchSize) {
hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
hppl::backward::gru_resetGrad(), hppl::backward::gru_stateGrad(),
value, hppl::backward::gru_resetGrad(),
grad, value,
frameSize, grad,
batchSize, frameSize,
activeNode_, batchSize,
activeGate_); activeNode_,
activeGate_);
} }
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "SwitchOrderLayer.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(switch_order, SwitchOrderLayer);
bool SwitchOrderLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
auto& img_conf = config_.inputs(0).image_conf();
size_t inH =
img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
size_t inW = img_conf.img_size();
size_t inC = img_conf.channels();
inDims_ = TensorShape({0, inC, inH, inW});
outDims_ = TensorShape(4);
auto& reshape_conf = config_.reshape_conf();
for (size_t i = 0; i < reshape_conf.heightaxis_size(); i++) {
heightAxis_.push_back(reshape_conf.heightaxis(i));
}
for (size_t i = 0; i < reshape_conf.widthaxis_size(); i++) {
widthAxis_.push_back(reshape_conf.widthaxis(i));
}
createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
return true;
}
void SwitchOrderLayer::setOutDims() {
outDims_.setDim(0, inDims_[0]);
outDims_.setDim(1, inDims_[2]);
outDims_.setDim(2, inDims_[3]);
outDims_.setDim(3, inDims_[1]);
reshapeHeight_ = 1;
for (size_t i = 0; i < heightAxis_.size(); i++) {
reshapeHeight_ *= outDims_[heightAxis_[i]];
}
output_.setFrameHeight(reshapeHeight_);
reshapeWidth_ = 1;
for (size_t i = 0; i < widthAxis_.size(); i++) {
reshapeWidth_ *= outDims_[widthAxis_[i]];
}
output_.setFrameWidth(reshapeWidth_);
}
void SwitchOrderLayer::setInDims() {
MatrixPtr input = inputLayers_[0]->getOutputValue();
size_t batchSize = input->getHeight();
inDims_.setDim(0, batchSize);
int h = inputLayers_[0]->getOutput().getFrameHeight();
if (h != 0) inDims_.setDim(2, h);
int w = inputLayers_[0]->getOutput().getFrameWidth();
if (w != 0) inDims_.setDim(3, w);
int totalCount = input->getElementCnt();
int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
if (channels != 0) inDims_.setDim(1, channels);
}
void SwitchOrderLayer::forward(PassType passType) {
Layer::forward(passType);
setInDims();
setOutDims();
resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
if (heightAxis_.size() > 0) {
getOutputValue()->reshape(reshapeHeight_, reshapeWidth_);
getOutputGrad()->reshape(reshapeHeight_, reshapeWidth_);
}
// switch NCHW to NHWC
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getInputValue(0), inDims_);
outputs.addArg(*getOutputValue(), outDims_);
nchw2nhwc_[0]->calc(inputs, outputs);
forwardActivation();
}
void SwitchOrderLayer::backward(const UpdateCallback& callback) {
(void)callback;
backwardActivation();
// switch NHWC to NCHW
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getOutputGrad(), outDims_);
outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
nhwc2nchw_[0]->calc(inputs, outputs);
}
} // namespace paddle
...@@ -12,9 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #pragma once
#include "paddle/operators/scatter_op.h"
namespace ops = paddle::operators; #include "Layer.h"
REGISTER_OP_GPU_KERNEL(scatter,
ops::ScatterOpKernel<paddle::platform::GPUPlace, float>); namespace paddle {
/**
* \brief This layer calculate softmax in image channel dimension.
*/
class SwitchOrderLayer : public Layer {
public:
explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
~SwitchOrderLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
void setInDims();
void setOutDims();
protected:
std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
TensorShape inDims_;
TensorShape outDims_;
std::vector<int> heightAxis_;
std::vector<int> widthAxis_;
size_t reshapeHeight_;
size_t reshapeWidth_;
};
} // namespace paddle
...@@ -2008,6 +2008,31 @@ TEST(Layer, CropLayer) { ...@@ -2008,6 +2008,31 @@ TEST(Layer, CropLayer) {
} }
} }
TEST(Layer, SwitchOrderLayer) {
TestConfig config;
// config input_0
config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
ImageConfig* img = input->mutable_image_conf();
img->set_channels(4);
img->set_img_size(16);
img->set_img_size_y(16);
ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
reshape->add_heightaxis(0);
reshape->add_heightaxis(1);
reshape->add_heightaxis(2);
reshape->add_widthaxis(3);
// config softmax layer
config.layerConfig.set_type("switch_order");
config.layerConfig.set_name("switchOrderLayer");
for (auto useGpu : {false, true}) {
testLayerGrad(config, "switch_order", 100, false, useGpu, true);
}
}
vector<real> randSampling(real range, int n) { vector<real> randSampling(real range, int n) {
CHECK_GE(range, n); CHECK_GE(range, n);
vector<real> num(range); vector<real> num(range);
......
...@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) ...@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
namespace paddle { namespace paddle {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <> template <>
void gemm<float>(const CBLAS_TRANSPOSE transA, void gemm<float>(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const CBLAS_TRANSPOSE transB,
...@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA, ...@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
C, C,
ldc); ldc);
} }
#endif
template <> template <>
int getrf<float>(const CBLAS_ORDER order, int getrf<float>(const CBLAS_ORDER order,
...@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order, ...@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
} }
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <> template <>
void axpy<float>(const int n, const float alpha, const float* x, float* y) { void axpy<float>(const int n, const float alpha, const float* x, float* y) {
cblas_saxpy(n, alpha, x, 1, y, 1); cblas_saxpy(n, alpha, x, 1, y, 1);
...@@ -201,6 +204,7 @@ template <> ...@@ -201,6 +204,7 @@ template <>
double dotProduct<double>(const int n, const double* x, const double* y) { double dotProduct<double>(const int n, const double* x, const double* y) {
return cblas_ddot(n, x, 1, y, 1); return cblas_ddot(n, x, 1, y, 1);
} }
#endif
#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML) #if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
......
...@@ -40,7 +40,14 @@ extern "C" { ...@@ -40,7 +40,14 @@ extern "C" {
#ifndef LAPACK_FOUND #ifndef LAPACK_FOUND
extern "C" { extern "C" {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
#include <cblas.h> #include <cblas.h>
#else
typedef enum CBLAS_ORDER {
CblasRowMajor = 101,
CblasColMajor = 102
} CBLAS_ORDER;
#endif
int LAPACKE_sgetrf( int LAPACKE_sgetrf(
int matrix_layout, int m, int n, float* a, int lda, int* ipiv); int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
int LAPACKE_dgetrf( int LAPACKE_dgetrf(
...@@ -56,6 +63,7 @@ int LAPACKE_dgetri( ...@@ -56,6 +63,7 @@ int LAPACKE_dgetri(
namespace paddle { namespace paddle {
#ifndef PADDLE_USE_EIGEN_FOR_BLAS
template <class T> template <class T>
void gemm(const CBLAS_TRANSPOSE transA, void gemm(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const CBLAS_TRANSPOSE transB,
...@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA, ...@@ -70,6 +78,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
const T beta, const T beta,
T* C, T* C,
const int ldc); const int ldc);
#endif
template <class T> template <class T>
int getrf(const CBLAS_ORDER Order, int getrf(const CBLAS_ORDER Order,
...@@ -84,10 +93,21 @@ int getri( ...@@ -84,10 +93,21 @@ int getri(
const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
template <class T> template <class T>
void axpy(const int n, const T alpha, const T* x, T* y); void axpy(const int n, const T alpha, const T* x, T* y) {
/// y = y + alpha * x
for (int i = 0; i < n; i++) {
y[i] = y[i] + alpha * x[i];
}
}
template <class T> template <class T>
T dotProduct(const int n, const T* x, const T* y); T dotProduct(const int n, const T* x, const T* y) {
T result = static_cast<T>(0);
for (int i = 0; i < n; i++) {
result += x[i] * y[i];
}
return result;
}
template <class T> template <class T>
void vExp(const int n, const T* a, T* r); void vExp(const int n, const T* a, T* r);
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "hl_top_k.h" #include "hl_top_k.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/function/GemmFunctor.h"
#include "paddle/utils/ThreadLocal.h" #include "paddle/utils/ThreadLocal.h"
#include "SIMDFunctions.h" #include "SIMDFunctions.h"
...@@ -2773,24 +2774,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { ...@@ -2773,24 +2774,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
CHECK(!isTransposed()) << "Not supported"; CHECK(!isTransposed()) << "Not supported";
size_t a_col, b_col, a_row, b_row; size_t a_col, b_col, a_row, b_row;
CBLAS_TRANSPOSE a_trans, b_trans; bool a_trans, b_trans;
if (!a->isTransposed()) { if (!a->isTransposed()) {
a_col = a->getWidth(); a_col = a->getWidth();
a_row = a->getHeight(); a_row = a->getHeight();
a_trans = CblasNoTrans; a_trans = false;
} else { } else {
a_col = a->getHeight(); a_col = a->getHeight();
a_row = a->getWidth(); a_row = a->getWidth();
a_trans = CblasTrans; a_trans = true;
} }
if (!b->isTransposed()) { if (!b->isTransposed()) {
b_col = b->getWidth(); b_col = b->getWidth();
b_row = b->getHeight(); b_row = b->getHeight();
b_trans = CblasNoTrans; b_trans = false;
} else { } else {
b_col = b->getHeight(); b_col = b->getHeight();
b_row = b->getWidth(); b_row = b->getWidth();
b_trans = CblasTrans; b_trans = true;
} }
CHECK_EQ(a_col, b_row); CHECK_EQ(a_col, b_row);
...@@ -2807,7 +2808,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { ...@@ -2807,7 +2808,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
int lda = a->getStride(); int lda = a->getStride();
int ldb = b->getStride(); int ldb = b->getStride();
int ldc = getStride(); int ldc = getStride();
gemm<real>( BlasGemm<DEVICE_TYPE_CPU, real>::compute(
a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc); a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
} }
......
...@@ -1616,6 +1616,10 @@ public: ...@@ -1616,6 +1616,10 @@ public:
}; };
class CpuMatrix : public Matrix { class CpuMatrix : public Matrix {
private:
MatrixPtr sftmaxSum_;
MatrixPtr sftmaxDot_;
public: public:
CpuMatrix(size_t height, size_t width, bool trans = false); CpuMatrix(size_t height, size_t width, bool trans = false);
CpuMatrix(real* data, size_t height, size_t width, bool trans = false) CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
......
...@@ -14,27 +14,31 @@ function(op_library TARGET) ...@@ -14,27 +14,31 @@ function(op_library TARGET)
cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN}) "${multiValueArgs}" ${ARGN})
foreach(src ${op_library_SRCS}) list(LENGTH op_library_SRCS op_library_SRCS_len)
if (${src} MATCHES ".*\\.cu$") if (${op_library_SRCS_len} EQUAL 0)
list(APPEND cu_srcs ${src}) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${TARGET}.cc)
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif() endif()
endforeach() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
else()
message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
endif()
endforeach()
endif()
list(LENGTH cc_srcs cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len)
if (${cc_srcs_len} EQUAL 0) if (${cc_srcs_len} EQUAL 0)
message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
endif() endif()
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH op_library_DEPS dep_len)
if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0)
message(WARNING "The op library ${TARGET} not support GPU!")
endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
...@@ -46,22 +50,22 @@ endfunction() ...@@ -46,22 +50,22 @@ endfunction()
add_subdirectory(math) add_subdirectory(math)
list(REMOVE_ITEM GENERAL_OPS set(DEPS_OPS
net_op identity_op
minus_op minus_op
mul_op mul_op
recurrent_op recurrent_op
scale_op) scale_op)
op_library(identity_op DEPS scale_op)
op_library(net_op SRCS net_op.cc) op_library(minus_op DEPS scale_op)
op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op) op_library(mul_op DEPS math_function)
op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS framework_proto tensor operator net_op) DEPS framework_proto tensor operator net_op)
op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op) op_library(scale_op DEPS net_op)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
op_library(${src} SRCS ${src}.cc ${src}.cu) op_library(${src})
endforeach() endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
...@@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel { ...@@ -57,7 +57,6 @@ class AddOpGrad : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker, add_two_grad, ops::AddOpGrad); REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
REGISTER_OP_CPU_KERNEL(add_two, REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);
ops::AddKernel<paddle::platform::CPUPlace, float>);
...@@ -12,10 +12,7 @@ ...@@ -12,10 +12,7 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/framework/op_registry.h"
#include "paddle/operators/add_op.h" #include "paddle/operators/add_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(add_two, REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
ops::AddKernel<paddle::platform::GPUPlace, float>);
...@@ -23,6 +23,9 @@ using Tensor = framework::Tensor; ...@@ -23,6 +23,9 @@ using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class CosSimKernel : public framework::OpKernel { class CosSimKernel : public framework::OpKernel {
...@@ -43,14 +46,14 @@ class CosSimKernel : public framework::OpKernel { ...@@ -43,14 +46,14 @@ class CosSimKernel : public framework::OpKernel {
auto new_dims = framework::make_ddim({dims[0], size / dims[0]}); auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
auto x = EigenMatrix<T>::From(*input_x, new_dims); auto x = EigenMatrix<T>::From(*input_x, new_dims);
auto y = EigenMatrix<T>::From(*input_y, new_dims); auto y = EigenMatrix<T>::From(*input_y, new_dims);
auto z = EigenMatrix<T>::From(*output_z); auto z = EigenVector<T>::Flatten(*output_z);
auto x_norm = EigenMatrix<T>::From(*output_x_norm); auto x_norm = EigenVector<T>::Flatten(*output_x_norm);
auto y_norm = EigenMatrix<T>::From(*output_y_norm); auto y_norm = EigenVector<T>::Flatten(*output_y_norm);
auto place = context.GetEigenDevice<Place>(); auto place = context.GetEigenDevice<Place>();
auto xy = (x * y).sum(Eigen::array<int, 1>({1})); auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({1})).sqrt(); x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({1})).sqrt(); y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
z.device(place) = xy / x_norm / y_norm; z.device(place) = xy / x_norm / y_norm;
} }
}; };
......
...@@ -19,12 +19,12 @@ template <typename T> ...@@ -19,12 +19,12 @@ template <typename T>
class CPUGaussianRandomKernel : public framework::OpKernel { class CPUGaussianRandomKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
float mean = context.GetAttr<float>("mean"); float mean = context.Attr<float>("mean");
float std = context.GetAttr<float>("std"); float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed")); unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine; std::minstd_rand engine;
if (seed == 0) { if (seed == 0) {
seed = std::random_device()(); seed = std::random_device()();
...@@ -45,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -45,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(const framework::InferShapeContext& context) const override { void InferShape(const framework::InferShapeContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
auto dims = GetAttr<std::vector<int>>("dims"); auto dims = Attr<std::vector<int>>("dims");
std::vector<int64_t> temp; std::vector<int64_t> temp;
temp.reserve(dims.size()); temp.reserve(dims.size());
for (auto dim : dims) { for (auto dim : dims) {
......
...@@ -42,13 +42,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { ...@@ -42,13 +42,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed")); unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
if (seed == 0) { if (seed == 0) {
std::random_device rd; std::random_device rd;
seed = rd(); seed = rd();
} }
T mean = static_cast<T>(context.GetAttr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.GetAttr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
ssize_t N = framework::product(tensor->dims()); ssize_t N = framework::product(tensor->dims());
thrust::transform(index_sequence_begin, index_sequence_begin + N, thrust::transform(index_sequence_begin, index_sequence_begin + N,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/net_op.h"
#include "paddle/operators/scale_op.h"
namespace paddle {
namespace operators {
// identity is a alias of scale op. This is also a example for creating a alias
// operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IdentityOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "input tensor of identity op");
AddOutput("Out", "output tensor of identity op");
AddComment("identity operator. Just a alias of scale op which scale = 1.0");
}
};
template <typename AttrType>
class IdentityOp : public NetOp {
public:
IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
{{"scale", static_cast<AttrType>(1)}}));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
ops::IdentityOpMaker<float>);
if(WITH_GPU) if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu DEPS cblas device_context) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
im2col.cu DEPS cblas device_context)
else() else()
cc_library(math_function SRCS math_function.cc DEPS cblas device_context) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context)
endif() endif()
nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/im2col.h"
namespace paddle {
namespace operators {
namespace math {
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, T> {
public:
void operator()(const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
int channels_col = input_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) < 0 ||
(im_row_idx - padding_height) >= input_height ||
(im_col_idx - padding_width) < 0 ||
(im_col_idx - padding_width) >= input_width) {
col_data[(c * output_height + h) * output_width + w] = T(0);
} else {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
col_data[(c * output_height + h) * output_width + w] =
im_data[im_row_idx * input_width + im_col_idx];
}
}
}
}
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, T> {
public:
void operator()(framework::Tensor& im, const framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
int channels_col = input_channels * filter_height * filter_width;
T* im_data = im.data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) >= 0 &&
(im_row_idx - padding_height) < input_height &&
(im_col_idx - padding_width) >= 0 &&
(im_col_idx - padding_width) < input_width) {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
im_data[im_row_idx * input_width + im_col_idx] +=
col_data[(c * output_height + h) * output_width + w];
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUPlace, double>;
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, T> {
public:
void operator()(const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
const T* im_data = im.data<T>();
T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_height;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
int col_offset = (((col_row_idx * output_width + col_col_idx) *
input_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
if (im_row_offset < 0 || im_row_offset >= input_height ||
im_col_offset < 0 || im_col_offset >= input_width) {
col_data[col_offset] = T(0);
} else {
int im_offset =
(channel * input_height + im_row_offset) * input_width +
im_col_offset;
col_data[col_offset] = im_data[im_offset];
}
}
}
}
}
}
}
};
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, T> {
public:
void operator()(framework::Tensor& im, const framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
T* im_data = im.data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < output_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < output_width; ++col_col_idx) {
for (int channel = 0; channel < input_channels; ++channel) {
for (int filter_row_idx = 0; filter_row_idx < filter_height;
++filter_row_idx) {
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_height;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
int col_offset = (((col_row_idx * output_width + col_col_idx) *
input_channels +
channel) *
filter_height +
filter_row_idx) *
filter_width +
filter_col_idx;
if (im_row_offset >= 0 && im_row_offset < input_height &&
im_col_offset >= 0 && im_col_offset < input_width) {
int im_offset =
(channel * input_height + im_row_offset) * input_width +
im_col_offset;
im_data[im_offset] += col_data[col_offset];
}
}
}
}
}
}
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/im2col.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <class T>
__global__ void im2col(const T* data_im, int num_outs, int height, int width,
int filter_height, int filter_width, int stride_height,
int stride_width, int padding_height, int padding_width,
int output_height, int output_width, T* data_col) {
int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < num_outs) {
int w_out = index % output_width;
index /= output_width;
int h_out = index % output_height;
int channel_in = index / output_height;
int channel_out = channel_in * filter_height * filter_width;
int h_in = h_out * stride_height;
int w_in = w_out * stride_width;
data_col += (channel_out * output_height + h_out) * output_width + w_out;
for (int i = 0; i < filter_height; ++i) {
for (int j = 0; j < filter_width; ++j) {
int rIdx = int(h_in + i);
int cIdx = int(w_in + j);
if ((rIdx - (int)padding_height) >= (int)height ||
(rIdx - (int)padding_height) < 0 ||
(cIdx - (int)padding_width) >= (int)width ||
(cIdx - (int)padding_width) < 0) {
*data_col = 0;
} else {
rIdx = rIdx + channel_in * height - padding_height;
cIdx = cIdx - padding_width;
*data_col = data_im[rIdx * width + cIdx];
}
data_col += output_height * output_width;
}
}
}
}
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, T> {
public:
void operator()(const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
int num_outputs = input_channels * output_height * output_width;
int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512;
int block_y = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1);
dim3 grid(block_x, block_y);
im2col<T><<<
grid, threads, 0,
reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
im.data<T>(), num_outputs, input_height, input_width, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, col.data<T>());
}
};
template <class T>
__global__ void col2im(size_t n, const T* data_col, size_t height, size_t width,
size_t channels, size_t filter_height,
size_t filter_width, size_t stride_height,
size_t stride_width, size_t padding_height,
size_t padding_width, size_t output_height,
size_t output_width, T* data_im) {
size_t index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < n) {
T val = 0;
int w = int(index % width);
int h = int((index / width) % height);
int c = int(index / (width * height));
if ((w - (int)padding_width) >= 0 &&
(w - (int)padding_width) < (width - 2 * padding_width) &&
(h - (int)padding_height) >= 0 &&
(h - padding_height) < (height - 2 * padding_height)) {
// compute the start and end of the output
int w_col_start = (w < (int)filter_width)
? 0
: (w - int(filter_width)) / (int)stride_width + 1;
int w_col_end =
min((int)(w / (int)stride_width + 1), (int)(output_width));
int h_col_start = (h < (int)filter_height)
? 0
: (h - (int)filter_height) / (int)stride_height + 1;
int h_col_end = min(int(h / stride_height + 1), int(output_height));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = int(c * filter_height * filter_width) +
(h - h_col * (int)stride_height) * (int)filter_width +
(w - w_col * (int)stride_width);
val +=
data_col[(c_col * output_height + h_col) * output_width + w_col];
}
}
h -= padding_height;
w -= padding_width;
data_im[c * ((width - 2 * padding_width) *
(height - 2 * padding_height)) +
h * (width - 2 * padding_width) + w] += val;
}
}
}
/*
* im = [input_channels, input_height, input_width]
* col =
* [input_channels, filter_height, filter_width, output_height, output_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, T> {
public:
void operator()(framework::Tensor& im, const framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[1];
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
size_t num_kernels = input_channels * (input_height + 2 * padding_height) *
(input_width + 2 * padding_width);
size_t blocks = (num_kernels + 1024 - 1) / 1024;
size_t block_x = 512;
size_t block_y = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1);
dim3 grid(block_x, block_y);
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im<T><<<
grid, threads, 0,
reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
num_kernels, col.data<T>(), input_height + 2 * padding_height,
input_width + 2 * padding_width, input_channels, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, im.data<T>());
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::GPUPlace, double>;
template <class T>
__global__ void im2colOCF(const T* im_data, T* col_data, int input_channels,
int input_height, int input_width, int filter_height,
int filter_width, int stride_height, int stride_width,
int padding_height, int padding_width,
int output_height, int output_width) {
int swid = blockIdx.x;
int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels;
channelid += blockDim.z) {
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width;
int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width +
channelid * input_height * input_width;
int col_offset = idx + idy * filter_width +
channelid * filter_height * filter_width +
(shid * output_width + swid) *
(input_channels * filter_height * filter_width);
if (height_offset >= input_height || height_offset < 0 ||
width_offset >= input_width || width_offset < 0) {
col_data[col_offset] = T(0);
} else {
col_data[col_offset] = im_data[im_offset];
}
}
}
}
}
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, T> {
public:
void operator()(const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
block_dim_x = 4;
block_dim_y = 4;
} else if (filter_height <= 8 && filter_width <= 8) {
block_dim_x = 8;
block_dim_y = 8;
} else if (filter_height <= 16 && filter_width <= 16) {
block_dim_x = 16;
block_dim_y = 16;
} else {
block_dim_x = 32;
block_dim_y = 32;
}
int block_dim_z = 1024 / block_dim_x / block_dim_y;
dim3 threads(block_dim_x, block_dim_y,
std::min(block_dim_z, input_channels));
dim3 grid(output_width, output_height);
im2colOCF<T><<<
grid, threads, 0,
reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width);
}
};
template <class T>
__global__ void col2imOCF(T* im_data, const T* col_data, int input_channels,
int input_height, int input_width, int filter_height,
int filter_width, int stride_height, int stride_width,
int padding_height, int padding_width,
int output_height, int output_width) {
int swid = blockIdx.x;
int shid = blockIdx.y;
for (int channelid = threadIdx.z; channelid < input_channels;
channelid += blockDim.z) {
for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
int width_offset = idx + swid * stride_width - padding_width;
int height_offset = idy + shid * stride_height - padding_height;
int im_offset = width_offset + height_offset * input_width +
channelid * input_height * input_width;
int col_offset = idx + idy * filter_width +
channelid * filter_height * filter_width +
(shid * output_width + swid) *
(input_channels * filter_height * filter_width);
if (height_offset >= 0 && height_offset < input_height &&
width_offset >= 0 && width_offset < input_width) {
paddle::platform::CudaAtomicAdd(im_data + im_offset,
col_data[col_offset]);
}
}
}
}
}
/*
* im = [input_channels, input_height, input_width]
* col =
* [output_height, output_width, input_channels, filter_height, filter_width]
*/
template <class T>
class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, T> {
public:
void operator()(framework::Tensor& im, const framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
int input_height = im.dims()[1];
int input_width = im.dims()[2];
int filter_height = col.dims()[3];
int filter_width = col.dims()[4];
int output_height = col.dims()[0];
int output_width = col.dims()[1];
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
block_dim_x = 4;
block_dim_y = 4;
} else if (filter_height <= 8 && filter_width <= 8) {
block_dim_x = 8;
block_dim_y = 8;
} else if (filter_height <= 16 && filter_width <= 16) {
block_dim_x = 16;
block_dim_y = 16;
} else {
block_dim_x = 32;
block_dim_y = 32;
}
int block_dim_z = 1024 / block_dim_x / block_dim_y;
dim3 threads(block_dim_x, block_dim_y,
std::min(block_dim_z, input_channels));
dim3 grid(output_width, output_height);
col2imOCF<T><<<
grid, threads, 0,
reinterpret_cast<platform::CUDADeviceContext*>(context)->stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width);
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
enum class ColFormat { kCFO = 0, kOCF = 1 };
/*
* \brief Converts the image data of three dimensions(CHW) into a colData of
* five dimensions in the Im2ColFunctor calculation,
* And in the Col2ImFunctor calculation, it is reversed.
*
* \param imData Image data.
* \param imShape The shape of imData,
* [input_channels, input_height, input_width].
* \param colData Column data.
* \param colShape The shape of colData.
*
* If the template argument Format is kCFO, the shape of colData is:
* [input_channels, filter_height, filter_width, output_height, output_width]
* So, it is easy to reshape into a convolution matrix for convolution
* calculation based on matrix multiplication.
* The shape of convolution matrix is [height, width], where the height is equal
* input_channels * filter_height * filter_width, and the width is equal
* output_height * output_width.
*
* Reshape:
* shape of colData shape of convolution matrix
* [input_channels,
* filter_height,
* filter_width, ======> [height, width]
* output_height,
* output_width]
*
* If the template argument Format is kOCF, the shape of colData is:
* [output_height, output_width, input_channels, filter_height, filter_width]
* So, it is easy to reshape into a sequence matrix for rnn calculation.
* The shape of sequence matrix is [seq_length, step_size], where the seq_length
* is equal output_height * output_width, and the step_size is equal
* input_channels * filter_height * filter_width.
*
* Reshape:
* shape of colData shape of sequence matrix
* [output_height,
* output_width,
* input_channels, ======> [seqLength, stepSize]
* filter_height,
* filter_width]
*
* \note The caller needs to ensure that imShape.inputChannels is equal to
* colShape.inputChannels.
*/
template <ColFormat Format, typename Place, typename T>
class Im2ColFunctor {
public:
void operator()(const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context);
};
template <ColFormat Format, typename Place, typename T>
class Col2ImFunctor {
public:
void operator()(framework::Tensor& im, const framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width, platform::DeviceContext* context);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/im2col.h"
#include <gtest/gtest.h>
#include <iostream>
template <typename Place>
void testIm2col() {
paddle::framework::Tensor input_tmp;
paddle::framework::Tensor input;
paddle::framework::Tensor output_cfo;
paddle::framework::Tensor output_ocf;
paddle::framework::Tensor output_tmp;
/**
* input = [0, 1, 2,
* 3, 4, 5]
*
* output_cfo = [0, 1
* 1, 2
* 3, 4
* 4, 5]
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*/
int input_height = 2;
int input_width = 3;
int filter_size = 2;
int stride = 1;
int padding = 0;
int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
float* input_ptr = input_tmp.mutable_data<float>(
{1, input_height, input_width}, paddle::platform::CPUPlace());
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input_ptr, arr, 6 * sizeof(float));
auto* place = new Place();
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, float>
im2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kOCF, Place, float>
im2col_ocf;
paddle::platform::DeviceContext* context;
if (paddle::platform::is_cpu_place(*place)) {
context =
new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
} else {
context =
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
}
im2col(input, output_cfo, stride, stride, padding, padding, context);
im2col_ocf(input, output_ocf, stride, stride, padding, padding, context);
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
out_cfo_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_cfo_ptr[0], 0);
EXPECT_EQ(out_cfo_ptr[1], 1);
EXPECT_EQ(out_cfo_ptr[2], 1);
EXPECT_EQ(out_cfo_ptr[3], 2);
EXPECT_EQ(out_cfo_ptr[4], 3);
EXPECT_EQ(out_cfo_ptr[5], 4);
EXPECT_EQ(out_cfo_ptr[6], 4);
EXPECT_EQ(out_cfo_ptr[7], 5);
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
out_ocf_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_ocf_ptr[0], 0);
EXPECT_EQ(out_ocf_ptr[1], 1);
EXPECT_EQ(out_ocf_ptr[2], 3);
EXPECT_EQ(out_ocf_ptr[3], 4);
EXPECT_EQ(out_ocf_ptr[4], 1);
EXPECT_EQ(out_ocf_ptr[5], 2);
EXPECT_EQ(out_ocf_ptr[6], 4);
EXPECT_EQ(out_ocf_ptr[7], 5);
}
TEST(math, im2col) {
testIm2col<paddle::platform::CPUPlace>();
#ifndef PADDLE_ONLY_CPU
testIm2col<paddle::platform::GPUPlace>();
#endif
}
\ No newline at end of file
...@@ -109,7 +109,7 @@ void InitArgument(const ArgumentName& name, Argument* arg, ...@@ -109,7 +109,7 @@ void InitArgument(const ArgumentName& name, Argument* arg,
arg->step_scopes = op.Output(name.step_scopes); arg->step_scopes = op.Output(name.step_scopes);
auto inlinks = op.Inputs(name.inlinks); auto inlinks = op.Inputs(name.inlinks);
auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias); auto inlink_alias = op.Attr<std::vector<std::string>>(name.inlink_alias);
PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(), PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
"the size of inlinks and inlink_alias don't match:%d,%d", "the size of inlinks and inlink_alias don't match:%d,%d",
inlinks.size(), inlink_alias.size()); inlinks.size(), inlink_alias.size());
...@@ -121,7 +121,7 @@ void InitArgument(const ArgumentName& name, Argument* arg, ...@@ -121,7 +121,7 @@ void InitArgument(const ArgumentName& name, Argument* arg,
} }
auto outlinks = op.Outputs(name.outlinks); auto outlinks = op.Outputs(name.outlinks);
auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias); auto outlink_alias = op.Attr<std::vector<std::string>>(name.outlink_alias);
PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(), PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
"the size of outlinks and outlink_alias don't match:%d,%d", "the size of outlinks and outlink_alias don't match:%d,%d",
outlinks.size(), outlink_alias.size()); outlinks.size(), outlink_alias.size());
...@@ -135,8 +135,8 @@ void InitArgument(const ArgumentName& name, Argument* arg, ...@@ -135,8 +135,8 @@ void InitArgument(const ArgumentName& name, Argument* arg,
auto boot_memories = op.Inputs(name.boot_memories); auto boot_memories = op.Inputs(name.boot_memories);
// attributes // attributes
auto memories = op.GetAttr<std::vector<std::string>>(name.memories); auto memories = op.Attr<std::vector<std::string>>(name.memories);
auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories); auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
PADDLE_ENFORCE(memories.size() == boot_memories.size(), PADDLE_ENFORCE(memories.size() == boot_memories.size(),
"the size of memories, boot_memories don't match:%d,%d", "the size of memories, boot_memories don't match:%d,%d",
......
...@@ -48,7 +48,7 @@ The equation is: Out = scale*X ...@@ -48,7 +48,7 @@ The equation is: Out = scale*X
} }
}; };
// Identity Op's gradient is identity op, too. // Scale Op's gradient is scale op, too.
// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out)) // Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
template <typename AttrType> template <typename AttrType>
class ScaleGradOp : public NetOp { class ScaleGradOp : public NetOp {
...@@ -60,38 +60,11 @@ class ScaleGradOp : public NetOp { ...@@ -60,38 +60,11 @@ class ScaleGradOp : public NetOp {
AppendOp(framework::OpRegistry::CreateOp( AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input(framework::GradVarName("Out"))}}}, "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
{{"Out", {Output(framework::GradVarName("X"))}}}, {{"Out", {Output(framework::GradVarName("X"))}}},
{{"scale", GetAttr<AttrType>("scale")}})); {{"scale", Attr<AttrType>("scale")}}));
CompleteAddOp(false); CompleteAddOp(false);
} }
}; };
// identity is a alias of scale op. This is also a example for creating a alias
// operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IdentityOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "input tensor of identity op");
AddOutput("Out", "output tensor of identity op");
AddComment("identity operator. Just a alias of scale op which scale = 1.0");
}
};
template <typename AttrType>
class IdentityOp : public NetOp {
public:
IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
{{"scale", static_cast<AttrType>(1)}}));
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -101,5 +74,3 @@ REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad, ...@@ -101,5 +74,3 @@ REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
ops::ScaleGradOp<float>); ops::ScaleGradOp<float>);
REGISTER_OP_CPU_KERNEL(scale, REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>); ops::ScaleKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
ops::IdentityOpMaker<float>);
...@@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel { ...@@ -27,7 +27,7 @@ class ScaleKernel : public framework::OpKernel {
auto* in = context.Input<framework::Tensor>("X"); auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place()); tensor->mutable_data<T>(in->place());
auto scale = static_cast<T>(context.GetAttr<AttrType>("scale")); auto scale = static_cast<T>(context.Attr<AttrType>("scale"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor); auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in); auto eigen_in = framework::EigenVector<T>::Flatten(*in);
......
...@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel { ...@@ -31,7 +31,7 @@ class SGDOpKernel : public framework::OpKernel {
auto param = ctx.Input<Tensor>("param"); auto param = ctx.Input<Tensor>("param");
auto grad = ctx.Input<Tensor>("grad"); auto grad = ctx.Input<Tensor>("grad");
auto param_out = ctx.Output<Tensor>("param_out"); auto param_out = ctx.Output<Tensor>("param_out");
float lr = ctx.GetAttr<float>("learning_rate"); float lr = ctx.Attr<float>("learning_rate");
param_out->mutable_data<T>(ctx.GetPlace()); param_out->mutable_data<T>(ctx.GetPlace());
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/squared_l2_distance_op.h"
namespace paddle {
namespace operators {
class SquaredL2DistanceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext& ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"),
"Input of SquaredL2DistanceOp "
"must be initialized.");
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"),
"Target of SquaredL2DistanceOp "
"must be initialized.");
auto* x = ctx.Input<Tensor>("X");
auto x_dims = x->dims();
auto* y = ctx.Input<Tensor>("Y");
auto y_dims = y->dims();
PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims),
"Tensor rank of both SquaredL2DistanceOp's "
"inputs must be same.");
int rank = framework::arity(x_dims);
PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
PADDLE_ENFORCE_EQ(framework::product(x_dims) / x_dims[0],
framework::product(y_dims) / y_dims[0],
"Product of dimensions expcet the first dimension of "
"input and target must be equal.");
PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
"First dimension of target must be equal to input "
"or to 1.");
ctx.Output<Tensor>("sub_result")
->Resize({static_cast<int>(x_dims[0]),
static_cast<int>(framework::product(x_dims) / x_dims[0])});
ctx.Output<Tensor>("Out")->Resize({x_dims[0], 1});
}
};
class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SquaredL2DistanceOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of SquaredL2DistanceOp.");
AddInput("Y", "Target of SquaredL2DistanceOp.");
AddOutput("sub_result",
"Buffering substraction result which "
"will be reused in backward.")
.AsIntermediate();
AddOutput("Out", "Squared l2 distance between input and target.");
AddComment(R"DOC(
SquaredL2DistanceOp will cacluate the squared L2 distance for
input and target. Number of distance value equals to the
first dimension of input. First dimension of target could be equal to
input or to 1. If the first dimension of target is 1, SquaredL2DistanceOp
will broadcast target's first dimension to input's first dimension.
You can decide whether calculate the gradient of input and target.
)DOC");
}
};
class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext& ctx) const override {
PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
"Gradient of Out should not be null");
auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
"First dimension of output gradient and "
"input value must be equal.");
PADDLE_ENFORCE_EQ(out_dims[1], 1,
"Second dimension of output gradient "
"must be 1.");
auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* y_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (x_grad) x_grad->Resize(x_dims);
if (y_grad) y_grad->Resize(y_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
ops::SquaredL2DistanceGradOp);
REGISTER_OP_CPU_KERNEL(
squared_l2_distance,
ops::SquaredL2DistanceKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
squared_l2_distance_grad,
ops::SquaredL2DistanceGradKernel<paddle::platform::CPUPlace, float>);
...@@ -13,8 +13,13 @@ ...@@ -13,8 +13,13 @@
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include "paddle/operators/gather_op.h"
#include "paddle/operators/squared_l2_distance_op.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gather, REGISTER_OP_GPU_KERNEL(
ops::GatherOpKernel<paddle::platform::GPUPlace, float>); squared_l2_distance,
ops::SquaredL2DistanceKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
squared_l2_distance_grad,
ops::SquaredL2DistanceGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class SquaredL2DistanceKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in0 = context.Input<Tensor>("X");
auto* in1 = context.Input<Tensor>("Y");
auto* out0 = context.Output<Tensor>("sub_result");
auto* out1 = context.Output<Tensor>("Out");
auto in0_dims = in0->dims();
auto in1_dims = in1->dims();
int cols = framework::product(in0_dims) / in0_dims[0];
// reduce dimensions except the first
auto x =
EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
auto y =
EigenMatrix<T>::From(*in1, framework::make_ddim({in1_dims[0], cols}));
out0->mutable_data<T>(context.GetPlace());
out1->mutable_data<T>(context.GetPlace());
auto sub_result = EigenMatrix<T>::From(*out0);
auto z = EigenVector<T>::Flatten(*out1);
auto place = context.GetEigenDevice<Place>();
auto x_dims = x.dimensions();
auto y_dims = y.dimensions();
// buffer the substraction result
if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) {
sub_result.device(place) =
x -
y.broadcast(Eigen::array<int, 2>({{static_cast<int>(x_dims[0]), 1}}));
} else {
sub_result.device(place) = x - y;
}
auto sub_res_pow2 = sub_result * sub_result;
z.device(place) = sub_res_pow2.sum(Eigen::array<int, 1>({{1}}));
}
};
template <typename Place, typename T>
class SquaredL2DistanceGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in0 = context.Input<Tensor>("sub_result");
auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));
auto sub_result = EigenMatrix<T>::From(*in0);
auto out_grad = EigenMatrix<T>::From(*in1);
auto x_dims = x_g->dims();
auto y_dims = y_g->dims();
int cols = framework::product(x_dims) / x_dims[0];
// calculate gradient
auto grad_mat = 2 *
(out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) *
sub_result;
// propagate back to input
auto eigen_place = context.GetEigenDevice<Place>();
if (x_g) {
x_g->mutable_data<T>(context.GetPlace());
// eigen matrix
auto x_grad =
EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
// dimensions are same with subResult
x_grad.device(eigen_place) = grad_mat;
}
if (y_g) {
y_g->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
"First dimension of gradient must be greater or "
"equal than first dimension of target.");
if (sub_result.dimensions()[0] == y_dims[0]) {
auto y_grad =
EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
y_grad.device(eigen_place) = -1 * grad_mat;
} else {
auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
auto y_grad = EigenVector<T>::Flatten(*y_g);
y_grad.device(eigen_place) = col_sum_res;
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -26,15 +26,15 @@ class CPUUniformRandomKernel : public framework::OpKernel { ...@@ -26,15 +26,15 @@ class CPUUniformRandomKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed")); unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine; std::minstd_rand engine;
if (seed == 0) { if (seed == 0) {
seed = std::random_device()(); seed = std::random_device()();
} }
engine.seed(seed); engine.seed(seed);
std::uniform_real_distribution<T> dist( std::uniform_real_distribution<T> dist(
static_cast<T>(context.GetAttr<float>("min")), static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.GetAttr<float>("max"))); static_cast<T>(context.Attr<float>("max")));
int64_t size = framework::product(tensor->dims()); int64_t size = framework::product(tensor->dims());
for (int64_t i = 0; i < size; ++i) { for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine); data[i] = dist(engine);
...@@ -48,10 +48,10 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -48,10 +48,10 @@ class UniformRandomOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(const framework::InferShapeContext& ctx) const override { void InferShape(const framework::InferShapeContext& ctx) const override {
PADDLE_ENFORCE(GetAttr<float>("min") < GetAttr<float>("max"), PADDLE_ENFORCE(Attr<float>("min") < Attr<float>("max"),
"uniform_random's min must less then max"); "uniform_random's min must less then max");
auto* tensor = ctx.Output<framework::Tensor>("Out"); auto* tensor = ctx.Output<framework::Tensor>("Out");
auto dims = GetAttr<std::vector<int>>("dims"); auto dims = Attr<std::vector<int>>("dims");
std::vector<int64_t> temp; std::vector<int64_t> temp;
temp.reserve(dims.size()); temp.reserve(dims.size());
for (auto dim : dims) { for (auto dim : dims) {
......
...@@ -45,13 +45,13 @@ class GPUUniformRandomKernel : public framework::OpKernel { ...@@ -45,13 +45,13 @@ class GPUUniformRandomKernel : public framework::OpKernel {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed = static_cast<unsigned int>(context.GetAttr<int>("seed")); unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
if (seed == 0) { if (seed == 0) {
std::random_device rd; std::random_device rd;
seed = rd(); seed = rd();
} }
T min = static_cast<T>(context.GetAttr<float>("min")); T min = static_cast<T>(context.Attr<float>("min"));
T max = static_cast<T>(context.GetAttr<float>("max")); T max = static_cast<T>(context.Attr<float>("max"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
ssize_t N = framework::product(tensor->dims()); ssize_t N = framework::product(tensor->dims());
thrust::transform(index_sequence_begin, index_sequence_begin + N, thrust::transform(index_sequence_begin, index_sequence_begin + N,
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector>
#include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/cudnn.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
......
...@@ -31,7 +31,7 @@ limitations under the License. */ ...@@ -31,7 +31,7 @@ limitations under the License. */
namespace py = pybind11; namespace py = pybind11;
USE_OP(add_two); USE_OP(add);
USE_OP(onehot_cross_entropy); USE_OP(onehot_cross_entropy);
USE_OP(sgd); USE_OP(sgd);
USE_OP(mul); USE_OP(mul);
...@@ -50,6 +50,7 @@ USE_OP(minus); ...@@ -50,6 +50,7 @@ USE_OP(minus);
USE_OP(cos_sim); USE_OP(cos_sim);
USE_CPU_ONLY_OP(gather); USE_CPU_ONLY_OP(gather);
USE_CPU_ONLY_OP(scatter); USE_CPU_ONLY_OP(scatter);
USE_OP(squared_l2_distance);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -2,22 +2,58 @@ ...@@ -2,22 +2,58 @@
set -xe set -xe
mkdir -p /paddle/build_android BUILD_ROOT=/paddle/build_android
cd /paddle/build_android DEST_ROOT=/paddle/install
rm -rf /paddle/install 2>/dev/null || true
cmake -DCMAKE_SYSTEM_NAME=Android \ rm -rf $BUILD_ROOT 2>/dev/null || true
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ mkdir -p $BUILD_ROOT
-DANDROID_ABI=armeabi-v7a \ cd $BUILD_ROOT
-DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-DHOST_C_COMPILER=/usr/bin/gcc \ cmake -DCMAKE_SYSTEM_NAME=Android \
-DHOST_CXX_COMPILER=/usr/bin/g++ \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
-DCMAKE_INSTALL_PREFIX=/paddle/install \ -DANDROID_ABI=$ANDROID_ABI \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DANDROID_ARM_NEON=ON \
-DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \ -DANDROID_ARM_MODE=ON \
-DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \ -DHOST_C_COMPILER=/usr/bin/gcc \
-DWITH_C_API=ON \ -DHOST_CXX_COMPILER=/usr/bin/g++ \
-DWITH_SWIG_PY=OFF \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
.. -DCMAKE_BUILD_TYPE=Release \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "arm64-v8a" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM64_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
elif [ $ANDROID_ABI == "armeabi" ]; then
cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_ARM_STANDALONE_TOOLCHAIN \
-DANDROID_ABI=$ANDROID_ABI \
-DANDROID_ARM_MODE=ON \
-DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=/paddle/install \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \
..
else
echo "Invalid ANDROID_ABI: $ANDROID_ABI"
fi
make -j `nproc` make -j `nproc`
make install -j `nproc` make install -j `nproc`
...@@ -22,6 +22,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ ...@@ -22,6 +22,7 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
-DANDROID_ABI=armeabi-v7a \ -DANDROID_ABI=armeabi-v7a \
-DANDROID_ARM_NEON=ON \ -DANDROID_ARM_NEON=ON \
-DANDROID_ARM_MODE=ON \ -DANDROID_ARM_MODE=ON \
-DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \ -DWITH_STYLE_CHECK=OFF \
......
...@@ -287,6 +287,11 @@ message PadConfig { ...@@ -287,6 +287,11 @@ message PadConfig {
repeated uint32 pad_w = 4; repeated uint32 pad_w = 4;
} }
message ReshapeConfig {
repeated uint32 heightAxis = 1;
repeated uint32 widthAxis = 2;
}
message MultiBoxLossConfig { message MultiBoxLossConfig {
required uint32 num_classes = 1; required uint32 num_classes = 1;
required float overlap_threshold = 2; required float overlap_threshold = 2;
...@@ -339,7 +344,6 @@ message LayerInputConfig { ...@@ -339,7 +344,6 @@ message LayerInputConfig {
} }
message LayerConfig { message LayerConfig {
required string name = 1; required string name = 1;
required string type = 2; required string type = 2;
optional uint64 size = 3; optional uint64 size = 3;
...@@ -516,6 +520,9 @@ message LayerConfig { ...@@ -516,6 +520,9 @@ message LayerConfig {
optional double delta = 57 [ default = 1.0 ]; optional double delta = 57 [ default = 1.0 ];
optional uint64 depth = 58 [ default = 1 ]; optional uint64 depth = 58 [ default = 1 ];
// for switch order layer
optional ReshapeConfig reshape_conf = 59;
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -3670,6 +3670,15 @@ class RecurrentLayerGroup(LayerBase): ...@@ -3670,6 +3670,15 @@ class RecurrentLayerGroup(LayerBase):
name, 'recurrent_layer_group', 0, inputs=[], device=device) name, 'recurrent_layer_group', 0, inputs=[], device=device)
@config_layer('switch_order')
class SwitchOrderLayer(LayerBase):
def __init__(self, name, inputs, reshape, **xargs):
super(SwitchOrderLayer, self).__init__(
name, 'switch_order', 0, inputs=inputs, **xargs)
self.config.reshape_conf.heightAxis.extend(reshape['height'])
self.config.reshape_conf.widthAxis.extend(reshape['width'])
# Deprecated, use a new layer specific class instead # Deprecated, use a new layer specific class instead
@config_func @config_func
def Layer(name, type, **xargs): def Layer(name, type, **xargs):
......
...@@ -131,6 +131,7 @@ __all__ = [ ...@@ -131,6 +131,7 @@ __all__ = [
'row_conv_layer', 'row_conv_layer',
'dropout_layer', 'dropout_layer',
'prelu_layer', 'prelu_layer',
'switch_order_layer',
'gated_unit_layer', 'gated_unit_layer',
'crop_layer', 'crop_layer',
'sub_nested_seq_layer', 'sub_nested_seq_layer',
...@@ -239,6 +240,7 @@ class LayerType(object): ...@@ -239,6 +240,7 @@ class LayerType(object):
SMOOTH_L1 = 'smooth_l1' SMOOTH_L1 = 'smooth_l1'
PRELU = 'prelu' PRELU = 'prelu'
SWITCH_ORDER_LAYER = 'switch_order'
CROP_LAYER = 'crop' CROP_LAYER = 'crop'
SUB_NESTED_SEQ = 'sub_nested_seq' SUB_NESTED_SEQ = 'sub_nested_seq'
CLIP_LAYER = 'clip' CLIP_LAYER = 'clip'
...@@ -6404,6 +6406,48 @@ def gated_unit_layer(input, ...@@ -6404,6 +6406,48 @@ def gated_unit_layer(input,
layer_attr=layer_attr) layer_attr=layer_attr)
@layer_support()
@wrap_name_default('switch_order')
def switch_order_layer(input,
name=None,
reshape=None,
act=None,
layer_attr=None):
"""
This layer switch dimension order of image input.
From order "batchSize, channels, height, width"
to order "batchSize, height, width, channels".
The example usage is:
.. code-block:: python
reshape = {'height':[ 0, 1, 2], 'width':[3]}
switch = switch_order(input=layer, name='switch', reshape=reshape)
:param input: The input layer.
:type input: LayerOutput
:param name: Name of this layer.
:type name: basestring
:param reshape: reshape matrix by axises.
:type reshape: Dict
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput)
l = Layer(
name=name,
inputs=input.name,
reshape=reshape,
type=LayerType.SWITCH_ORDER_LAYER,
active_type=act.name,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name=name,
layer_type=LayerType.SWITCH_ORDER_LAYER,
parents=input,
size=l.config.size)
@wrap_name_default() @wrap_name_default()
@layer_support() @layer_support()
def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
......
...@@ -33,3 +33,4 @@ py_test(test_gradient_checker SRCS test_gradient_checker.py) ...@@ -33,3 +33,4 @@ py_test(test_gradient_checker SRCS test_gradient_checker.py)
py_test(test_lookup_table SRCS test_lookup_table.py) py_test(test_lookup_table SRCS test_lookup_table.py)
py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py) py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
py_test(mnist SRCS mnist.py) py_test(mnist SRCS mnist.py)
py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
...@@ -66,7 +66,7 @@ class OpTestMeta(type): ...@@ -66,7 +66,7 @@ class OpTestMeta(type):
self.assertTrue( self.assertTrue(
numpy.allclose( numpy.allclose(
actual, expect, atol=1e-05), actual, expect, atol=1e-05),
"output name: " + out_name + "has diff") "output name: " + out_name + " has diff")
obj.test_all = test_all obj.test_all = test_all
return obj return obj
...@@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase): ...@@ -11,7 +11,7 @@ class TestAddOp(unittest.TestCase):
__metaclass__ = OpTestMeta __metaclass__ = OpTestMeta
def setUp(self): def setUp(self):
self.type = "add_two" self.type = "add"
self.inputs = { self.inputs = {
'X': numpy.random.random((102, 105)).astype("float32"), 'X': numpy.random.random((102, 105)).astype("float32"),
'Y': numpy.random.random((102, 105)).astype("float32") 'Y': numpy.random.random((102, 105)).astype("float32")
......
...@@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient ...@@ -7,7 +7,7 @@ from gradient_checker import get_numeric_gradient
class GetNumericGradientTest(unittest.TestCase): class GetNumericGradientTest(unittest.TestCase):
def test_add_op(self): def test_add_op(self):
add_op = Operator('add_two', X="X", Y="Y", Out="Z") add_op = Operator('add', X="X", Y="Y", Out="Z")
x = numpy.random.random((10, 1)).astype("float32") x = numpy.random.random((10, 1)).astype("float32")
y = numpy.random.random((10, 1)).astype("float32") y = numpy.random.random((10, 1)).astype("float32")
......
...@@ -15,7 +15,7 @@ def fc(X, W, Y): ...@@ -15,7 +15,7 @@ def fc(X, W, Y):
class TestNet(unittest.TestCase): class TestNet(unittest.TestCase):
def test_net_all(self): def test_net_all(self):
net = core.Net.create() net = core.Net.create()
op1 = Operator("add_two", X="X", Y="Y", Out="Out") op1 = Operator("add", X="X", Y="Y", Out="Out")
net.append_op(op1) net.append_op(op1)
net2 = core.Net.create() net2 = core.Net.create()
...@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase): ...@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
expected = ''' expected = '''
Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
Op(add_two), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}. Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}. Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}. Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
......
...@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase): ...@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
class TestOpCreations(unittest.TestCase): class TestOpCreations(unittest.TestCase):
def test_all(self): def test_all(self):
add_op = op.Operator("add_two", X="a", Y="b", Out="z") add_op = op.Operator("add", X="a", Y="b", Out="z")
self.assertIsNotNone(add_op) self.assertIsNotNone(add_op)
# Invoke C++ DebugString() # Invoke C++ DebugString()
self.assertEqual('Op(add_two), inputs:{X[a], Y[b]}, outputs:{Out[z]}.', self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
str(add_op)) str(add_op))
......
...@@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase): ...@@ -146,7 +146,7 @@ class TestRecurrentOp(unittest.TestCase):
stepnet = core.Net.create() stepnet = core.Net.create()
x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx") x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
sum_op = Operator("add_two", X="Wx", Y="Uh", Out="sum") sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
sig_op = Operator("sigmoid", X="sum", Y="h@alias") sig_op = Operator("sigmoid", X="sum", Y="h@alias")
for op in [x_fc_op, h_fc_op, sum_op, sig_op]: for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
......
import unittest
from op_test_util import OpTestMeta
from gradient_checker import GradientChecker, create_op
import numpy as np
class TestSquaredL2DistanceOp_f0(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = 'squared_l2_distance'
self.inputs = {
'X': np.random.uniform(0.1, 1., (32, 64)).astype('float32'),
'Y': np.random.uniform(0.1, 1., (32, 64)).astype('float32')
}
sub_res = self.inputs['X'] - self.inputs['Y']
output = sub_res * sub_res
self.outputs = {
'sub_result': sub_res,
'Out': np.expand_dims(output.sum(1), 1)
}
class TestSquaredL2DistanceOp_f1(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = 'squared_l2_distance'
self.inputs = {
'X': np.random.uniform(0.1, 1., (32, 64)).astype('float32'),
'Y': np.random.uniform(0.1, 1., (1, 64)).astype('float32')
}
sub_res = self.inputs['X'] - self.inputs['Y']
output = sub_res * sub_res
self.outputs = {
'sub_result': sub_res,
'Out': np.expand_dims(output.sum(1), 1)
}
class TestSquaredL2DistanceOp_f2(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = 'squared_l2_distance'
self.inputs = {
'X': np.random.uniform(0.1, 1., (32, 64, 128)).astype('float32'),
'Y': np.random.uniform(0.1, 1., (1, 64, 128)).astype('float32')
}
sub_res = self.inputs['X'] - self.inputs['Y']
sub_res = sub_res.reshape((32, 64 * 128))
output = sub_res * sub_res
self.outputs = {
'sub_result': sub_res,
'Out': np.expand_dims(output.sum(1), 1)
}
class TestSquaredL2DistanceGradOp(GradientChecker):
def test_squared_l2_distance_b0(self):
op = create_op("squared_l2_distance")
inputs = {
'X': np.random.uniform(0.1, .6, (2, 3)).astype('float32'),
'Y': np.random.uniform(0.1, .6, (2, 3)).astype('float32')
}
self.compare_grad(op, inputs)
self.check_grad(op, inputs, set(["X", "Y"]), "Out")
def test_squared_l2_distance_b1(self):
op = create_op("squared_l2_distance")
inputs = {
'X': np.random.uniform(0.1, .6, (2, 3)).astype('float32'),
'Y': np.random.uniform(0.1, .6, (1, 3)).astype('float32')
}
self.compare_grad(op, inputs)
self.check_grad(op, inputs, set(["X", "Y"]), "Out")
def test_squared_l2_distance_b2(self):
op = create_op("squared_l2_distance")
inputs = {
'X': np.random.uniform(0.1, .6, (2, 3, 4)).astype('float32'),
'Y': np.random.uniform(0.1, .6, (1, 3, 4)).astype('float32')
}
self.compare_grad(op, inputs)
self.check_grad(op, inputs, set(["X", "Y"]), "Out")
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册