提交 abfac74c 编写于 作者: H hedaoyuan

Merge branch 'develop' of https://github.com/baidu/Paddle into conv_op

......@@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9"
GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
......
......@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
......@@ -54,7 +54,8 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -qO- ${MKLML_URL} | tar xz -C ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz
&& tar zxf ${MKLML_VER}.tgz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
......
......@@ -25,7 +25,12 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
ELSE()
SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
ENDIF()
IF(CMAKE_CROSSCOMPILING)
IF(ANDROID)
......@@ -40,11 +45,11 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.19")
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
SET(OPENBLAS_COMMIT "v0.2.19")
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
......
# Design Doc: Functions, Operators, and Layers
In a DL system, we can compose one or more fine grained operators into a coarse grained one. For example, the FC layer can be composed of a multiplication operator and an add operator.
Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers. But we need a well-defined separation.
In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
```c++
template <typename T> T add(T x, T y) { return x + y; }
template <typename T> T mul(T x, T y) { return x * y; }
```
Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation
```c++
MAKE_FUNCTION_OPERATOR(mul);
```
generates
```c++
template <typename T> class mulOp : public OperatorBase {...};
REGISTER_OP(mulOp<float32>, "mul");
```
so that in Python we can create operator mul by:
```python
X1 = Var()
X2 = Var()
Y = Var()
paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
```
Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
```c++
template <typename T>
class FCOp : public OperatorBase {
public:
void Run(...) {
add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
}
};
REGISTER_OP(FCOp, "fc");
```
We need to support such composition in Python as well. To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher level operator API should be compatible with the layer API.
Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
```python
def operator.mul(X1, X2):
O = Var()
paddle.cpp.create_operator("mul", input=[X1, X2], output=O)
return O
def operator.add(X1, X2):
O = Var()
paddle.cpp.create_operator("add", input=[X1, X2], output=O)
return O
```
Above code snippets are automatically generated. Given them, users can define
```python
def layer.fc(X):
W = Var()
b = Var()
return operator.add(operator.mul(X, W), b)
```
If we don't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be complicated:
```python
def layer.fc(X):
W = Var()
b = Var()
O1 = Var()
paddle.cpp.create_operator("mul", input=[X, W], output=O1)
O2 = Var()
paddle.cpp.create_operator("add", input=[O1, b], output=O2)
return O2
```
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
```
| C++ functions/functors | mul | add | | |
| C++ operator class | mulOp | addOp | FCOp | |
| Python binding | operator.mul | operator.add | operator.fc | |
| Python function | | | | layer.fc |
```
This is how we differentiate layer and operators in PaddlePaddle:
- those defined in C++ that have a lightweight Python wrapper in module `operators` are operators; whereas
- those that don't have a C++ implementation but have a Python implementation that composes C++ operators are known as layers.
IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has M (M<=N) instances, each corresponds to a true element in `cond`.
```python
import paddle as pd
x = var()
y = var()
cond = var()
b = pd.create_ifop(inputs=[x], output_num=1)
with b.true_block():
x = b.inputs(0)
z = operator.add(x, y)
b.set_output(0, operator.softmax(z))
out = b(cond)
```
If we want the output to still have N instances, we can use IfElseOp with a default value, whose minibatch size must be N:
```python
import paddle as pd
x = var()
y = var()
cond = var()
default_value = var()
b = pd.create_ifelseop(inputs=[x], output_num=1)
with b.true_block():
x = b.inputs(0)
z = operator.add(x, y)
b.set_output(0, operator.softmax(z))
with b.false_block():
x = b.inputs(0)
z = layer.fc(x)
b.set_output(0, operator.softmax(z))
out = b(cond)
```
If only true_block is set in an IfElseOp, we can have a default value for false as:
```python
import paddle as pd
x = var()
y = var()
cond = var()
default_value = var()
b = pd.create_ifelseop(inputs=[x], output_num=1, default_value=default_value)
with b.true_block():
x = b.inputs(0)
z = operator.add(x, y)
b.set_output(0, operator.softmax(z))
out = b(cond)
```
where default_value is a list of vars for `cond` == False.
......@@ -6,14 +6,12 @@
安装流程
++++++++
PaddlePaddle提供数个预编译的二进制来进行安装,包括Docker镜像,ubuntu的deb安装包等。我们推荐使用Docker镜像来部署环境,同时欢迎贡献更多的安装包
PaddlePaddle提供Docker镜像来部署环境
.. toctree::
:maxdepth: 1
docker_install_cn.rst
ubuntu_install_cn.rst
编译流程
......
......@@ -8,14 +8,13 @@ Install PaddlePaddle
:maxdepth: 1
docker_install_en.rst
ubuntu_install_en.rst
Build from Source
-----------------
.. warning::
Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
.. toctree::
:maxdepth: 1
......
Ubuntu部署PaddlePaddle
===================================
PaddlePaddle提供了ubuntu 14.04 deb安装包。
安装
------
安装包的下载地址是\: https://github.com/PaddlePaddle/Paddle/releases
它包含四个版本\:
* cpu版本: 支持主流x86处理器平台, 使用了avx指令集。
* cpu-noavx版本:支持主流x86处理器平台,没有使用avx指令集。
* gpu版本:支持主流x86处理器平台,支持nvidia cuda平台,使用了avx指令集。
* gpu-noavx版本:支持主流x86处理器平台,支持nvidia cuda平台,没有使用avx指令集。
下载完相关安装包后,执行:
.. code-block:: shell
sudo apt-get install gdebi
gdebi paddle-*-cpu.deb
或者:
.. code-block:: shell
dpkg -i paddle-*-cpu.deb
apt-get install -f
在 :code:`dpkg -i` 的时候如果报一些依赖未找到的错误是正常的,
在 :code:`apt-get install -f` 里会继续安装 PaddlePaddle。
安装完成后,可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
.. code-block:: shell
PaddlePaddle 0.8.0b1, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
with_predict_sdk:
可能遇到的问题
--------------
libcudart.so/libcudnn.so找不到
++++++++++++++++++++++++++++++
安装完成后,运行 :code:`paddle train` 报错\:
.. code-block:: shell
0831 12:36:04.151525 1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle,请安装CUDA 7.5 和CUDNN 5到本地环境中,并设置:
.. code-block:: shell
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
Debian Package installation guide
=================================
PaddlePaddle supports the :code:`deb` package. The installation of this :code:`deb` package is tested on Ubuntu 14.04, but it should also support other Debian-based Linux distributions.
There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/
After downloading the PaddlePaddle deb packages, you can use :code:`gdebi` to install them.
.. code-block:: bash
gdebi paddle-*.deb
If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
Or you can use the following commands to install PaddlePaddle.
.. code-block:: bash
dpkg -i paddle-*.deb
apt-get install -f
If you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set the related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report errors; `apt-get install -f` will then continue installing paddle and install its dependencies.
......@@ -178,13 +178,13 @@ class MulKernel : public framework::OpKernel {
```c++
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker``ops::MulOpMaker`注册`ops::MulOpGrad`,类型名为`mul_grad`
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker`为`ops::MulOpMaker`,并且注册`ops::MulOpGrad`为其反向Op。
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。
......
......@@ -173,6 +173,96 @@ extern void hl_avgpool_backward(const int frameCnt,
real* backGrad,
const int outStride);
/**
 * @brief   3D max pooling, forward pass (declaration only).
 *
 * NOTE(review): the definition is not visible from this header. Judging by the
 * parameter names, depth/height/width describe the input volume,
 * pooledD/pooledH/pooledW the pooled output volume, sizeZ/sizeY/sizeX the
 * pooling window, and strideD/H/W and paddingD/H/W the per-axis step and
 * padding; maxPoolIdxData presumably receives the position of each selected
 * maximum. Confirm against the implementation before relying on this.
 */
extern void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
real* maxPoolIdxData,
const int tgtStride);
/**
 * @brief   3D max pooling, backward pass (declaration only).
 *
 * NOTE(review): the definition is not visible from this header. The geometry
 * parameters mirror hl_maxpool3D_forward. scaleA/scaleB look like blending
 * factors for the new and the existing gradient, and maxPoolIdxData presumably
 * holds the indices recorded by the forward pass -- confirm against the
 * implementation.
 */
extern void hl_maxpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* targetGrad,
real* maxPoolIdxData,
const int outStride);
/**
 * @brief   3D average pooling, forward pass (declaration only).
 *
 * NOTE(review): the definition is not visible from this header. The parameter
 * list matches hl_maxpool3D_forward except that no index buffer is taken;
 * the meaning of each geometry parameter is inferred from its name and should
 * be confirmed against the implementation.
 */
extern void hl_avgpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride);
/**
 * @brief   3D average pooling, backward pass (declaration only).
 *
 * The padding parameters are declared `const int` so that this prototype reads
 * the same as the other three pool3D prototypes and as the stub version of
 * this function. Top-level const on a by-value parameter does not change the
 * function's type, so existing callers and definitions are unaffected.
 *
 * NOTE(review): the definition is not visible from this header. The geometry
 * parameters mirror hl_avgpool3D_forward; scaleA/scaleB look like blending
 * factors for the new and the existing gradient -- confirm against the
 * implementation.
 */
extern void hl_avgpool3D_backward(const int frameCnt,
                                  const real* outGrad,
                                  const int channels,
                                  const int depth,
                                  const int height,
                                  const int width,
                                  const int pooledD,
                                  const int pooledH,
                                  const int pooledW,
                                  const int sizeZ,
                                  const int sizeY,
                                  const int sizeX,
                                  const int strideD,
                                  const int strideH,
                                  const int strideW,
                                  const int paddingD,
                                  const int paddingH,
                                  const int paddingW,
                                  real scaleA,
                                  real scaleB,
                                  real* backGrad,
                                  const int outStride);
/**
* @brief Bilinear interpolation forward.
*
......@@ -275,4 +365,4 @@ extern void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t groups);
#endif /* HL_CNN_H_ */
#endif // HL_CNN_H_
......@@ -224,4 +224,80 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
extern void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise);
/**
 * @brief   Matrix vol2Col: Convert 3D volume into col matrix
 *
 * @param[in]   dataSrc     input volume data.
 * @param[in]   channels    number of channels of dataSrc.
 * @param[in]   depth       depth of dataSrc.
 * @param[in]   height      height of dataSrc.
 * @param[in]   width       width of dataSrc.
 * @param[in]   filterD     depth of filter.
 * @param[in]   filterH     height of filter.
 * @param[in]   filterW     width of filter.
 * @param[in]   strideD     stride in the depth.
 * @param[in]   strideH     stride in the height.
 * @param[in]   strideW     stride in the width.
 * @param[in]   paddingD    padding in the depth.
 * @param[in]   paddingH    padding in the height.
 * @param[in]   paddingW    padding in the width.
 * @param[out]  dataDst     output col matrix.
 *
 */
extern void hl_matrix_vol2Col(const real* dataSrc,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
real* dataDst);
/**
 * @brief   Matrix col2Vol: Convert col matrix into 3D volume
 *
 * In the GPU implementation each element of dataDst is updated as
 * dataDst = alpha * (sum of the matching dataSrc entries) + beta * dataDst.
 *
 * @param[in,out]  dataDst     output volume; its previous contents are
 *                             scaled by beta.
 * @param[in]      channels    number of channels of dataDst.
 * @param[in]      depth       depth of dataDst.
 * @param[in]      height      height of dataDst.
 * @param[in]      width       width of dataDst.
 * @param[in]      filterD     depth of filter.
 * @param[in]      filterH     height of filter.
 * @param[in]      filterW     width of filter.
 * @param[in]      strideD     stride in the depth.
 * @param[in]      strideH     stride in the height.
 * @param[in]      strideW     stride in the width.
 * @param[in]      paddingD    padding in the depth.
 * @param[in]      paddingH    padding in the height.
 * @param[in]      paddingW    padding in the width.
 * @param[in]      dataSrc     input col matrix.
 * @param[in]      alpha       scale applied to the sum gathered from dataSrc.
 * @param[in]      beta        scale applied to the existing dataDst value.
 *
 */
extern void hl_matrix_col2Vol(real* dataDst,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
const real* dataSrc,
real alpha,
real beta);
#endif /* HL_MATRIX_H_ */
......@@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt,
real* backGrad,
const int outStride) {}
/**
 * Stub of hl_maxpool3D_forward: takes the full argument list and does nothing
 * (the body is empty).
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_maxpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
real* maxPoolIdxData,
const int tgtStride) {}
/**
 * Stub of hl_maxpool3D_backward: takes the full argument list and does nothing
 * (the body is empty).
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_maxpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* targetGrad,
real* maxPoolIdxData,
const int outStride) {}
/**
 * Stub of hl_avgpool3D_forward: takes the full argument list and does nothing
 * (the body is empty).
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_avgpool3D_forward(const int frameCnt,
const real* inputData,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {}
/**
 * Stub of hl_avgpool3D_backward: takes the full argument list and does nothing
 * (the body is empty).
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_avgpool3D_backward(const int frameCnt,
const real* outGrad,
const int channels,
const int depth,
const int height,
const int width,
const int pooledD,
const int pooledH,
const int pooledW,
const int sizeZ,
const int sizeY,
const int sizeX,
const int strideD,
const int strideH,
const int strideW,
const int paddingD,
const int paddingH,
const int paddingW,
real scaleA,
real scaleB,
real* backGrad,
const int outStride) {}
inline void hl_bilinear_forward(const real* inData,
const size_t inImgH,
const size_t inImgW,
......
......@@ -99,4 +99,38 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
inline void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
/**
 * Stub of hl_matrix_vol2Col: takes the full argument list and does nothing
 * (the body is empty), so dataDst is left untouched.
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_matrix_vol2Col(const real* dataSrc,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
real* dataDst) {}
/**
 * Stub of hl_matrix_col2Vol: takes the full argument list and does nothing
 * (the body is empty), so dataDst is left untouched.
 * NOTE(review): presumably compiled in place of the GPU implementation in
 * builds without CUDA -- confirm.
 */
inline void hl_matrix_col2Vol(real* dataDst,
int channels,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
const real* dataSrc,
real alpha,
real beta) {}
#endif // HL_MATRIX_STUB_H_
此差异已折叠。
......@@ -592,3 +592,204 @@ void hl_matrix_rotate(
mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
/**
 * Kernel: unfold a 3D volume into a col matrix.
 *
 * Each loop index identifies one (channel, d_out, h_out, w_out) output
 * position; for that position the kernel copies the
 * filterD x filterH x filterW window of dataSrc into dataDst, writing 0 for
 * window cells that fall outside the volume (i.e. into the padding).
 *
 * The loop is a grid-stride loop, so one thread may process several indices.
 * The source/destination cursors are therefore per-iteration locals: the
 * kernel arguments dataSrc/dataDst are never modified, which keeps every
 * iteration addressed from the true base pointers.
 *
 * @param num_kernels  number of output positions
 *                     (channels * depth_col * height_col * width_col in the
 *                     host launcher).
 * @param dataSrc      input volume.
 * @param dataDst      output col matrix.
 * @param depth,height,width             extent of the input volume.
 * @param filterD,filterH,filterW        extent of the filter window.
 * @param strideD,strideH,strideW        step between windows on each axis.
 * @param paddingD,paddingH,paddingW     padding on each axis.
 * @param depth_col,height_col,width_col number of window positions per axis.
 */
__global__ void keMatrixVol2Col(int num_kernels,
                                const real* dataSrc,
                                real* dataDst,
                                int depth,
                                int height,
                                int width,
                                int filterD,
                                int filterH,
                                int filterW,
                                int strideD,
                                int strideH,
                                int strideW,
                                int paddingD,
                                int paddingH,
                                int paddingW,
                                int depth_col,
                                int height_col,
                                int width_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
       index += blockDim.x * gridDim.x) {
    // Decompose the flat index into output coordinates.
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int d_out = (index / width_col / height_col) % depth_col;
    int channel_in = index / width_col / height_col / depth_col;
    int channel_out = channel_in * filterD * filterH * filterW;

    // Top-left-front corner of the window in input coordinates; negative
    // values mean the window starts inside the padding.
    int w_in = w_out * strideW - paddingW;
    int h_in = h_out * strideH - paddingH;
    int d_in = d_out * strideD - paddingD;

    // Local cursors, rebuilt from the base pointers on every iteration.
    real* dst =
        dataDst +
        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
        w_out;
    const real* src =
        dataSrc + ((channel_in * depth + d_in) * height + h_in) * width + w_in;

    for (int k = 0; k < filterD; ++k) {
      for (int i = 0; i < filterH; ++i) {
        for (int j = 0; j < filterW; ++j) {
          int d = d_in + k;
          int h = h_in + i;
          int w = w_in + j;
          // src is only dereferenced when the cell lies inside the volume.
          *dst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
                  w < width)
                     ? src[(k * height + i) * width + j]
                     : 0;
          // Consecutive window cells are one full output plane apart.
          dst += depth_col * height_col * width_col;
        }
      }
    }
  }
}
/**
 * Host launcher for keMatrixVol2Col.
 *
 * Derives the number of window positions along each axis from the volume
 * size, padding, filter size and stride, launches one work item per
 * (channel, position) with 512 threads per block on STREAM_DEFAULT, and then
 * checks the launch with CHECK_SYNC.
 */
void hl_matrix_vol2Col(const real* dataSrc,
                       int channels,
                       int depth,
                       int height,
                       int width,
                       int filterD,
                       int filterH,
                       int filterW,
                       int strideD,
                       int strideH,
                       int strideW,
                       int paddingD,
                       int paddingH,
                       int paddingW,
                       real* dataDst) {
  // Number of filter placements along each axis.
  const int colDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
  const int colHeight = (height + 2 * paddingH - filterH) / strideH + 1;
  const int colWidth = (width + 2 * paddingW - filterW) / strideW + 1;

  // One work item per channel and placement.
  const int workItems = channels * colDepth * colHeight * colWidth;

  const int blockSize = 512;
  const int gridSize = DIVUP(workItems, blockSize);

  keMatrixVol2Col<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(workItems,
                                                              dataSrc,
                                                              dataDst,
                                                              depth,
                                                              height,
                                                              width,
                                                              filterD,
                                                              filterH,
                                                              filterW,
                                                              strideD,
                                                              strideH,
                                                              strideW,
                                                              paddingD,
                                                              paddingH,
                                                              paddingW,
                                                              colDepth,
                                                              colHeight,
                                                              colWidth);
  CHECK_SYNC("hl_matrix_vol2Col failed");
}
/**
 * Kernel: fold a col matrix back into a 3D volume.
 *
 * Each loop index addresses one element of dataDst. The kernel sums every
 * dataSrc entry that maps onto that element and stores
 * alpha * sum + beta * (previous dataDst value).
 *
 * num_kernels is the number of dataDst elements to process
 * (channels * depth * height * width in the host launcher).
 */
__global__ void keMatrixCol2Vol(int num_kernels,
real* dataDst,
const real* dataSrc,
int depth,
int height,
int width,
int filterD,
int filterH,
int filterW,
int strideD,
int strideH,
int strideW,
int paddingD,
int paddingH,
int paddingW,
int depth_col,
int height_col,
int width_col,
real alpha,
real beta) {
// Loop with a step of blockDim.x * gridDim.x, so a thread may handle several
// indices.
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
index += blockDim.x * gridDim.x) {
real srcVal = 0;
// Previous destination value, blended in with weight beta below.
real dstVal = dataDst[index];
// Coordinates of this element, shifted by the padding on each axis.
int w = index % width + paddingW;
int h = (index / width) % height + paddingH;
int d = (index / width / height) % depth + paddingD;
int c = index / width / height / depth;
// Range [start, end) of col positions on each axis whose filter window covers
// this element; the end is clamped to the number of col positions.
int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
int w_col_end = min(w / strideW + 1, width_col);
int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
int h_col_end = min(h / strideH + 1, height_col);
int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
int d_col_end = min(d / strideD + 1, depth_col);
// dataSrc index as if col position (0, 0, 0) covered this element; the
// coeff_* terms below correct it for the actual (d_col, h_col, w_col).
int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
h * filterW + w) *
depth_col * height_col * width_col;
// Change of the dataSrc index per unit step of d_col / h_col / w_col: moving
// the window by one position both advances the col position and shifts the
// element's offset inside the window by the stride.
int coeff_d_col =
(1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
int coeff_h_col =
(1 - strideH * filterW * depth_col * height_col) * width_col;
int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
w_col * coeff_w_col];
}
}
}
dataDst[index] = alpha * srcVal + beta * dstVal;
}
}
/**
 * Host launcher for keMatrixCol2Vol.
 *
 * Derives the number of window positions along each axis from the volume
 * size, padding, filter size and stride, launches one work item per element
 * of the destination volume with 512 threads per block on STREAM_DEFAULT, and
 * then checks the launch with CHECK_SYNC.
 */
void hl_matrix_col2Vol(real* dataDst,
                       int channels,
                       int depth,
                       int height,
                       int width,
                       int filterD,
                       int filterH,
                       int filterW,
                       int strideD,
                       int strideH,
                       int strideW,
                       int paddingD,
                       int paddingH,
                       int paddingW,
                       const real* dataSrc,
                       real alpha,
                       real beta) {
  // Number of filter placements along each axis.
  const int colDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
  const int colHeight = (height + 2 * paddingH - filterH) / strideH + 1;
  const int colWidth = (width + 2 * paddingW - filterW) / strideW + 1;

  // One work item per element of the destination volume.
  const int workItems = channels * depth * height * width;

  const int blockSize = 512;
  const int gridSize = DIVUP(workItems, blockSize);

  keMatrixCol2Vol<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(workItems,
                                                              dataDst,
                                                              dataSrc,
                                                              depth,
                                                              height,
                                                              width,
                                                              filterD,
                                                              filterH,
                                                              filterW,
                                                              strideD,
                                                              strideH,
                                                              strideW,
                                                              paddingD,
                                                              paddingH,
                                                              paddingW,
                                                              colDepth,
                                                              colHeight,
                                                              colWidth,
                                                              alpha,
                                                              beta);
  CHECK_SYNC("hl_matrix_col2Vol failed");
}
......@@ -18,7 +18,7 @@ A backward network is built up with several backward operators. Backward operato
For example, we have got a `mul_op`, and we can register its information and corresponding backward operator by the following macro:
```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
REGISTER_OP(mul, MulOp, MulOpMaker, MulOpGrad);
```
`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
......
......@@ -127,8 +127,8 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
public:
FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("x", "x");
AddOutput("out", "out");
AddInput("Src", "x");
AddOutput("Dst", "out");
AddComment("");
}
};
......@@ -138,7 +138,7 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x").AsDuplicable();
AddOutput("Y", "y");
AddOutput("Out", "out");
AddComment("");
}
};
......@@ -148,16 +148,14 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
namespace f = paddle::framework;
namespace ops = paddle::operators;
using EnforceNotMet = paddle::platform::EnforceNotMet;
REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, f::NOP);
REGISTER_OP(mul, f::NOP, f::MulOpMaker, f::NOP);
REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
REGISTER_OP(add, f::NOP, f::AddOpMaker, f::NOP);
REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
f::NOP);
REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, f::NOP);
TEST(Backward, simple_op_grad) {
auto fwd = f::OpRegistry::CreateOp(
......
......@@ -54,8 +54,8 @@ TEST(GradOpBuilder, AddTwo) {
EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
}
REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, f::NOP);
REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, f::NOP);
TEST(GradOpBuilder, MutiInOut) {
std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
......
......@@ -19,25 +19,24 @@
namespace paddle {
namespace framework {
LODTensor::LOD LODTensor::LOD::SliceLevels(size_t level_begin,
size_t level_end) const {
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end) {
LOD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(at(i));
new_lod.emplace_back(in.at(i));
}
return new_lod;
}
LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
size_t elem_end) const {
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
size_t elem_end) {
// slice the lod.
LOD new_lod;
new_lod.reserve(size() - level);
auto start = this->at(level)[elem_begin];
auto end = this->at(level)[elem_end];
new_lod.reserve(in.size() - level);
auto start = in.at(level)[elem_begin];
auto end = in.at(level)[elem_end];
for (auto it = this->begin() + level; it != this->end(); it++) {
for (auto it = in.begin() + level; it != in.end(); it++) {
auto it_begin = std::find(it->begin(), it->end(), start);
auto it_end = std::find(it_begin, it->end(), end);
PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
......@@ -49,11 +48,11 @@ LODTensor::LOD LODTensor::LOD::SliceInLevel(size_t level, size_t elem_begin,
[start](int v) { return v - start; });
PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LOD");
}
PADDLE_ENFORCE_LE(new_lod.size(), this->size());
PADDLE_ENFORCE_LE(new_lod.size(), in.size());
return new_lod;
}
bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
bool operator==(const LOD& a, const LOD& b) {
if (a.size() != b.size()) {
return false;
}
......@@ -70,9 +69,27 @@ bool operator==(const LODTensor::LOD& a, const LODTensor::LOD& b) {
}
}
}
return true;
}
// Replaces this tensor's LOD with the levels [level_begin, level_end) of the
// current LOD, as computed by the free function framework::SliceLevels.
void LODTensor::SliceLevels(size_t level_begin, size_t level_end) {
  lod_ = framework::SliceLevels(lod_, level_begin, level_end);
}
// Replaces this tensor's LOD with the slice [elem_begin, elem_end) of the
// given level, as computed by the free function framework::SliceInLevel.
// The level and both element bounds are validated first; elem_end may equal
// the element count of the level.
void LODTensor::SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) {
  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
                 NumLevels());

  const size_t elem_count = NumElements(level);
  PADDLE_ENFORCE(elem_begin < elem_count,
                 "element begin [%d] out of range [%d]", elem_begin,
                 elem_count);
  PADDLE_ENFORCE(elem_end < elem_count + 1,
                 "element end [%d] out of range [%d]", elem_end, elem_count);

  lod_ = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
}
} // namespace framework
} // namespace paddle
......@@ -15,7 +15,7 @@
#pragma once
#include <memory>
#if !defined(PADDLE_ONLY_CPU)
#ifndef PADDLE_ONLY_CPU
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#endif
......@@ -27,33 +27,39 @@
namespace paddle {
namespace framework {
#ifdef PADDLE_ONLY_CPU
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<T>;
#endif
using LOD = std::vector<Vector<size_t>>;
LOD SliceLevels(const LOD& in, size_t level_begin, size_t level_end);
LOD SliceInLevel(const LOD& in, size_t level, size_t elem_begin,
size_t elem_end);
bool operator==(const LOD& a, const LOD& b);
/*
* LODTensor (Level of details Tensor)
* see https://en.wikipedia.org/wiki/Level_of_details for reference.
*/
class LODTensor : public Tensor {
class LODTensor {
public:
// Level save offsets of each unit.
#ifdef PADDLE_ONLY_CPU
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<T>;
#endif
// LoD stores offsets of each level of units, the largest units level first,
// then the smaller units level. Each Level stores the offsets of units in
// Tensor.
class LOD : public std::vector<Vector<size_t>> {
public:
LOD SliceLevels(size_t level_begin, size_t level_end) const;
LOD SliceInLevel(size_t level, size_t elem_begin, size_t elem_end) const;
};
LODTensor() {}
explicit LODTensor(const LOD &lod) : lod_(lod) {}
LODTensor(const LOD& lod, Tensor* t) : lod_(lod), tensor_(t) {}
void set_lod(const LOD& lod) { lod_ = lod; }
virtual Tensor *Clone() const { return new LODTensor(lod_); }
void set_tensor(Tensor* tensor) { tensor_ = tensor; }
Tensor& tensor() { return *tensor_; }
LOD lod() { return lod_; }
/*
* Get a element from LOD.
......@@ -79,71 +85,23 @@ class LODTensor : public Tensor {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
NumLevels());
// the last offset is the end of last element
return lod_[level].size() - 1;
return (lod_)[level].size() - 1;
}
/*
* Slice of levels[level_begin:level_end], with tensor shared.
* Slice of levels[level_begin:level_end]
*/
template <typename T>
LODTensor SliceLevels(size_t level_begin, size_t level_end) const;
void SliceLevels(size_t level_begin, size_t level_end);
/*
* Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
* Slice of elements of a level, [elem_begin: elem_end]
* @note: low performance in slice lod_.
*/
template <typename T>
LODTensor SliceInLevel(size_t level, size_t elem_begin,
size_t elem_end) const;
/*
* Copy other's lod_'s content, free to mutate.
*/
void CopyLOD(const LODTensor &other) { lod_ = other.lod_; }
/*
* Determine whether LODTensor has a valid LOD info.
*/
const LOD &lod() const { return lod_; }
LOD *mutable_lod() { return &lod_; }
virtual ~LODTensor() {}
void SliceInLevel(size_t level, size_t elem_begin, size_t elem_end);
private:
LOD lod_;
Tensor* tensor_; // not owned
};
bool operator==(const LODTensor::LOD &a, const LODTensor::LOD &b);
template <typename T>
LODTensor LODTensor::SliceLevels(size_t level_begin, size_t level_end) const {
auto new_lod = lod_.SliceLevels(level_begin, level_end);
// slice levels just need to update LOD info, each level will contains the
// whole tensor_, so no need to modify tensor_.
LODTensor new_tensor(new_lod);
new_tensor.ShareDataWith<T>(*this);
return new_tensor;
}
template <typename T>
LODTensor LODTensor::SliceInLevel(size_t level, size_t elem_begin,
size_t elem_end) const {
PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
NumLevels());
PADDLE_ENFORCE(elem_begin < NumElements(level),
"element begin [%d] out of range [%d]", elem_begin,
NumElements(level));
PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
"element end [%d] out of range [%d]", elem_end,
NumElements(level));
auto new_lod = lod_.SliceInLevel(level, elem_begin, elem_end);
// slice elements just need to update LOD info, because offsets are not
// changed, so the original tensor_ can be reused.
LODTensor new_tensor(new_lod);
new_tensor.ShareDataWith<T>(*this);
return new_tensor;
}
} // namespace framework
} // namespace paddle
# Design Doc: LoD (Level-of-Detail) Tensor
PaddlePaddle's RNN doesn't require that all instances have the same length. To do so, we introduce an extension to Tensor, namely, LoD Tensor.
## Challenge of Variable-length Inputs
People usually represent a mini-batch by a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor. So a transformation, T, of all images can be a matrix multiplication of the 32x32xO-dimensional tensor T and the 10x32x32 Tensor.
Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector. If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor. However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
## LoD as a Solution
### Mini-Batch of variable-length sentences
Let's imagine a mini-batch of 3 variable-length sentences, containing 3, 1, and 2 words respectively. We can represent it by a (3+1+2)xD tensor plus some index information:
```
3
3 1 2
||| | ||
```
Each `|` represents a D-dimensional word vector. The number 3 on top indicates 3 sentences, and the numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
### Mini-Batch of variable-length videos
This approach generalizes to the case where elements are not words, but higher dimensional objects, like images. Suppose that a mini-batch contains videos of the same frame size 640x480. If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively, the underlying tensor is of size (3+1+2)x640x480. The index information is illustrated as:
```
3
3 1 2
口口口 口 口口
```
where each `口` represents an image.
### Mini-Batch of fixed-size images
Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images. The LoD Tensor representation is
```
M
1 1 1 1 1
口口口口 ... 口
```
The many 1's on the second level seem redundant. For this particular case of 2 levels where every element of the second level has length 1, we can ignore the LoD index.
### Design and summarization
In summary, as long as the essential elements (words or images) have the same size, we can represent mini-batches by a LoD Tensor:
- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
- the first dimension size L has an additional property -- a LoD index as a nested vector:
```c++
typedef std::vector<std::vector<int> > LoD;
```
- The LoD index is not necessary when there are only two levels and all elements of the second level have length 1.
## Slicing of LoD Tensor
Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words. This network requires that mini-batches be represented by a 4-level LoD Tensor, for example,
```
3
3 1 2
3 2 4 1 2 3
||| || |||| | || |||
```
To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor as getting the j-th sequence on level i, or the <i,j>-slice**.
For example, the <2,1>-slice of the above example is
```
2
||
```
and the <1,2>-slice of above example is
```
2
2 3
|| |||
```
Let's go on slicing this slice. Its <1,1>-slice is
```
3
|||
```
### The General Slicing Algorithm
The algorithm, with over-simplified data structure, is defined as
```c++
typedef vector<vector<int> > LoD;
struct LoDTensor {
LoD lod_;
float* tensor_;
};
LoDTensor Slice(const LoDTensor& lodt, int level, int sequence) {
}
```
### Slicing the Top Level
Please be aware that an RNN operator only slices the top level of a LoD Tensor to get the step inputs.
```c++
LoDTensor Slice(const LoDTensor& lodt, int sequence) {
}
```
......@@ -24,13 +24,12 @@ namespace framework {
class LODTensorTester : public ::testing::Test {
public:
virtual void SetUp() override {
lod_tensor.reset(new LODTensor);
// tensor's batch_size: 30
// 3 levels
// 0 10 20
// 0 5 10 15 20
// 0 2 5 7 10 12 15 20
LODTensor::LOD lod;
LOD lod;
lod.push_back(std::vector<size_t>{0, 10, 20});
lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
......@@ -41,75 +40,65 @@ class LODTensorTester : public ::testing::Test {
// malloc memory
tensor.mutable_data<float>(place);
lod_tensor.reset(new LODTensor(lod));
lod_tensor->Resize({20 /*batch size*/, 128 /*dim*/});
lod_tensor->ShareDataWith<float>(tensor);
// lod_tensor->ShareDataWith<Tensor>(tensor);
lod_tensor.set_lod(lod);
lod_tensor.set_tensor(&tensor);
}
protected:
std::unique_ptr<LODTensor> lod_tensor;
platform::CPUPlace place;
Tensor tensor;
LODTensor lod_tensor;
};
TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }
TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor.NumLevels(), 3UL); }
TEST_F(LODTensorTester, NumElements) {
ASSERT_EQ(lod_tensor->NumElements(0), 2UL);
ASSERT_EQ(lod_tensor->NumElements(1), 4UL);
ASSERT_EQ(lod_tensor->NumElements(2), 8UL);
ASSERT_EQ(lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(lod_tensor.NumElements(2), 8UL);
}
TEST_F(LODTensorTester, SliceLevels) {
// slice 1 level
for (size_t level = 0; level < 3UL; ++level) {
auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 1);
LODTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
// ASSERT_EQ(new_lod_tensor, *lod_tensor);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
lod_tensor.tensor().data<float>());
}
// slice 2 level
for (size_t level = 0; level < 2UL; ++level) {
auto new_lod_tensor = lod_tensor->SliceLevels<float>(level, level + 2);
LODTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceLevels(level, level + 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
ASSERT_EQ(new_lod_tensor.NumElements(1),
lod_tensor->NumElements(level + 1));
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor.NumElements(level));
ASSERT_EQ(new_lod_tensor.NumElements(1), lod_tensor.NumElements(level + 1));
ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
lod_tensor.tensor().data<float>());
}
}
TEST_F(LODTensorTester, SliceInLevel) {
size_t level = 0;
auto new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
LODTensor new_lod_tensor = lod_tensor;
new_lod_tensor.SliceInLevel(level, 0, 2);
EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
lod_tensor.tensor().data<float>());
level = 1;
new_lod_tensor = lod_tensor->SliceInLevel<float>(level, 0, 2);
new_lod_tensor = lod_tensor;
new_lod_tensor.SliceInLevel(level, 0, 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor->data<float>());
}
TEST_F(LODTensorTester, ShareLOD) {
LODTensor new_lod_tensor;
new_lod_tensor.CopyLOD(*lod_tensor);
ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());
}
TEST_F(LODTensorTester, CopyLOD) {
LODTensor new_lod_tensor;
new_lod_tensor.CopyLOD(*lod_tensor);
bool equals = std::equal(lod_tensor->lod().begin(), lod_tensor->lod().end(),
new_lod_tensor.lod().begin());
ASSERT_TRUE(equals);
ASSERT_EQ(new_lod_tensor.tensor().data<float>(),
lod_tensor.tensor().data<float>());
}
} // namespace framework
......
......@@ -80,9 +80,19 @@ class OpInfoMap {
}
const OpInfo& Get(const std::string& type) const {
auto op_info_ptr = GetNullable(type);
PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
type);
return *op_info_ptr;
}
const OpInfo* GetNullable(const std::string& type) const {
auto it = map_.find(type);
PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
return it->second;
if (it == map_.end()) {
return nullptr;
} else {
return &it->second;
}
}
template <typename Callback>
......
......@@ -33,8 +33,7 @@ namespace framework {
class OpRegistry {
public:
template <typename OpType, typename ProtoMakerType, typename GradOpType>
static void RegisterOp(const std::string& op_type,
const std::string& grad_op_type) {
static void RegisterOp(const std::string& op_type) {
PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
"'%s' is registered more than once.", op_type);
OpInfo op_info;
......@@ -43,9 +42,9 @@ class OpRegistry {
const VariableNameMap& outputs, const AttributeMap& attrs) {
return new OpType(type, inputs, outputs, attrs);
};
op_info.grad_op_type_ = grad_op_type;
if (std::type_index(typeid(ProtoMakerType)) !=
std::type_index(typeid(NOPMaker))) {
op_info.grad_op_type_ = op_type + "_grad";
op_info.proto_ = new OpProto;
op_info.checker_ = new OpAttrChecker;
auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
......@@ -55,15 +54,14 @@ class OpRegistry {
op_info.proto_->IsInitialized(),
"Fail to initialize %s's OpProto, because %s is not initialized",
op_type, op_info.proto_->InitializationErrorString());
// register gradient op
RegisterOp<GradOpType, NOPMaker, NOP>(op_info.grad_op_type_);
} else {
op_info.grad_op_type_ = "";
op_info.proto_ = nullptr;
op_info.checker_ = nullptr;
}
OpInfoMap::Instance().Insert(op_type, op_info);
// register gradient op
if (!grad_op_type.empty()) {
RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
}
}
static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
......@@ -92,10 +90,8 @@ class Registrar {
template <typename OpType, typename ProtoMakerType, typename GradOpType>
class OpRegistrar : public Registrar {
public:
explicit OpRegistrar(const char* op_type) { OpRegistrar(op_type, ""); }
OpRegistrar(const char* op_type, const char* grad_op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type,
grad_op_type);
explicit OpRegistrar(const char* op_type) {
OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type);
}
};
......@@ -121,8 +117,7 @@ class OpKernelRegistrar : public Registrar {
/**
* Macro to register Operator.
*/
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_class) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
class _OpClass_##op_type##_ : public op_class { \
......@@ -137,14 +132,14 @@ class OpKernelRegistrar : public Registrar {
}; \
static ::paddle::framework::OpRegistrar< \
_OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_> \
__op_registrar_##op_type##__(#op_type, #grad_op_type); \
__op_registrar_##op_type##__(#op_type); \
int TouchOpRegistrar_##op_type() { \
__op_registrar_##op_type##__.Touch(); \
return 0; \
}
#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
REGISTER_OP(op_type, op_class, op_maker_class, ::paddle::framework::NOP)
/**
* Macro to register OperatorKernel.
......
......@@ -33,12 +33,12 @@ ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
}
#endif
const std::string& OperatorBase::Input(const std::string& name) const {
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_EQ(ins.size(), 1UL,
PADDLE_ENFORCE_LE(ins.size(), 1UL,
"Op %s input %s should contain only one variable", type_,
name);
return ins[0];
return ins.empty() ? kEmptyVarName : ins[0];
}
const std::vector<std::string>& OperatorBase::Inputs(
......@@ -49,12 +49,12 @@ const std::vector<std::string>& OperatorBase::Inputs(
return it->second;
}
const std::string& OperatorBase::Output(const std::string& name) const {
std::string OperatorBase::Output(const std::string& name) const {
auto& outs = Outputs(name);
PADDLE_ENFORCE_EQ(outs.size(), 1UL,
PADDLE_ENFORCE_LE(outs.size(), 1UL,
"Op %s output %s should contain only one variable", type_,
name);
return outs[0];
return outs.empty() ? kEmptyVarName : outs[0];
}
const std::vector<std::string>& OperatorBase::Outputs(
......@@ -119,16 +119,8 @@ OperatorBase::OperatorBase(const std::string& type,
const VariableNameMap& outputs,
const AttributeMap& attrs)
: type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
static std::atomic<size_t> gUniqId(0UL);
for (auto& output : outputs_) {
for (auto& output_name : output.second) {
if (output_name == kTempVarName) {
output_name += type_;
output_name += "@";
output_name += std::to_string(gUniqId.fetch_add(1));
}
}
}
GenerateTemporaryNames();
CheckAllInputOutputSet();
}
std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
......@@ -156,6 +148,35 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
return ret_val;
}
// Verifies that every input and output slot declared in this operator's
// registered OpProto has an entry in inputs_ / outputs_.
//
// Nothing is checked when the operator type is not found in OpInfoMap or when
// its OpInfo carries no proto. A missing slot is reported through
// PADDLE_ENFORCE.
void OperatorBase::CheckAllInputOutputSet() const {
  const auto* registered = OpInfoMap::Instance().GetNullable(Type());
  const bool has_proto = registered != nullptr && registered->proto_ != nullptr;
  if (!has_proto) {
    return;
  }

  // Every declared input slot must be present as a key of inputs_.
  for (auto& declared_in : registered->Proto().inputs()) {
    const bool is_set = inputs_.find(declared_in.name()) != inputs_.end();
    PADDLE_ENFORCE(is_set, "Type %s's input %s is not set", Type(),
                   declared_in.name());
  }

  // Every declared output slot must be present as a key of outputs_.
  for (auto& declared_out : registered->Proto().outputs()) {
    const bool is_set = outputs_.find(declared_out.name()) != outputs_.end();
    PADDLE_ENFORCE(is_set, "Type %s's output %s is not set", Type(),
                   declared_out.name());
  }
}
void OperatorBase::GenerateTemporaryNames() {
static std::atomic<size_t> gUniqId(0UL);
for (auto& output : outputs_) {
for (auto& output_name : output.second) {
if (output_name == kTempVarName) {
output_name += type_;
output_name += "@";
output_name += std::to_string(gUniqId.fetch_add(1));
}
}
}
}
void OpProtoAndCheckerMaker::Validate() {
validated_ = true;
CheckNoDuplicatedInOutAttrs();
......
......@@ -95,12 +95,12 @@ class OperatorBase {
const VariableNameMap& Inputs() const { return inputs_; }
const VariableNameMap& Outputs() const { return outputs_; }
//! Get a input with argument's name described in `op_proto`
const std::string& Input(const std::string& name) const;
std::string Input(const std::string& name) const;
//! Get a input which has multiple variables.
const std::vector<std::string>& Inputs(const std::string& name) const;
//! Get a output with argument's name described in `op_proto`
const std::string& Output(const std::string& name) const;
std::string Output(const std::string& name) const;
//! Get an output which has multiple variables.
//! TODO add a vector_view to prevent memory copy.
const std::vector<std::string>& Outputs(const std::string& name) const;
......@@ -127,6 +127,10 @@ class OperatorBase {
// IG (Inputs Gradients)
VariableNameMap outputs_;
AttributeMap attrs_;
private:
void GenerateTemporaryNames();
void CheckAllInputOutputSet() const;
};
// Macro for define a clone method.
......@@ -238,11 +242,13 @@ class InferShapeContext {
}
const Variable* InputVar(const std::string& name) const {
return scope_.FindVar(op_.Input(name));
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* OutputVar(const std::string& name) const {
return scope_.FindVar(op_.Output(name));
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
const std::vector<const Variable*> MultiInputVar(
......@@ -250,9 +256,11 @@ class InferShapeContext {
auto names = op_.Inputs(name);
std::vector<const Variable*> res;
res.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) { return scope_.FindVar(name); });
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
......@@ -260,24 +268,24 @@ class InferShapeContext {
auto names = op_.Outputs(name);
std::vector<const Variable*> res;
res.reserve(names.size());
std::transform(
names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) { return scope_.FindVar(name); });
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
template <typename T>
const T* Input(const std::string& name) const {
auto* var = InputVar(name);
PADDLE_ENFORCE_NOT_NULL(var, "Input(%s) should not be nullptr", name);
return &var->Get<T>();
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* Output(const std::string& name) const {
auto var = OutputVar(name);
PADDLE_ENFORCE_NOT_NULL(var, "Output(%s) should not be nullptr", name);
return var->GetMutable<T>();
return var == nullptr ? nullptr : var->GetMutable<T>();
}
template <typename T>
......@@ -288,10 +296,7 @@ class InferShapeContext {
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) {
auto var = scope_.FindVar(sub_name);
PADDLE_ENFORCE_NOT_NULL(
var, "MultiInput(%s:%s) should not be nullptr", name,
sub_name);
return &var->Get<T>();
return var == nullptr ? nullptr : &var->Get<T>();
});
return res;
}
......@@ -304,10 +309,7 @@ class InferShapeContext {
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) {
auto var = scope_.FindVar(sub_name);
PADDLE_ENFORCE_NOT_NULL(
var, "MultiOutput(%s:%s) should not be nullptr.", name,
sub_name);
return var->GetMutable<T>();
return var == nullptr ? nullptr : var->GetMutable<T>();
});
return res;
}
......
......@@ -117,6 +117,8 @@ inline void Tensor::CopyFrom(const Tensor& src,
memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
}
PADDLE_ENFORCE(cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in Tensor CopyFrom");
#endif
}
......
......@@ -21,6 +21,8 @@ if(USE_NNPACK)
endif()
endif()
list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
add_dependencies(paddle_function ${external_project_dependencies})
add_dependencies(paddle_function paddle_proto)
......@@ -42,11 +44,11 @@ if(WITH_GPU)
add_simple_unittest(RowConvOpTest)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(DepthwiseConvOpTest)
endif()
add_simple_unittest(Im2ColTest)
add_simple_unittest(GemmConvOpTest)
add_simple_unittest(DepthwiseConvOpTest)
endif()
add_style_check_target(paddle_function ${h_files})
......
......@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
}
#endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// Compiled only when NEON is available (see the enclosing #if).
// Invokes the DepthwiseConvolution test helper with CPU device types for the
// function pair "GemmConv-CPU" / "NeonDepthwiseConv-CPU" in `forward` mode.
// NOTE(review): the helper presumably compares the outputs of the two
// functions; its definition is not visible here -- confirm.
TEST(DepthwiseConv, Forward) {
DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
"GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
}
#endif
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "TensorShape.h"
#include "TensorType.h"
#include "neon/neon_util.h"
namespace paddle {
......@@ -93,4 +94,95 @@ public:
int paddingWidth);
};
/**
 * Zero-pads a batch of 2-D planes.
 *
 * `src` holds `channels` planes of inputHeight x inputWidth elements, stored
 * back to back. `dest` receives the same planes surrounded by `paddingHeight`
 * zero rows above and below and `paddingWidth` zero columns left and right,
 * i.e. each output plane is
 * (inputHeight + 2 * paddingHeight) x (inputWidth + 2 * paddingWidth).
 *
 * Top/bottom borders are cleared with memset, so T is expected to be a type
 * whose all-zero byte pattern represents zero.
 */
template <class T>
struct Padding {
  static void run(const T* src,
                  T* dest,
                  int channels,
                  int inputHeight,
                  int inputWidth,
                  int paddingHeight,
                  int paddingWidth) {
    const int paddedRowLen = inputWidth + 2 * paddingWidth;
    const T* in = src;
    T* out = dest;

    for (int plane = 0; plane < channels; ++plane) {
      // Top border rows.
      if (paddingHeight > 0) {
        memset(out, 0, paddedRowLen * paddingHeight * sizeof(T));
        out += paddedRowLen * paddingHeight;
      }

      for (int row = 0; row < inputHeight; ++row) {
        // Left border.
        for (int k = paddingWidth; k > 0; --k) {
          *out++ = T(0);
        }

        // Payload row.
        memcpy(out, in, inputWidth * sizeof(T));
        out += inputWidth;
        in += inputWidth;

        // Right border.
        for (int k = paddingWidth; k > 0; --k) {
          *out++ = T(0);
        }
      }

      // Bottom border rows.
      if (paddingHeight > 0) {
        memset(out, 0, paddedRowLen * paddingHeight * sizeof(T));
        out += paddedRowLen * paddingHeight;
      }
    }
  }
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
struct Padding<float> {
static void run(const float* src,
float* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(src);
vst1q_f32(dest, s0);
src += 4;
dest += 4;
}
for (int r = 0; r < remain; r++) {
*dest++ = *src++;
}
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
}
}
};
#endif
} // namespace paddle
此差异已折叠。
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
namespace paddle {
namespace neon {
// Loads four consecutive floats starting at `p`. The pointer is first passed
// through __builtin_assume_aligned, promising the compiler that it is aligned
// to sizeof(float32x4_t) bytes; callers must guarantee that alignment.
inline float32x4_t vld1q_f32_aligned(const float* p) {
  const void* assumedAligned =
      __builtin_assume_aligned(p, sizeof(float32x4_t));
  return vld1q_f32(static_cast<const float*>(assumedAligned));
}
#ifndef __aarch64__
// Fallback horizontal add, defined only when __aarch64__ is not set (see the
// enclosing #ifndef): returns the sum of the four lanes of `a`.
// The lanes are combined as (a[2] + a[0]) + (a[3] + a[1]).
inline float32_t vaddvq_f32(float32x4_t a) {
  const float32x2_t upperHalf = vget_high_f32(a);
  const float32x2_t lowerHalf = vget_low_f32(a);
  const float32x2_t halfSums = vadd_f32(upperHalf, lowerHalf);
  const float32x2_t total = vpadd_f32(halfSums, halfSums);
  return vget_lane_f32(total, 0);
}
// Fallback defined only when __aarch64__ is not set (see the enclosing
// #ifndef). Extracts lane `lane` of `v` and forwards it, together with `a`
// and `b`, to vmlaq_n_f32 (multiply-accumulate by scalar).
// NOTE(review): vgetq_lane_f32 normally needs a compile-time constant lane,
// so this presumably relies on being inlined with a literal `lane` -- confirm.
inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
#endif
} // namespace neon
} // namespace paddle
#endif
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Conv3DLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(conv3d, Conv3DLayer);
bool Conv3DLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
int index = 0;
for (auto &inputConfig : config_.inputs()) {
const ConvConfig &conf = inputConfig.conv_conf();
M_.push_back(numFilters_ / conf.groups());
K_.push_back(filterPixels_[index] * filterChannels_[index]);
// create a new weight
size_t height, width;
width = filterPixels_[index] * filterChannels_[index];
height = numFilters_;
CHECK_EQ(parameters_[index]->getSize(), width * height);
Weight *w = new Weight(height, width, parameters_[index]);
weights_.emplace_back(w);
++index;
}
if (biasParameter_.get()) {
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(1, numFilters_, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
}
}
return true;
}
// Recomputes the output geometry and returns the per-sample output size.
//
// Side effects: clears and refills outputW_, outputH_, outputD_ and N_
// (N_[i] = outputD_[i] * outputH_[i] * outputW_[i]) for every input, and sets
// the frame height/width/depth of getOutput() from input 0.
//
// Every input must produce the same output size N_[i] * numFilters_; this is
// enforced with CHECK. The returned value is that common size. It is NOT
// summed over inputs: forward() accumulates all inputs into the same output
// buffer, and accumulating the size here would contradict the CHECK for any
// layer with more than one input.
size_t Conv3DLayer::getSize() {
  CHECK_NE(inputLayers_.size(), 0UL);
  outputH_.clear();
  outputW_.clear();
  outputD_.clear();
  N_.clear();
  size_t layerSize = 0;
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
    outputW_.push_back(outputSize(
        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
    outputH_.push_back(outputSize(
        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
    outputD_.push_back(outputSize(
        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);

    // All inputs must agree on the output size; keep the common value.
    const size_t inputLayerSize = N_[i] * size_t(numFilters_);
    CHECK(layerSize == 0 || inputLayerSize == layerSize);
    layerSize = inputLayerSize;
  }
  getOutput().setFrameHeight(outputH_[0]);
  getOutput().setFrameWidth(outputW_[0]);
  getOutput().setFrameDepth(outputD_[0]);
  return layerSize;
}
// Forward pass.
//
// Resizes the output to batchSize x getSize() via resetOutput, then for every
// input and every sample expands the input volume into colBuf_ with vol2Col
// and, per group, multiplies the group's weight rows with the group's rows of
// colBuf_ into the matching rows of the sample's output. Finally adds the
// bias (if a bias parameter exists) and applies the activation.
void Conv3DLayer::forward(PassType passType) {
Layer::forward(passType);
// Batch size is taken from the first input's output matrix height.
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
// getSize() also refreshes outputW_/outputH_/outputD_ and N_.
int outWidth = getSize();
resetOutput(batchSize, outWidth);
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
const MatrixPtr &inMat = getInputValue(i);
const MatrixPtr &outMat = getOutputValue();
// GEMM dimensions for input i (see init() / getSize()).
int M = M_[i];
int N = N_[i];
int K = K_[i];
// Column buffer holds all groups: (K * groups) x N.
Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
MatrixPtr wMat = weights_[i]->getW();
for (int n = 0; n < batchSize; ++n) {
// Expand sample n of the input into colBuf_.
// NOTE(review): vol2Col is presumably the 3-D analogue of im2col -- confirm.
colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
channels_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
strideZ_[i],
strideY_[i],
stride_[i],
paddingZ_[i],
paddingY_[i],
padding_[i]);
// View of sample n's output row as a (groups * M) x N matrix.
real *outData = outMat->getData() + n * outMat->getStride();
MatrixPtr outMatSub =
Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
for (int g = 0; g < groups_[i]; g++) {
// Rows [g*M, g*M+M) of the weights/output and [g*K, g*K+K) of colBuf_.
MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
MatrixPtr in = colBuf_->subMatrix(g * K, K);
MatrixPtr out = outMatSub->subMatrix(g * M, M);
// NOTE(review): the trailing (1.0, 1.0) presumably means
// out = 1.0 * wMatSub * in + 1.0 * out, i.e. results of several inputs
// accumulate into the same output -- confirm against Matrix::mul.
out->mul(*wMatSub, *in, 1.0, 1.0);
}
}
}
// Bias is added only when a bias parameter was configured.
if (nullptr != this->biasParameter_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
this->addBias();
}
forwardActivation();
}
// Backward pass.
//
// Runs backwardActivation(), then accumulates the bias gradient (when a bias
// with a gradient buffer exists), and for every input computes the weight
// gradient (when the weight has a gradient buffer) and the input gradient
// (when the input has one). incUpdate(callback) is invoked on the bias
// parameter and on every weight parameter.
void Conv3DLayer::backward(const UpdateCallback &callback) {
backwardActivation();
if (biases_ && biases_->getWGrad()) {
bpropBiases();
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
if (weights_[i]->getWGrad()) {
bpropWeights(i);
}
if (getInputGrad(i)) {
bpropData(i);
}
// NOTE(review): a second timer macro in the same scope as "BwdConv3D";
// whether the two timers nest or conflict depends on the macro definition,
// which is not visible here -- confirm.
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
// Called for every input, regardless of whether a weight gradient exists.
weights_[i]->getParameterPtr()->incUpdate(callback);
}
}
void Conv3DLayer::bpropWeights(int i) {
int M = M_[i];
int N = N_[i];
int K = K_[i];
const MatrixPtr &inMat = getInputValue(i);
Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
MatrixPtr wGradMat = weights_[i]->getWGrad();
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
for (int n = 0; n < batchSize; ++n) {
colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
channels_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
strideZ_[i],
strideY_[i],
stride_[i],
paddingZ_[i],
paddingY_[i],
padding_[i]);
real *outGradData =
getOutputGrad()->getData() + n * getOutputGrad()->getStride();
MatrixPtr outGradSub =
Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
for (int g = 0; g < groups_[i]; ++g) {
MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
MatrixPtr outG = outGradSub->subMatrix(g * M, M);
MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
}
}
}
void Conv3DLayer::bpropData(int i) {
int M = M_[i];
int N = N_[i];
int K = K_[i];
Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
MatrixPtr wMat = weights_[i]->getW();
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
for (int n = 0; n < batchSize; ++n) {
real *outGradData =
getOutputGrad()->getData() + n * getOutputGrad()->getStride();
real *preGradData =
getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
MatrixPtr outGradSub =
Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
for (int g = 0; g < groups_[i]; ++g) {
MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
MatrixPtr outG = outGradSub->subMatrix(g * M, M);
MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
}
colBuf_->col2Vol(preGradData,
channels_[i],
imgSizeD_[i],
imgSizeH_[i],
imgSizeW_[i],
filterSizeZ_[i],
filterSizeY_[i],
filterSize_[i],
strideZ_[i],
strideY_[i],
stride_[i],
paddingZ_[i],
paddingY_[i],
padding_[i],
1.0,
1.0);
}
}
void Conv3DLayer::bpropBiases() {
MatrixPtr outGradMat = getOutputGrad();
if (this->sharedBiases_) {
biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
} else {
biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
}
}
void Conv3DLayer::addBias() {
MatrixPtr outMat = getOutputValue();
if (this->sharedBiases_) {
outMat->addSharedBias(*(biases_->getW()), 1.0f);
} else {
outMat->addBias(*(biases_->getW()), 1.0f);
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "ConvBaseLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
 * @brief A 3-D convolution layer derived from ConvBaseLayer.
 * This layer expands the input volume into a column buffer (vol2Col) and
 * uses matrix multiplication to calculate the convolution.
 */
class Conv3DLayer : public ConvBaseLayer {
public:
explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
~Conv3DLayer() {}
/// Creates the weights and the optional bias; returns false if the base
/// class initialization fails.
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/// Forward pass: per-sample vol2Col followed by one GEMM per group.
void forward(PassType passType);
/// Adds the (shared or per-element) bias to the output value.
void addBias();
/// Backward pass: bias, weight and input gradients plus parameter updates.
void backward(const UpdateCallback& callback);
/// Accumulates the output gradient into the bias gradient.
void bpropBiases();
/// Computes the gradient with respect to input i.
void bpropData(int i);
/// Accumulates the gradient of the weights of input i.
void bpropWeights(int i);
/// Recomputes the output dimensions and returns the output size per sample.
size_t getSize();
protected:
// Figure out the dimensions for individual gemms.
IntV M_; /// numFilters_ / groups (filters per group)
IntV N_; /// outputD_ * outputH_ * outputW_
IntV K_; /// filterChannels_ * filterSizeZ_ * filterSizeY_ * filterSize_
/// Scratch buffer of shape (K * groups) x N filled by vol2Col / read by
/// col2Vol.
MatrixPtr colBuf_;
};
} // namespace paddle
......@@ -38,7 +38,6 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
strideY_.push_back(conf.stride_y());
dilationY_.push_back(conf.dilation_y());
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
: conf.img_size());
......@@ -47,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterChannels_.push_back(conf.filter_channels());
outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
outputW_.push_back(conf.output_x());
paddingZ_.push_back(conf.padding_z());
strideZ_.push_back(conf.stride_z());
filterSizeZ_.push_back(conf.filter_size_z());
imgSizeD_.push_back(conf.img_size_z());
outputD_.push_back(conf.output_z());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
filterSizeZ_.back());
}
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = (!isDeconv_) ? numFilters_ : channels_[i];
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
}
/* initialize the biases_ */
if (biasParameter_.get()) {
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
}
}
// create new weights_ in derived class
// create new biases_ in derived class
// default caffe model
caffeMode_ = true;
......
......@@ -62,6 +62,13 @@ protected:
IntV outputH_;
/// The spatial dimensions of output feature map width.
IntV outputW_;
IntV outputD_;
IntV imgSizeD_;
IntV filterSizeZ_;
IntV strideZ_;
IntV paddingZ_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
......
......@@ -318,7 +318,9 @@ public:
void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
void backwardImp(Matrix& outputValue,
Argument& label,
Matrix& outputGrad) override {}
};
/**
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "CrossEntropyOverBeam.h"
namespace paddle {
void CostForOneSequence::calValidExpandStep() {
validExpansionCount_ = 0;
goldAsExtraPath_ = true;
for (size_t i = 0; i < beams_->expansionCount; ++i) {
real gold = static_cast<real>(beams_->gold[i]);
if (i) {
real* start = beams_->candidateIds[i - 1]->getData();
goldRowIds_[i] = std::count_if(
start,
start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
[](const real& val) { return val != -1.; });
} else {
goldRowIds_[i] = 0;
}
real* start =
beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
real* findEnd = std::find(start, start + beamSize_, gold);
validExpansionCount_++;
if (start + beamSize_ == findEnd) return;
goldColIds_[i] = findEnd - start;
}
if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
}
/*
 * Initializes the path bookkeeping for the last valid expansion step
 * (index validExpansionCount_ - 1).
 *
 * Resizes pathRowIdsInEachBeam_ and parentIdsInBeam_, fills the entries that
 * belong to the last step, sets goldIdsInFinalExpansion_, and returns the
 * total number of paths (including the extra gold path, if one is added).
 */
size_t CostForOneSequence::initLastExpansion() {
  int beamId = validExpansionCount_ - 1;
  const MatrixPtr candidates = beams_->candidateIds[beamId];
  size_t height = candidates->getHeight();

  /* Count the candidates of the last expansion that are not -1. */
  size_t pathCount = std::count_if(candidates->getData(),
                                   candidates->getData() + height * beamSize_,
                                   [](const real& val) { return val != -1; });
  /*
   * If the gold sequence falls off the beam during search, add the gold
   * sequence as the last path of all expanded candidates. Its index is the
   * path count before the increment.
   */
  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;

  /* One row-id vector per valid step, one slot per path, all zeroed. */
  pathRowIdsInEachBeam_.clear();
  pathRowIdsInEachBeam_.resize(validExpansionCount_,
                               std::vector<int>(pathCount, 0));
  parentIdsInBeam_.clear();
  parentIdsInBeam_.resize(pathCount, 0);

  if (goldAsExtraPath_) {
    /* Add the gold sequence to the total expansion (it is the last path). */
    pathRowIdsInEachBeam_[beamId].back() =
        beams_->gold[beamId] +
        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
  } else {
    /*
     * Gold is inside the beam: its path index is the number of non -1
     * candidates stored before it in the candidate matrix.
     */
    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
    goldIdsInFinalExpansion_ =
        std::count_if(candidates->getData(),
                      candidates->getData() + goldOffset,
                      [](const real& val) { return val != -1.; });
  }

  /*
   * TODO(caoying): fix this, store the indices of selected candidate
   * paths into Argument.ids
   */
  real* ids = candidates->getData();
  size_t curIdx = 0;
  for (size_t i = 0; i < height; ++i) {
    /* Offset of row i's sequence relative to the first row of this beam. */
    int basePos = getSeqStartPos(beamId, i);
    for (size_t j = 0; j < beamSize_; ++j) {
      int id = ids[i * beamSize_ + j];
      /* -1 marks an empty candidate slot; it contributes no path. */
      if (id == -1) continue;
      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
      /* Remember the row this path came from. */
      parentIdsInBeam_[curIdx++] = i;
    }
  }
  return pathCount;
}
/*
 * Constructs the entire expanded beam: the last valid expansion step is
 * initialized by initLastExpansion(), then every earlier step is filled in by
 * walking backwards and mapping each path to its row in that step.
 *
 * When the gold sequence is carried as an extra path it occupies the last
 * slot (index totalPathCount - 1) of every step.
 */
void CostForOneSequence::constructTotalExpansion() {
  /*
   * construct the entire expanded beam by beginning with the last search
   * in which gold falls off the beam.
   */
  const size_t totalPathCount = initLastExpansion();

  /* Paths that really come from the beam, i.e. without the extra gold path. */
  const size_t beamPathCount =
      goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount;

  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
    const MatrixPtr candidates = beams_->candidateIds[beamId];
    real* ids = candidates->getData();

    int lastParentIdInBeam = -1;
    int basePos = -1;
    for (size_t i = 0; i < beamPathCount; ++i) {
      int id = ids[parentIdsInBeam_[i]];
      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
      /* Only look the start position up again when the parent changes. */
      if (parentIdsInBeam_[i] != lastParentIdInBeam)
        basePos = getSeqStartPos(beamId, parentRowId);

      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
      lastParentIdInBeam = parentIdsInBeam_[i];
      parentIdsInBeam_[i] = parentRowId;
    }

    /*
     * The gold slot does not depend on the loop above, so fill it once per
     * step. Doing it here (instead of inside the loop) also covers the case
     * where the gold path is the only path and the loop body never runs.
     */
    if (goldAsExtraPath_) {
      pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
          beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
    }
  }
}
/*
 * Sums, for every path, the scores it collects over all valid expansion
 * steps, applies a softmax over the paths, and returns the negative log of
 * the probability assigned to the gold path.
 */
real CostForOneSequence::globallyNormalizedScore() {
  expandedPathScores_.resize(validExpansionCount_);

  const size_t pathCount = pathRowIdsInEachBeam_[0].size();
  Matrix::resizeOrCreate(softmaxOut_, 1, pathCount, false, false);
  softmaxOut_->zeroMem();

  /* Column-shaped matrix created on top of softmaxOut_'s data pointer. */
  MatrixPtr accumulated = Matrix::create(
      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);

  for (size_t step = 0; step < validExpansionCount_; ++step) {
    std::vector<int>& rowIdx = pathRowIdsInEachBeam_[step];
    MatrixPtr& stepScores = expandedPathScores_[step];

    Matrix::resizeOrCreate(stepScores, rowIdx.size(), 1, false, false);
    stepScores->zeroMem();

    /* Gather this step's score for every path and add it to the running sum. */
    IVectorPtr rowIds = IVector::create(rowIdx.data(), rowIdx.size(), false);
    stepScores->selectRows(*(beams_->scores[step]), *rowIds);
    accumulated->add(*stepScores);
  }

  softmaxOut_->softmax(*softmaxOut_);
  const real goldProb = softmaxOut_->getData()[goldIdsInFinalExpansion_];
  return -std::log(goldProb);
}
/*
 * Runs the three forward stages in order: locate gold in each expansion,
 * build the full set of expanded paths, then compute the cost of the
 * sequence.
 */
real CostForOneSequence::forward() {
  calValidExpandStep();
  constructTotalExpansion();
  const real cost = globallyNormalizedScore();
  return cost;
}
void CostForOneSequence::backward() {
/*
* when softmax layer is the output layer, and it is combined with
* cross-entropy as cost. The derivate with regard to softmax's input
* is simply:
*
* grad_i = softmax_out_i - target_i,
*
* and here hard label is used.
*/
softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
MatrixPtr tmp = Matrix::create(
softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
for (size_t i = 0; i < validExpansionCount_; ++i) {
IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
pathRowIdsInEachBeam_[i].size(),
false);
/*
beams_->scoreGrad[i] has been intialized outside this class, this
class only keeps a pointer pointing to the original input gradients,
so here does not need to allocate or initalize the memory.
*/
tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
}
}
REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
/*
 * Initializes the layer. Inputs come in groups of three per beam expansion:
 * candidate scores, selected candidates, and the gold sequence, so the number
 * of inputs must be a multiple of three.
 *
 * Returns false when the base-class initialization fails, true otherwise.
 */
bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
  /* Initialize the basic parent class and propagate its failure. */
  if (!Layer::init(layerMap, parameterMap)) return false;

  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";

  beamExpanCount_ = inputLayers_.size() / 3;

  /* One slot per beam expansion in every per-expansion container. */
  candidateScores_.resize(beamExpanCount_);
  candidateScoreGrad_.resize(beamExpanCount_);

  candidateInBeam_.resize(beamExpanCount_);
  goldSequence_.resize(beamExpanCount_);
  gradToInputs_.resize(beamExpanCount_);

  setNeedSequenceInfo(false);
  return true;
}
/*
 * Validates the inputs of every beam expansion and derives batchSize_ and
 * beamSize_ from the first expansion.
 *
 * Expansion i uses inputs 3*i (scores), 3*i + 1 (selected candidates) and
 * 3*i + 2 (gold sequence).
 */
void CrossEntropyOverBeam::checkInputs() {
  batchSize_ = 0;
  for (size_t i = 0; i < beamExpanCount_; ++i) {
    const Argument& scores = getInput(i * 3);
    const Argument& selCandidates = getInput(i * 3 + 1);
    const Argument& goldSeq = getInput(i * 3 + 2);

    if (i) {
      /* Later expansions: scores must be nested sequences whose shape agrees
       * with the batch and beam size taken from the first expansion. */
      CHECK(scores.hasSubseq()) << "input " << i << " "
                                << inputLayers_[i * 3]->getName()
                                << " should be a nested sequence";
      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
    } else {
      /* First expansion: a plain sequence; it defines batchSize_ (number of
       * sequences) and beamSize_ (width of the selected-candidate matrix). */
      CHECK(scores.hasSeq()) << "input " << i << " "
                             << inputLayers_[i]->getName()
                             << " should be a sequence";
      batchSize_ = scores.getNumSequences();
      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
    }
    /* Scores are a single column; there is one gold entry per sequence. */
    CHECK_EQ(1U, scores.value->getWidth());
    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
  }
}
/*
 * Makes every input available on the CPU and computes beamSplitPos_.
 *
 * Inputs that live in GPU memory are copied into CPU buffers owned by this
 * layer; inputs that are already on the CPU are shared (the smart pointer is
 * copied, not the data).
 *
 * beamSplitPos_[seq][step] is the (exclusive) end row of sequence `seq` inside
 * the candidate matrix of expansion `step`.
 */
void CrossEntropyOverBeam::copyInputsToCpu() {
  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
    if (dynamic_cast<GpuMatrix*>(src.get())) {
      Matrix::resizeOrCreate(
          trg, src->getHeight(), src->getWidth(), false, false);
      trg->copyFrom(*src);
    } else {
      /*
       * `src` is a const reference, so it can only be copied; a std::move
       * here would silently degrade to the same copy.
       */
      trg = src;
    }
  };

  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
    if (dynamic_cast<GpuIVector*>(src.get())) {
      IVector::resizeOrCreate(trg, src->getSize(), false);
      trg->copyFrom(*src);
    } else {
      /* Share the CPU vector (see the note in copyValue). */
      trg = src;
    }
  };

  beamSplitPos_.clear();
  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
  for (size_t i = 0; i < beamExpanCount_; ++i) {
    copyValue(getInputValue(i * 3), candidateScores_[i]);
    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);

    if (i) {
      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
      const int* seqStarts = seqInfo->getMutableData(false);
      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
      const int* subSeqStarts = subSeqInfo->getMutableData(false);

      /*
       * Count the sub-sequences of every sequence. Whenever a sub-sequence
       * starts exactly where the next sequence starts, move on to that
       * sequence and carry the running total over, so the stored values are
       * cumulative end positions.
       */
      size_t seqId = 1;
      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
           ++subSeqId) {
        CHECK_LT(seqId, seqInfo->getSize());
        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
          seqId++;
        }
        beamSplitPos_[seqId - 1][i]++;
      }
    } else {
      /* First expansion: exactly one candidate row per sequence. */
      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
    }
  }
}
/*
 * Splits the batched inputs into one BeamExpansion per sequence
 * (beamPerSeq_[j]). For every expansion i and sequence j it stores the
 * slices of the scores, score gradients, sequence start positions and
 * candidate ids that belong to that sequence, plus the gold index.
 *
 * NOTE(review): the Matrix::create / IVector::create overloads taking a raw
 * pointer presumably wrap the existing memory without copying -- confirm.
 */
void CrossEntropyOverBeam::splitBatchBeams() {
  beamCosts_.resize(batchSize_);
  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));

  for (size_t i = 0; i < beamExpanCount_; ++i) {
    int* seqStarts =
        getInput(i * 3).sequenceStartPositions->getMutableData(false);

    int* subSeqStarts = nullptr;
    /* Number of rows available in the start-position table used below. */
    int maxLen = 0;
    if (i) {
      subSeqStarts =
          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
    } else {
      /* i is 0 in this branch, so this reads input 0. */
      maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
    }

    for (size_t j = 0; j < batchSize_; ++j) {
      /* Scores and their gradients of sequence j: rows
       * [seqStarts[j], seqStarts[j + 1]). */
      beamPerSeq_[j].scores[i] =
          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
                         seqStarts[j + 1] - seqStarts[j],
                         1,
                         false,
                         false);
      beamPerSeq_[j].scoreGrad[i] =
          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
                         seqStarts[j + 1] - seqStarts[j],
                         1,
                         false,
                         false);

      /* Rows of the candidate matrix that belong to sequence j, taken from
       * the cumulative positions in beamSplitPos_. */
      int offset = j ? beamSplitPos_[j - 1][i] : 0;
      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
      CHECK_GE(maxLen, offset + height);
      /* height + 1 start positions delimit `height` rows. */
      beamPerSeq_[j].seqInfo[i] = IVector::create(
          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);

      beamPerSeq_[j].candidateIds[i] =
          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
                         height,
                         beamSize_,
                         false,
                         false);
      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];

      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
    }
  }
}
void CrossEntropyOverBeam::resizeOutput() {
Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
output_.value->zeroMem();
for (size_t i = 0; i < beamExpanCount_; ++i) {
MatrixPtr inGrad = getInputGrad(i * 3);
if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
Matrix::resizeOrCreate(candidateScoreGrad_[i],
inGrad->getHeight(),
inGrad->getWidth(),
false,
false);
} else {
candidateScoreGrad_[i] = std::move(inGrad);
}
candidateScoreGrad_[i]->zeroMem();
}
}
/*
 * Copies the CPU score gradients of the first `copyCount` beam expansions
 * back into the input gradients that live in GPU memory. Input gradients
 * that are not GPU matrices are left untouched.
 *
 * copyCount is clamped to beamExpanCount_; a copyCount of 0 copies nothing
 * (previously `copyCount - 1` wrapped around and every expansion was copied).
 */
void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
  for (size_t i = 0; i < beamExpanCount_ && i < copyCount; ++i) {
    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);
  }
}
/*
 * Forward pass: validate the inputs, bring them to the CPU, prepare the
 * output and gradient buffers, split the batch into per-sequence beams, and
 * finally compute one cost per sequence.
 */
void CrossEntropyOverBeam::forward(PassType passType) {
  Layer::forward(passType);

  checkInputs();
  copyInputsToCpu();
  resizeOutput();
  splitBatchBeams();

  const MatrixPtr costMat = getOutputValue();
  real* costs = costMat->getData();
  for (size_t seq = 0; seq < batchSize_; ++seq) {
    CostForOneSequence& seqCost = beamCosts_[seq];
    /* Each cost object works on its own copy of the per-sequence beams. */
    seqCost.setData(std::make_shared<BeamExpansion>(beamPerSeq_[seq]),
                    beamSize_);
    costs[seq] = seqCost.forward();
  }
}
void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
for (size_t i = 0; i < batchSize_; ++i) {
beamCosts_[i].backward();
copyGradToGpu(beamCosts_[i].getValidExpansionCount());
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CrossEntropyOverBeam.h"
#include "Layer.h"
namespace paddle {
/* This struct stores the beams in all search steps for a single sequence. */
struct BeamExpansion {
  /* Every container below holds one entry per expansion step. */
  std::vector<MatrixPtr> scores;
  std::vector<IVectorPtr> seqInfo;
  std::vector<MatrixPtr> candidateIds;
  std::vector<int> gold;
  std::vector<MatrixPtr> scoreGrad;

  /* Number of expansion steps. */
  size_t expansionCount;

  /* Creates `n` empty (null / zero) slots in every per-step container. */
  explicit BeamExpansion(int n)
      : scores(static_cast<size_t>(n)),
        seqInfo(static_cast<size_t>(n)),
        candidateIds(static_cast<size_t>(n)),
        gold(static_cast<size_t>(n)),
        scoreGrad(static_cast<size_t>(n)),
        expansionCount(n) {}
};
typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
/*
 * Computes the cross-entropy-over-beam cost, and its gradient, for a single
 * sequence. Call setData() first, then forward(), then backward().
 */
class CostForOneSequence {
public:
  CostForOneSequence()
      : beamSize_(0),
        validExpansionCount_(0),
        goldAsExtraPath_(false),
        goldIdsInFinalExpansion_(0) {}

  /*
   * Attaches the beams of one sequence and resets the per-step bookkeeping:
   * gold row ids start at 0 and gold column ids at -1 (meaning "not found").
   */
  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
    beams_ = bPtr;
    beamSize_ = beamSize;

    expandedPathScores_.clear();
    expandedPathScores_.resize(beams_->expansionCount);

    goldRowIds_.clear();
    goldRowIds_.resize(beams_->expansionCount, 0);
    goldColIds_.clear();
    goldColIds_.resize(beams_->expansionCount, -1);
  }

  /* Number of expansion steps visited by the last forward() call. */
  size_t getValidExpansionCount() const { return validExpansionCount_; }

  /* Returns the cost of the attached sequence. */
  real forward();
  /* Adds the gradient of the cost to the attached score gradients. */
  void backward();

private:
  void calValidExpandStep();
  void constructTotalExpansion();
  size_t initLastExpansion();
  real globallyNormalizedScore();

  /*
   * Returns the start position of row `rowId` in expansion `beamId`,
   * relative to the first start position stored for this sequence.
   */
  int getSeqStartPos(size_t beamId, size_t rowId) {
    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
    int* starts = beams_->seqInfo[beamId]->getData();
    return starts[rowId] - starts[0];
  }

  size_t beamSize_;
  size_t validExpansionCount_;
  /* True when gold is not in the last beam and is added as an extra path. */
  bool goldAsExtraPath_;
  /* Row / column of the gold candidate in every expansion step. */
  std::vector<int> goldRowIds_;
  std::vector<int> goldColIds_;

  BeamExpansionPtr beams_;
  /* pathRowIdsInEachBeam_[step][path]: score row used by `path` in `step`. */
  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
  std::vector<int> parentIdsInBeam_;
  /* Index of the gold path among all expanded paths. */
  size_t goldIdsInFinalExpansion_;

  std::vector<MatrixPtr> expandedPathScores_;
  MatrixPtr softmaxOut_;
};
/*
 * Layer computing a cross-entropy cost over beam expansions. Its inputs come
 * in groups of three per expansion (see init() and checkInputs()).
 */
class CrossEntropyOverBeam : public Layer {
public:
  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;

private:
  /* Helpers called by forward() / backward(). */
  void checkInputs();
  void copyInputsToCpu();
  void resizeOutput();
  void copyGradToGpu(size_t copyCount);
  void splitBatchBeams();

  /* Number of beam expansions, i.e. number of inputs divided by three. */
  size_t beamExpanCount_;
  /* Number of sequences in the current batch. */
  size_t batchSize_;
  /* Width of the selected-candidate matrices. */
  size_t beamSize_;

  /*
   * The process of constructing beams is not friendly to GPU. Currently this
   * layer only runs on CPU; if any of its inputs is in GPU memory, it is
   * copied to CPU memory.
   */
  std::vector<MatrixPtr> candidateScores_;
  std::vector<MatrixPtr> candidateScoreGrad_;
  std::vector<MatrixPtr> candidateInBeam_;
  std::vector<MatrixPtr> gradToInputs_;
  std::vector<IVectorPtr> goldSequence_;
  /* Filled by copyInputsToCpu(); indexed as [sequence][expansion]. */
  std::vector<std::vector<int>> beamSplitPos_;

  /*
   * Split the entire batch of beams into one beam per sequence and store the
   * result in this member.
   */
  std::vector<BeamExpansion> beamPerSeq_;

  /* beamCosts_ is used to propagate error in one sequence. */
  std::vector<CostForOneSequence> beamCosts_;
};
} // namespace paddle
......@@ -46,8 +46,26 @@ bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
projConf_.emplace_back(conf);
projections_.emplace_back(
Projection::create(*projConf_[i], parameters_[i], useGpu_));
// create a new weight
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = (!isDeconv_) ? numFilters_ : channels_[i];
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight *w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
}
if (biasParameter_.get()) {
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
}
}
if (biases_.get() && sharedBiases_) {
hl_create_tensor_descriptor(&biasDesc_);
hl_create_tensor_descriptor(&outputDesc_);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DeConv3DLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(deconv3d, DeConv3DLayer);
/*
 * Initializes the base convolution layer, then creates one weight per input
 * and, if a bias parameter exists, the bias.
 *
 * For deconvolution the kernel dimension is
 *   channel * output * depth * height * width
 * and the weight matrix is stored as
 *   (output * depth * height * width) x channel.
 */
bool DeConv3DLayer::init(const LayerMap &layerMap,
                         const ParameterMap &parameterMap) {
  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;

  const int inputCount = config_.inputs().size();
  for (int index = 0; index < inputCount; ++index) {
    /* Per-input gemm dimensions. */
    M_.push_back(filterChannels_[index]);
    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));

    /* Weight matrix of this input; its size must match the parameter. */
    const size_t height = filterPixels_[index] * numFilters_;
    const size_t width = filterChannels_[index];
    CHECK_EQ(parameters_[index]->getSize(), width * height);
    weights_.emplace_back(new Weight(height, width, parameters_[index]));
  }

  if (biasParameter_.get()) {
    if (!sharedBiases_) {
      /* One bias per output element. */
      biases_.reset(new Weight(1, getSize(), biasParameter_));
    } else {
      /* One bias per filter. */
      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
      biases_.reset(new Weight(1, numFilters_, biasParameter_));
    }
  }
  return true;
}
/*
 * Recomputes the output depth/height/width of every input from the input
 * image size, filter size, padding and stride, refreshes N_ (input volume)
 * and NOut_ (output volume), stores the frame dimensions of the first input
 * on the output argument, and returns the size of one output sample
 * (output volume * numFilters_).
 *
 * All inputs must produce the same output size, because forward()
 * accumulates every input into the same output matrix.
 */
size_t DeConv3DLayer::getSize() {
  CHECK_NE(inputLayers_.size(), 0UL);
  outputH_.clear();
  outputW_.clear();
  outputD_.clear();
  N_.clear();
  NOut_.clear();
  size_t layerSize = 0;
  for (size_t i = 0; i < inputLayers_.size(); ++i) {
    outputW_.push_back(
        imageSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
    outputH_.push_back(imageSize(
        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
    outputD_.push_back(imageSize(
        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
    NOut_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
    N_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);

    /*
     * Compare output sizes with each other (NOut_, not the input volume N_),
     * and keep a single size instead of summing the sizes of all inputs.
     */
    const size_t outputSize = size_t(NOut_[i]) * size_t(numFilters_);
    CHECK(layerSize == 0 || outputSize == layerSize)
        << "all inputs must produce the same output size";
    layerSize = outputSize;
  }
  getOutput().setFrameHeight(outputH_[0]);
  getOutput().setFrameWidth(outputW_[0]);
  getOutput().setFrameDepth(outputD_[0]);
  return layerSize;
}
/*
 * Forward pass of the 3D deconvolution.
 *
 * For every input and every sample, each group's weight block is multiplied
 * with the matching block of the input sample into colBuf_, and colBuf_ is
 * then folded into the output volume with col2Vol. The bias (if any) and the
 * activation are applied at the end.
 */
void DeConv3DLayer::forward(PassType passType) {
  Layer::forward(passType);
  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  /* getSize() also refreshes outputD_/H_/W_, N_ and NOut_. */
  int outWidth = getSize();
  resetOutput(batchSize, outWidth);
  const MatrixPtr outMat = getOutputValue();

  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
    const MatrixPtr &inMat = getInputValue(i);
    /* Gemm dimensions of input i: the input block is M x N, each group's
     * weight block and column-buffer block have K rows. */
    int M = M_[i];
    int N = N_[i];
    int K = K_[i];
    MatrixPtr wMat = weights_[i]->getW();
    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);

    for (int n = 0; n < batchSize; ++n) {
      /* Start of sample n in the input matrix. */
      real *inData = inMat->getData() + n * inMat->getStride();
      for (int g = 0; g < groups_[i]; ++g) {
        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
        /* colBuf block = weight block * input block
         * (the trailing 1.0, 0.0 are presumably the gemm scale factors that
         * overwrite the destination -- confirm in Matrix::mul). */
        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
        /* Advance to the next group's M x N block of this sample. */
        inData += M * N;
      }
      /* Fold the column buffer into sample n of the output volume. */
      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
                       numFilters_,
                       outputD_[i],
                       outputH_[i],
                       outputW_[i],
                       filterSizeZ_[i],
                       filterSizeY_[i],
                       filterSize_[i],
                       strideZ_[i],
                       strideY_[i],
                       stride_[i],
                       paddingZ_[i],
                       paddingY_[i],
                       padding_[i],
                       1.0,
                       1.0);
    }
  }
  if (nullptr != this->biasParameter_) {
    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
    this->addBias();
  }
  forwardActivation();
}
/*
 * Backward pass of the 3D deconvolution.
 *
 * After the activation gradient, the bias gradient is collected (if the bias
 * has one). Then, for every input and every sample, the output gradient is
 * unfolded into colBuf_ with vol2Col and used to accumulate the weight
 * gradient and the input gradient, group by group.
 */
void DeConv3DLayer::backward(const UpdateCallback &callback) {
  backwardActivation();
  int batchSize = getOutputGrad()->getHeight();
  if (biases_ && biases_->getWGrad()) {
    bpropBiases();
    biases_->getParameterPtr()->incUpdate(callback);
  }

  for (size_t i = 0; i < inputLayers_.size(); ++i) {
    /* Skip inputs for which neither gradient is needed. */
    if (weights_[i]->getWGrad() || this->needGradient_) {
      /* Same gemm dimensions as in forward(). */
      int M = M_[i];
      int N = N_[i];
      int K = K_[i];
      REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
      const MatrixPtr &inMat = getInputValue(i);

      for (int n = 0; n < batchSize; ++n) {
        /* Unfold sample n of the output gradient into the column buffer. */
        colBuf_->vol2Col(
            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
            numFilters_,
            outputD_[i],
            outputH_[i],
            outputW_[i],
            filterSizeZ_[i],
            filterSizeY_[i],
            filterSize_[i],
            strideZ_[i],
            strideY_[i],
            stride_[i],
            paddingZ_[i],
            paddingY_[i],
            padding_[i]);

        if (weights_[i]->getWGrad()) {
          /* Weight gradient: add colBuf block * transposed input block. */
          real *inData = inMat->getData() + n * inMat->getStride();
          for (int g = 0; g < groups_[i]; ++g) {
            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
            MatrixPtr wGradMatSub =
                weights_[i]->getWGrad()->subMatrix(g * K, K);
            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
            wGradMatSub->mul(
                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
            /* Next group's M x N block of this sample. */
            inData += M * N;
          }
        }

        if (getInputGrad(i)) {
          /* Input gradient: add transposed weight block * colBuf block. */
          real *preGrad =
              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
          for (int g = 0; g < groups_[i]; ++g) {
            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
            MatrixPtr inGradMatSub =
                Matrix::create(preGrad, M, N, false, useGpu_);
            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
            preGrad += M * N;
          }
        }
      }
      /* NOTE(review): this runs whenever the branch is entered, also when
       * the weight has no gradient -- confirm that this is intended. */
      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
}
// Intentionally empty: the weight gradient is accumulated inline in
// backward().
void DeConv3DLayer::bpropWeights(int i) {}
// Intentionally empty: the input gradient is accumulated inline in
// backward().
void DeConv3DLayer::bpropData(int i) {}
void DeConv3DLayer::bpropBiases() {
const MatrixPtr &outGradMat = getOutputGrad();
if (this->sharedBiases_) {
biases_->getWGrad()->collectSharedBias(*outGradMat, 1.0f);
} else {
biases_->getWGrad()->collectBias(*outGradMat, 1.0f);
}
}
void DeConv3DLayer::addBias() {
MatrixPtr outMat = getOutputValue();
if (this->sharedBiases_) {
outMat->addSharedBias(*(biases_->getW()), 1.0f);
} else {
outMat->addBias(*(biases_->getW()), 1.0f);
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "ConvBaseLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief A subclass of deconvolution3D layer.
* This layer expands input and use matrix multiplication to
* calculate deconvolution3D operation.
*/
class DeConv3DLayer : public ConvBaseLayer {
public:
  // Forwards the layer configuration to ConvBaseLayer.
  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
  ~DeConv3DLayer() {}
  // Creates the weights and the bias; returns false if the base layer fails
  // to initialize.
  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
  void forward(PassType passType);
  void addBias();
  void backward(const UpdateCallback& callback);
  void bpropBiases();
  // Empty stubs: the data and weight gradients are computed in backward().
  void bpropData(int i);
  void bpropWeights(int i);
  // Recomputes the output dimensions and returns the size of one output
  // sample.
  size_t getSize();

protected:
  // Figure out the dimensions for individual gemms (one entry per input).
  IntV M_;     /// filterChannels_
  IntV N_;     /// imgSizeD_ * imgSizeH_ * imgSizeW_ (input volume)
  IntV K_;     /// filterPixels_ * (numFilters_ / groups_)
  IntV NOut_;  /// outputD_ * outputH_ * outputW_ (output volume)
  // Column buffer with K_ * groups_ rows and N_ columns, resized in
  // forward() and backward().
  MatrixPtr colBuf_;
};
} // namespace paddle
......@@ -22,12 +22,31 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
/* Initialize the basic convolutional parent class */
ConvBaseLayer::init(layerMap, parameterMap);
int index = 0;
for (auto &inputConfig : config_.inputs()) {
const ConvConfig &conf = inputConfig.conv_conf();
/* Consistent caffe mode for multiple input */
caffeMode_ = conf.caffe_mode();
}
// create a new weight
size_t height, width;
height = filterPixels_[index] * filterChannels_[index];
width = (!isDeconv_) ? numFilters_ : channels_[index];
CHECK_EQ(parameters_[index]->getSize(), width * height);
Weight *w = new Weight(height, width, parameters_[index]);
weights_.emplace_back(w);
index++;
}
if (biasParameter_.get()) {
if (sharedBiases_) {
CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
biases_ =
std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
} else {
biases_ =
std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
}
}
getOutputSize();
return true;
......
......@@ -29,6 +29,10 @@ namespace paddle {
REGISTER_LAYER(exconv, ExpandConvLayer);
REGISTER_LAYER(exconvt, ExpandConvLayer);
// A convolution is treated as depthwise when the number of groups equals the
// number of input channels.
inline bool isDepthwiseConv(int channels, int groups) {
  const bool oneGroupPerChannel = (groups == channels);
  return oneGroupPerChannel;
}
bool ExpandConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
/* Initialize the basic convolutional parent class */
......@@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
// Convolution Layer uses the GemmConv function by default.
convType = "GemmConv";
convGradInputType = "GemmConvGradInput";
convGradFilterType = "GemmConvGradFilter";
// If depth wise convolution and useGpu == true
if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
convType = "DepthwiseConv";
convGradInputType = "DepthwiseConvGradInput";
convGradFilterType = "DepthwiseConvGradFilter";
} else {
convType = "GemmConv";
convGradInputType = "GemmConvGradInput";
convGradFilterType = "GemmConvGradFilter";
}
// If depth wise convolution and useGpu == false and ARM-NEON
if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
if ((filterSize_[i] == filterSizeY_[i]) &&
(filterSize_[i] == 3 || filterSize_[i] == 4) &&
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
convType = "NeonDepthwiseConv";
}
#endif
}
if (FLAGS_use_nnpack && !isDeconv_) {
......
......@@ -41,7 +41,7 @@ namespace paddle {
Layer::Layer(const LayerConfig& config, bool useGpu)
: config_(config),
useGpu_(useGpu),
deviceId_(-1),
deviceId_(CPU_DEVICE),
needSequenceInfo_(true) {}
bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
......
......@@ -59,7 +59,12 @@ protected:
LayerConfig config_;
/// whether to use GPU
bool useGpu_;
/// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
/// Paddle device ID, MKLDNN is -2, CPU is -1
enum PADDLE_DEVICE_ID {
  /// Device id used for MKLDNN.
  MKLDNN_DEVICE = -2,
  /// Device id used for the CPU (the default deviceId_ set by the Layer
  /// constructor).
  CPU_DEVICE = -1,
};
/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_;
/// Input layers
std::vector<LayerPtr> inputLayers_;
......@@ -77,6 +82,7 @@ protected:
Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_.
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_;
......@@ -172,6 +178,13 @@ protected:
return inputLayer.getOutput(deviceId_);
}
/**
* Get the argument of input layer with deviceId.
*/
const Argument& getInput(size_t inputIndex, int deviceId) const {
  // Look the producing layer up first, then ask it for the output that
  // belongs to the requested device.
  const auto& producer = inputLayers_[inputIndex];
  return producer->getOutput(deviceId);
}
/**
* Get the forward-input value.
*/
......@@ -186,6 +199,13 @@ protected:
return inputLayer.getOutput(deviceId_).value;
}
/**
* Get the forward-input value with deviceId.
*/
const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
  // Value matrix of the input layer's output on the requested device.
  const auto& producer = inputLayers_[inputIndex];
  return producer->getOutput(deviceId).value;
}
/**
* Get the forward-input grad.
*/
......@@ -200,6 +220,13 @@ protected:
return inputLayer.getOutput(deviceId_).grad;
}
/**
* Get the forward-input grad with deviceId.
*/
const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
  // Gradient matrix of the input layer's output on the requested device.
  const auto& producer = inputLayers_[inputIndex];
  return producer->getOutput(deviceId).grad;
}
/**
* Get the forward-input label.
*/
......
......@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
return;
}
// TODO(TJ): dst format should get from wgtVal_
int dstFmt = PARAM_FORMAT_MKLDNN_OI;
int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
if (srcFmt == dstFmt) {
return;
}
// The weight_ is transposed from initial paddle weight
MatrixPtr paddleWgt = Matrix::create(
weight_->getW()->getData(), iLayerSize_, oc_, false, false);
// TODO(TJ): remove this print when do not need differ weights
std::ostringstream ostr;
paddleWgt->print(ostr);
VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
// The mkldnn weight is transposed from initial paddle matrix
MatrixPtr paddleWgtT;
paddleWgt->transpose(paddleWgtT, true);
weight_->getW()->copyFrom(*paddleWgtT);
weight_->getParameterPtr()->setHeaderFormat(dstFmt);
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}
void MKLDNNFcLayer::convertWeightsToPaddle() {
MatrixPtr dnnWgt = weight_->getW();
MatrixPtr paddleWgt;
dnnWgt->transpose(paddleWgt, true);
// copy paddle weight and override on weight_
MatrixPtr dnnWgtT = Matrix::create(
dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
dnnWgtT->copyFrom(*paddleWgt);
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
// Publishes this layer's output to the outputs kept for other devices.
// Every CPU-device output simply shares the value pointer of output_ (the
// fc output needs no conversion); a warning is logged when more than one
// CPU-device output exists.
void MKLDNNFcLayer::convertOutputToOtherDevice() {
  copyOutputInfoToOtherDevice();

  int cpuOutputs = 0;
  for (auto& other : outputOtherDevice_) {
    if (other.deviceId != CPU_DEVICE) continue;
    other.value = output_.value;
    ++cpuOutputs;
  }

  if (cpuOutputs > 1) {
    LOG(WARNING) << "should not have more than one CPU devie";
  }
}
void MKLDNNFcLayer::reshape() {
const Argument& input = getInput(0);
const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize();
if (bs_ == batchSize) {
return;
......@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
if (iw_ == 0) {
iw_ = 1;
}
hasSpatial_ = true;
if (ih_ == 1 && iw_ == 1) {
hasSpatial_ = false;
}
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
ic_ = iLayerSize_ / (ih_ * iw_);
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
......@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW();
real* iData = getInputValue(0)->getData();
real* oData = getOutputValue()->getData();
real* wData = weight_->getW()->getData();
real* bData = hasBias ? biases_->getW()->getData() : NULL;
// TODO(TJ): below create should be covered in MkldnnMatrix
// create memory desc
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
: createMD({bs_, ic_}, format::nc);
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi);
memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
: createMD({}, format::format_undef);
memory::desc oMD = createMD({bs_, oc_}, format::nc);
// create memory primitive desc and memory self
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
const MatrixPtr& wgt = weight_->getW();
const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
const MatrixPtr& out = output_.value;
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& in = getInputValue(0);
inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
CHECK(inVal_) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
inVal_ = MKLDNNMatrix::create(
in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
inVal_->downSpatial();
wgtVal_ = MKLDNNMatrix::create(
wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
wgtVal_->downSpatial();
biasVal_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
if (!outputIsOnlyMKLDNN()) {
convertOutputToOtherDevice();
}
// create forward handle
prop_kind pk = prop_kind::forward;
fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
: fc_fwd::desc(pk, iMD, wMD, oMD);
fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
biasVal_->getMemoryDesc(),
outVal_->getMemoryDesc())
: fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
outVal_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
if (bData != NULL) {
biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
if (hasBias) {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
}
printValueFormatFlow();
pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_);
}
......@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
return;
}
needResetBwd_ = false;
bool hasBias = biases_ && biases_->getWGrad();
real* iData = getInputValue(0)->getData();
real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
real* oDiff = getOutputGrad()->getData();
real* wDiff = weight_->getWGrad()->getData();
real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
/// backward weight
// create memory desc for backward memory
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
: createMD({bs_, ic_}, format::nc);
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi);
memory::desc oMD = createMD({bs_, oc_}, format::nc);
memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
: createMD({}, format::format_undef);
if (inVal_) {
// update data
inVal_->set_data_handle(iData);
} else {
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
}
// create memory primitive desc and memory self
wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
CHECK(inVal_) << "Should have input value";
const MatrixPtr& wgt = weight_->getWGrad();
const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
// TODO(TJ): merge outgrad
int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
// for MKLDNN device:
// can not directly cast outputgrad to mkldnnmatrix,
// since each layer can not write the inputgrad to mkldnn inputgrad.
// So just create from matrix with outputvalue format.
// for CPU device:
// fc do not need to convert from cpu device since output is always nc format
// only need create from cpu device
const MatrixPtr& out = getOutput(device).grad;
outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
: nullptr;
// create memory primitive desc
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
: fc_bwdWgt::desc(iMD, wMD, oMD);
fc_bwdWgt::desc bwdWgtDesc = hasBias
? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
biasGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc())
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdWgt::primitive_desc bwdWgtPD =
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
if (bDiff != NULL) {
biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
if (hasBias) {
bwdWgt_.reset(
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
} else {
......@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_);
/// backward data
if (iDiff == NULL) {
device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
const MatrixPtr& in = getInputGrad(0, device);
if (in == nullptr) {
return;
}
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
if (getInput(0, device).getAllCount() > 1) {
// TODO(TJ): use outputMaps_ ways when merge outgrad done
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
}
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_);
}
......@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
// update input data
// since it might be changed if this is after data layer
real* iData = getInputValue(0)->getData();
inVal_->set_data_handle(iData);
syncInputValue();
// just submit forward pipeline
stream_->submit(pipelineFwd_);
......@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd();
// update diff
real* oDiff = getOutputGrad()->getData();
outGrad_->set_data_handle(oDiff);
syncOutputGrad();
// just submit backward pipeline
stream_->submit(pipelineBwd_);
}
......
......@@ -32,16 +32,13 @@ protected:
// if has already init the weight
bool hasInitedWgt_;
// if input layer has image size info (ih>1 && iw>1)
bool hasSpatial_;
// fc weight and bias
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> biases_;
public:
explicit MKLDNNFcLayer(const LayerConfig& config)
: MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
: MKLDNNLayer(config), hasInitedWgt_(false) {}
~MKLDNNFcLayer() {}
......@@ -75,6 +72,8 @@ protected:
* only would be called when needed
*/
void resetBwd();
void convertOutputToOtherDevice() override;
};
} // namespace paddle
......@@ -18,9 +18,9 @@ limitations under the License. */
#include "Layer.h"
#include "MKLDNNBase.h"
#include "mkldnn.hpp"
#include "paddle/math/MKLDNNMatrix.h"
DECLARE_bool(use_mkldnn);
DECLARE_bool(use_mkldnn_wgt);
namespace paddle {
......@@ -52,15 +52,15 @@ protected:
std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_;
// TODO(TJ): change below memory as MKLDNNMatrixPtr type
std::shared_ptr<mkldnn::memory> inVal_;
std::shared_ptr<mkldnn::memory> inGrad_;
std::shared_ptr<mkldnn::memory> outVal_;
std::shared_ptr<mkldnn::memory> outGrad_;
std::shared_ptr<mkldnn::memory> wgtVal_;
std::shared_ptr<mkldnn::memory> wgtGrad_;
std::shared_ptr<mkldnn::memory> biasVal_;
std::shared_ptr<mkldnn::memory> biasGrad_;
// MKLDNNMatrixPtr
MKLDNNMatrixPtr inVal_;
MKLDNNMatrixPtr inGrad_;
MKLDNNMatrixPtr outVal_;
MKLDNNMatrixPtr outGrad_;
MKLDNNMatrixPtr wgtVal_;
MKLDNNMatrixPtr wgtGrad_;
MKLDNNMatrixPtr biasVal_;
MKLDNNMatrixPtr biasGrad_;
public:
explicit MKLDNNLayer(const LayerConfig& config)
......@@ -83,17 +83,21 @@ public:
/**
 * Initialize the layer for the MKLDNN device.
 *
 * Aborts (via CHECK) when FLAGS_use_mkldnn is not set or when useGpu_ is
 * set. The layer device and the device of its parameters are switched to
 * MKLDNN_DEVICE before Layer::init runs; afterwards the MKLDNN stream and
 * the CPU engine are set up.
 *
 * @return false when Layer::init fails, true otherwise.
 */
virtual bool init(const LayerMap& layerMap,
                  const ParameterMap& parameterMap) {
  // Checked once, up front, before any state is modified.
  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn. "
                          << "Please set WITH_MKLDNN=ON "
                          << "and set use_mkldnn=True";
  CHECK(!useGpu_) << "Do not support GPU yet";

  // set device id before Layer::init
  setDevice(MKLDNN_DEVICE);
  // change param device to MKLDNN device
  setParamsDevice(MKLDNN_DEVICE, parameterMap);
  if (!Layer::init(layerMap, parameterMap)) {
    return false;
  }

  stream_.reset(new MKLDNNStream());
  engine_ = CPUEngine::Instance().getEngine();
  // TODO(TJ): deviceId
  return true;
}
......@@ -109,6 +113,12 @@ public:
*/
virtual void convertWeightsToPaddle() {}
/**
* convert MKLDNN output to other device.
* only support CPU device yet
*/
virtual void convertOutputToOtherDevice() {}
/**
* print info about sizes
*/
......@@ -118,14 +128,124 @@ public:
<< ", oh: " << oh_ << ", ow: " << ow_;
}
// TODO(TJ): move to MkldnnMatrix
// create memory desc
inline mkldnn::memory::desc createMD(
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
// TODO(TJ): isFmtSuppoted(fmt)
return mkldnn::memory::desc(dims, type, fmt);
/**
 * Log (at verbosity MKLDNN_FMTS) the memory format of the input value and
 * of the output value. Nothing is logged unless both are set.
 */
virtual void printValueFormatFlow() {
  if (!inVal_ || !outVal_) {
    return;
  }
  VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
                    << " >>> " << outVal_->getFormat();
}
/**
 * Log (at verbosity MKLDNN_FMTS) the memory format of the input grad and of
 * the output grad. Nothing is logged unless both are set.
 */
virtual void printGradFormatFlow() {
  if (!inGrad_ || !outGrad_) {
    return;
  }
  VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
                    << " <<< " << outGrad_->getFormat();
}
protected:
/**
 * Propagate the output meta information to every other-device output:
 * frame height and width, sequence start positions, sub-sequence start
 * positions and the cpu sequence dims.
 * @note: Layer::copyOutputToOtherDevice can not be used directly, since only
 *        the base info is copied here and the data value is left untouched.
 */
void copyOutputInfoToOtherDevice() {
  for (auto& other : outputOtherDevice_) {
    other.setFrameHeight(output_.getFrameHeight());
    other.setFrameWidth(output_.getFrameWidth());
    other.sequenceStartPositions = output_.sequenceStartPositions;
    other.subSequenceStartPositions = output_.subSequenceStartPositions;
    other.cpuSequenceDims = output_.cpuSequenceDims;
  }
}
/**
 * Tell whether the previous layer at @p index runs on the MKLDNN device.
 * Any other previous device must be the CPU device; everything else aborts
 * through CHECK_EQ.
 */
bool inputIsOnlyMKLDNN(int index = 0) {
  int prevDevice = getPrev(index)->getDeviceId();
  if (prevDevice != MKLDNN_DEVICE) {
    // do not support GPU yet
    CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
    return false;
  }
  return true;
}
/**
* If output only has MKLDNN device.
* Otherwise, other devices should only using CPU device.
*/
bool outputIsOnlyMKLDNN() {
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
<< "Only support other device is CPU yet";
}
return outputOtherDevice_.size() == 0;
}
/**
 * Refresh the data pointer of inVal_ from the CPU input value.
 * Does nothing when the input comes from an MKLDNN layer.
 */
void syncInputValue() {
  if (!inputIsOnlyMKLDNN()) {
    // The input buffer might have changed since the last pass, e.g. when
    // this layer directly follows a data layer.
    real* iData = getInputValue(0, CPU_DEVICE)->getData();
    inVal_->updateData(iData);
  }
}
/**
 * Refresh the data pointer of outGrad_ from the CPU output grad.
 * Does nothing when the output is consumed on the MKLDNN device only.
 */
void syncOutputGrad() {
  if (!outputIsOnlyMKLDNN()) {
    // point the mkldnn grad at the current CPU diff buffer
    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
    outGrad_->updateData(oDiff);
  }
}
/**
 * Set deviceId of this layer.
 * Only assigns deviceId_; init() calls this with MKLDNN_DEVICE before
 * Layer::init.
 * @param id device id stored into deviceId_
 */
void setDevice(int id) { deviceId_ = id; }
/**
 * Assign device @p id to every parameter this layer uses: the parameter of
 * each input that names one, plus the bias parameter when configured.
 * A parameter name that is missing from @p parameterMap aborts via CHECK.
 */
void setParamsDevice(int id, const ParameterMap& parameterMap) {
  // input (weight) parameters
  for (auto& inputConfig : config_.inputs()) {
    if (!inputConfig.has_input_parameter_name()) {
      continue;
    }
    ParameterPtr parameter;
    std::string name = inputConfig.input_parameter_name();
    CHECK(mapGet(name, parameterMap, &parameter))
        << "Cannot find input parameter " << name << " for layer "
        << getName();
    parameter->setDevice(id);
  }

  // bias parameter
  if (!config_.has_bias_parameter_name()) {
    return;
  }
  ParameterPtr parameter;
  std::string name = config_.bias_parameter_name();
  CHECK(mapGet(name, parameterMap, &parameter))
      << "Cannot find bias parameter " << name << " for layer "
      << getName();
  parameter->setDevice(id);
}
};
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Pool3DLayer.h"
#include "PoolProjectionLayer.h"
#include "paddle/utils/Logging.h"
namespace paddle {
REGISTER_LAYER(pool3d, Pool3DLayer);
/**
 * Initialize the layer from its single pooling input configuration.
 *
 * Reads the pool type, channel count, kernel sizes, strides, image sizes,
 * paddings and output sizes for the W/H/D dimensions out of
 * config_.inputs(0).pool_conf().
 *
 * @return false when Layer::init fails, true otherwise.
 */
bool Pool3DLayer::init(const LayerMap& layerMap,
                       const ParameterMap& parameterMap) {
  // Propagate a base-class failure instead of silently ignoring it.
  if (!Layer::init(layerMap, parameterMap)) {
    return false;
  }

  /* the size of inputs for pool-layer is 1 */
  CHECK_EQ(config_.inputs_size(), 1);

  const PoolConfig& conf = config_.inputs(0).pool_conf();
  poolType_ = conf.pool_type();
  channels_ = conf.channels();

  // kernel size
  sizeX_ = conf.size_x();
  sizeY_ = conf.size_y();
  sizeZ_ = conf.size_z();

  // stride
  strideW_ = conf.stride();
  strideH_ = conf.stride_y();
  strideD_ = conf.stride_z();

  // input image size
  imgSizeW_ = conf.img_size();
  imgSizeH_ = conf.img_size_y();
  imgSizeD_ = conf.img_size_z();

  // padding
  paddingW_ = conf.padding();
  paddingH_ = conf.padding_y();
  paddingD_ = conf.padding_z();

  // output size
  outputW_ = conf.output_x();
  outputH_ = conf.output_y();
  outputD_ = conf.output_z();
  return true;
}
/**
 * Recompute the output depth/height/width from the image size, kernel size,
 * padding and stride, record them as the frame size of the output, and
 * return the resulting layer size (depth * height * width * channels).
 */
size_t Pool3DLayer::getSize() {
  CHECK_EQ(inputLayers_.size(), 1UL);

  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);

  getOutput().setFrameHeight(outputH_);
  getOutput().setFrameWidth(outputW_);
  getOutput().setFrameDepth(outputD_);

  const size_t total = outputD_ * outputH_ * outputW_ * channels_;
  return total;
}
/**
 * Forward pass: resize the output (and the max-pool index buffer) to
 * batch x getSize(), run the 3D average or max pooling selected by
 * poolType_, then apply the activation. An unknown pool type is fatal.
 */
void Pool3DLayer::forward(PassType passType) {
  Layer::forward(passType);

  const MatrixPtr& input = inputLayers_[0]->getOutputValue();
  const size_t numSamples = input->getHeight();
  const size_t outputWidth = getSize();
  resetOutput(numSamples, outputWidth);
  Matrix::resizeOrCreate(maxPoolIdx_, numSamples, outputWidth, false, useGpu_);
  const MatrixPtr output = getOutputValue();

  if (poolType_ == "max") {
    // maxPoolIdx_ is handed to the kernel alongside the input
    output->maxPool3DForward(*input,
                             *maxPoolIdx_,
                             channels_,
                             imgSizeD_,
                             imgSizeH_,
                             imgSizeW_,
                             outputD_,
                             outputH_,
                             outputW_,
                             sizeZ_,
                             sizeY_,
                             sizeX_,
                             strideD_,
                             strideH_,
                             strideW_,
                             paddingD_,
                             paddingH_,
                             paddingW_);
  } else if (poolType_ == "avg") {
    output->avgPool3DForward(*input,
                             channels_,
                             imgSizeD_,
                             imgSizeH_,
                             imgSizeW_,
                             outputD_,
                             outputH_,
                             outputW_,
                             sizeZ_,
                             sizeY_,
                             sizeX_,
                             strideD_,
                             strideH_,
                             strideW_,
                             paddingD_,
                             paddingH_,
                             paddingW_);
  } else {
    LOG(FATAL) << "Unknown pool type: " << poolType_;
  }

  forwardActivation();
}
void Pool3DLayer::backward(const UpdateCallback& callback) {
backwardActivation();
(void)callback;
if (NULL == getInputGrad(0)) return;
MatrixPtr inMat = inputLayers_[0]->getOutputValue();
MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
MatrixPtr outMat = getOutputValue();
MatrixPtr outGradMat = getOutputGrad();
if (poolType_ == "avg") {
inGradMat->avgPool3DBackward(*outGradMat,
imgSizeD_,
imgSizeH_,
imgSizeW_,
outputD_,
outputH_,
outputW_,
sizeZ_,
sizeY_,
sizeZ_,
strideD_,
strideH_,
strideW_,
paddingD_,
paddingH_,
paddingW_,
1.0,
1.0);
} else if (poolType_ == "max") {
inGradMat->maxPool3DBackward(*outGradMat,
*maxPoolIdx_,
imgSizeD_,
imgSizeH_,
imgSizeW_,
outputD_,
outputH_,
outputW_,
sizeZ_,
sizeY_,
sizeZ_,
strideD_,
strideH_,
strideW_,
paddingD_,
paddingH_,
paddingW_,
1.0,
1.0);
} else {
LOG(FATAL) << "Unknown pool type: " << poolType_;
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "Layer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
 * @brief 3D pooling layer.
 * Pools its single input over depth, height and width, using either
 * average ("avg") or max ("max") pooling as selected by the pool
 * configuration.
 */
class Pool3DLayer : public Layer {
public:
  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
  ~Pool3DLayer() {}

  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
  // Recomputes the output sizes and returns the layer size.
  size_t getSize();

protected:
  // number of channels
  int channels_;
  // pooling kernel size along width (X), height (Y) and depth (Z)
  int sizeX_;
  int sizeY_;
  int sizeZ_;
  // stride along width, height and depth
  int strideW_;
  int strideH_;
  int strideD_;
  // padding along width, height and depth
  int paddingW_;
  int paddingH_;
  int paddingD_;
  // input image size along width, height and depth
  int imgSizeW_;
  int imgSizeH_;
  int imgSizeD_;
  // output size along width, height and depth
  int outputW_;
  int outputH_;
  int outputD_;
  // pooling type, "avg" or "max"
  std::string poolType_;
  // index buffer passed to the max pooling forward and backward kernels
  MatrixPtr maxPoolIdx_;
};
} // namespace paddle
......@@ -48,7 +48,16 @@ public:
<< inputLayers_.size() << ") at " << getName();
}
s << format.substr(pos);
LOG(INFO) << s.str();
const std::string delimiter("\n");
std::string content = s.str();
std::string::size_type foundPos = 0;
std::string::size_type prevPos = 0;
while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
prevPos = foundPos + delimiter.size();
}
LOG(INFO) << content.substr(prevPos);
}
void backward(const UpdateCallback& callback) override {}
......
......@@ -34,6 +34,13 @@ add_unittest_without_exec(test_CRFLayerGrad
add_test(NAME test_CRFLayerGrad
COMMAND test_CRFLayerGrad)
################ test_CrossEntropyOverBeam ####################
# Build the test binary from its gradient test source plus the shared
# LayerGradUtil.cpp helper, then register it with CTest under the same name.
# NOTE(review): add_unittest_without_exec is a project macro; presumably it
# only builds the target and leaves registration to add_test -- confirm.
add_unittest_without_exec(test_CrossEntropyOverBeam
test_CrossEntropyOverBeamGrad.cpp
LayerGradUtil.cpp)
add_test(NAME test_CrossEntropyOverBeam
COMMAND test_CrossEntropyOverBeam)
################ test_SeqSliceLayerGrad ####################
add_unittest_without_exec(test_SeqSliceLayerGrad
test_SeqSliceLayerGrad.cpp
......
此差异已折叠。
......@@ -48,7 +48,13 @@ public:
*/
// Allocate `size` bytes of aligned CPU memory with posix_memalign and return
// the pointer. The alignment is 4096 bytes when PADDLE_USE_MKLDNN is defined
// and 32 bytes otherwise. Aborts via CHECK when posix_memalign reports an
// error or when the resulting pointer is null.
// NOTE(review): ptr is not initialized before the call, and a zero `size`
// may legitimately yield a null pointer from posix_memalign, which would
// trip CHECK(ptr) -- confirm that callers never pass 0.
virtual void* alloc(size_t size) {
void* ptr;
#ifdef PADDLE_USE_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
#else
// default alignment for non-MKLDNN builds
CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
#endif
CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
return ptr;
}
......
......@@ -14,6 +14,17 @@
#
file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp)
# MKLDNNMatrix is only part of the math library in MKLDNN builds; without
# WITH_MKLDNN its header and source are dropped from the globbed file lists.
if(WITH_MKLDNN)
  message(STATUS "Compile with MKLDNNMatrix")
else()
  set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
  set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
  list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
  list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
  message(STATUS "Skip compiling with MKLDNNMatrix")
endif()
set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
文件模式从 100755 更改为 100644
此差异已折叠。
文件模式从 100755 更改为 100644
......@@ -17,3 +17,4 @@ from paddle.trainer.config_parser import parse_config_and_serialize
if __name__ == '__main__':
parse_config_and_serialize(
'trainer_config_helpers/tests/layers_test_config.py', '')
# layers_test_config.py
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册