提交 0cb3906d 编写于 作者: L Liu Yiqun

Merge branch 'develop' into build_ios

...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
- id: clang-format-with-version-check - id: clang-format-with-version-check
name: clang-format name: clang-format
description: Format files with ClangFormat. description: Format files with ClangFormat.
entry: ./.clang_format.hook -i entry: bash ./.clang_format.hook -i
language: system language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
- repo: https://github.com/PaddlePaddle/pre-commit-golang - repo: https://github.com/PaddlePaddle/pre-commit-golang
......
...@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) ...@@ -55,6 +55,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON) option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
......
...@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub ...@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
ARG WITH_GPU ARG WITH_GPU
ARG WITH_AVX ARG WITH_AVX
ARG WITH_DOC ARG WITH_DOC
ARG WITH_STYLE_CHECK
ENV WOBOQ OFF ENV WOBOQ OFF
ENV WITH_GPU=${WITH_GPU:-OFF} ENV WITH_GPU=${WITH_GPU:-ON}
ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-OFF} ENV WITH_DOC=${WITH_DOC:-OFF}
ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
ENV HOME /root ENV HOME /root
# Add bash enhancements # Add bash enhancements
......
...@@ -28,6 +28,10 @@ if(NOT WITH_TIMER) ...@@ -28,6 +28,10 @@ if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER) add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER) endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(NOT WITH_PROFILER) if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER) endif(NOT WITH_PROFILER)
......
...@@ -51,7 +51,7 @@ ExternalProject_Add( ...@@ -51,7 +51,7 @@ ExternalProject_Add(
${EXTERNAL_PROJECT_LOG_ARGS} ${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS} DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9" GIT_TAG "v0.10"
PREFIX ${MKLDNN_SOURCES_DIR} PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND "" UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
......
...@@ -28,7 +28,7 @@ INCLUDE(ExternalProject) ...@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml") SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170720") SET(MKLML_VER "mklml_lnx_2018.0.20170720")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz") SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
......
关于PaddlePaddle
================
PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台,兼备易用性、高效性、灵活性和可扩展性,目前已被百度内部多个产品线广泛使用。
PaddlePaddle目前已经开放源码, 但是远未完善,我们希望能在这个基础上不断的改进、扩展和延伸。
同时我们希望广大开发者积极提供反馈和贡献源代码,建立一个活跃的开源社区。
致谢
--------
在此,特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)
ABOUT
=======
PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended.
We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
Credits
--------
We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
...@@ -257,6 +257,11 @@ seq_concat ...@@ -257,6 +257,11 @@ seq_concat
.. autoclass:: paddle.v2.layer.seq_concat .. autoclass:: paddle.v2.layer.seq_concat
:noindex: :noindex:
seq_slice
---------
.. autoclass:: paddle.v2.layer.seq_slice
:noindex:
kmax_sequence_score kmax_sequence_score
------------------- -------------------
.. autoclass:: paddle.v2.layer.kmax_sequence_score .. autoclass:: paddle.v2.layer.kmax_sequence_score
...@@ -362,6 +367,11 @@ trans ...@@ -362,6 +367,11 @@ trans
.. autoclass:: paddle.v2.layer.trans .. autoclass:: paddle.v2.layer.trans
:noindex: :noindex:
scale_shift
-----------
.. autoclass:: paddle.v2.layer.scale_shift
:noindex:
Sampling Layers Sampling Layers
=============== ===============
...@@ -409,9 +419,14 @@ multi_binary_label_cross_entropy_cost ...@@ -409,9 +419,14 @@ multi_binary_label_cross_entropy_cost
.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
:noindex: :noindex:
huber_cost huber_regression_cost
---------- -------------------------
.. autoclass:: paddle.v2.layer.huber_cost .. autoclass:: paddle.v2.layer.huber_regression_cost
:noindex:
huber_classification_cost
-------------------------
.. autoclass:: paddle.v2.layer.huber_classification_cost
:noindex: :noindex:
lambda_cost lambda_cost
......
...@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below: ...@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
<img src="src/paddle-task-states.png"/> <img src="src/paddle-task-states.png"/>
1. When a new pass of training starts, all tasks will be placed in the todo queue. 1. When a new pass of training starts, all tasks will be placed in the todo queue.
1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion. 1. Upon trainer requests for new task, the master server will dispatch a task from todo queue to it, put the task in the pending queue and wait for completion.
1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer. 1. The trainer will work on its task and tell the master server once the task is completed and ask for new task. The master server will dispatch a new task to that trainer.
1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. 1. If a task fails for any reason in trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, then it will be discarded.
1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero. 1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
### Trainer Process ### Trainer Process
The trainer process will: The trainer process will:
- Receive tasks from the master. - Request tasks from the master.
- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. - Work on the tasks
- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
### Parameter Server Process ### Parameter Server Process
...@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at ...@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them. 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers. 1. Write its ip address to */master/addr* so that trainers can discover it.
1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update. 1. Listens to trainers' request of task, dispatch one upon request, and updates task queue using an etcd transaction to ensure lock is held during the update.
When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
...@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i ...@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
When the trainer is started by the Kubernetes, it executes the following steps at startup: When the trainer is started by the Kubernetes, it executes the following steps at startup:
1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count. 1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*.
1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline. 1. Finds and watches */master/addr* to get master's address.
1. Waits for tasks from the master to start training. 1. Requests for tasks from the master to start training.
If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again. When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from master and go on training.
When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
### Parameter Server Process ### Parameter Server Process
......
# 编译PaddlePaddle和运行单元测试
## 需要的软硬件
为了开发PaddlePaddle,我们需要
1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及
1. Docker。
不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。
## 总体流程
1. 获取源码
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. 安装开发工具到 Docker image 里
```bash
cd paddle; docker build -t paddle:dev .
```
请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。
3. 编译
以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake``make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。
```bash
docker run --rm -v $PWD:/paddle paddle:dev
```
上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用
```bash
docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. 运行单元测试
用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试:
```bash
NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,那么我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要:
```bash
docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以
```bash
nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. 清理
有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要:
```bash
rm -rf build
```
## 为什么要 Docker 呀?
- 什么是 Docker?
如果您没有听说 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
- Docker 还是虚拟机?
有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
- 为什么用 Docker?
把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以复现问题以便帮助。
另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
- 我可以选择不用Docker吗?
当然可以。大家可以用把开发工具安装进入 Docker image 一样的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
- 学习 Docker 有多难?
理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具,以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
- 我可以用 IDE 吗?
当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
就可以按 `Ctrl-C``c` 键来启动编译了。
- 可以并行编译吗?
是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
## 可能碰到的问题
- Docker 需要 sudo
如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目在努力开始支持其他不需要 sudo 的集装箱技术,比如 rkt。
- 在 Windows/MacOS 上编译很慢
Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)
- 磁盘不够
本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
# Build PaddlePaddle from Source Code and Run Unit Test
## What Developers Need
To contribute to PaddlePaddle, you need
1. A computer -- Linux, BSD, Windows, MacOS, and
1. Docker.
Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image.
## General Process
1. Retrieve source code.
```bash
git clone https://github.com/paddlepaddle/paddle
```
2. Install build tools into a Docker image.
```bash
cd paddle; docker build -t paddle:dev .
```
Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it.
3. Build from source.
This following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockefile. `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which had been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer.
```bash
docker run -v $PWD:/paddle paddle:dev
```
Above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type
```bash
docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
```
4. Run unit tests.
To run all unit tests using the first GPU of a node:
```bash
NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run
```bash
docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
```
Sometimes we want to run a specific unit test, say `memory_test`, we can run
```bash
nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
```
5. Clean Build.
Sometimes, we might want to clean all thirt-party dependents and built binaries. To do so, just
```bash
rm -rf build
```
## Docker, Or Not?
- What is Docker?
If you haven't heard of it, consider it something like Python's virtualenv.
- Docker or virtual machine?
Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor running a guest OS, which means there is no compromise on the performance.
- Why Docker?
Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
- Can I choose not to use Docker?
Sure, you don't have to install build tools into a Docker image; instead, you can install them in your local computer. This document exists because Docker would make the development way easier.
- How difficult is it to learn Docker?
It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools. Not even to mention the time saved when other people trying to reproduce the issue you have.
- Can I use my favorite IDE?
Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like.
Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configure file:
```emacs
(global-set-key "\C-cc" 'compile)
(setq compile-command
"docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
```
so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
- Does Docker do parallel building?
Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
## Some Gotchas
- Docker requires sudo
An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo.
- Docker on Windows/MacOS builds slowly
On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs so to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
- Not enough disk space
Examples in this article uses option `--rm` with the `docker run` command. This option ensures that stopped containers do not exist on hard disks. We can use `docker ps -a` to list all containers, including stopped. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
# 如何写新的Operator
- [概念简介](#概念简介)
- [实现C++类](#实现C++类)
- [定义ProtoMaker类](#定义ProtoMaker类)
- [定义Operator类](#定义Operator类)
- [定义OpKernel类](#定义OpKernel类)
- [注册Operator](#注册Operator)
- [编译](#编译)
- [绑定Python](#绑定Python)
- [实现单元测试](#实现单元测试)
- [前向Operator单测](#前向Operator单测)
- [反向Operator单测](#反向Operator单测)
- [编译和执行](#编译和执行)
## 概念简介
简单介绍需要用到基类,详细介绍请参考设计文档。
- `framework::OperatorBase`: Operator(简写,Op)基类。
- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
依据是否包含kernel,将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorBase`,后者继承自`OperatorWithKernel`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
内容 | 定义位置
-------------- | :----------------------
OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
Op定义 | `.cc`文件
Kernel实现 | CPU、GPU共享Kernel在`.h`文件,否则,CPU可以在`.cc`文件,GPU可在`.cu`文件。
注册Op | Op注册在`.cc`文件;Kernel注册CPU在`.cc`文件,GPU在`.cu`文件
下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
## 实现C++类
### 1. 定义ProtoMaker类
矩阵乘的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。首先定义`ProtoMaker`来描述该Op的输入、输出及注释:
```
class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of mul op");
AddInput("Y", "The second input of mul op");
AddOutput("Out", "The output of mul op");
AddComment(R"DOC(
Two Element Mul Operator.
The equation is: Out = X * Y
)DOC");
}
};
```
[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数包括2个:
- `framework::OpProto` : 前者存储Op的输入输出和参数属性,将用于Python API接口的生成。
- `framework::OpAttrChecker` :后者用于检查参数属性的合法性。
构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加该Op的注释,这些函数会将对应内容添加到`OpProto`中。
`MulOp`中添加两个输入`X``Y`,添加了一个输出`Out`,并解释了各自含义,该命名尽可能的规范。
再举个[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)的例子:
```
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
AddComment(R"DOC(Scale operator
The equation is: Out = scale*X
)DOC");
AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
}
};
```
在这个例子里,两处不同:
- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中。
- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
### 2. 定义Operator类
```c++
class MulOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto dim0 = ctx.Input<Tensor>("X")->dims();
auto dim1 = ctx.Input<Tensor>("Y")->dims();
PADDLE_ENFORCE_EQ(dim0.size(), 2,
"input X(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("X"));
PADDLE_ENFORCE_EQ(dim1.size(), 2,
"input Y(%s) should be a tensor with 2 dims, a matrix",
ctx.op_.Input("Y"));
PADDLE_ENFORCE_EQ(
dim0[1], dim1[0],
"First matrix's width must be equal with second matrix's height.");
ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
}
};
```
[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel``public`成员:
```c++
using framework::OperatorWithKernel::OperatorWithKernel;
```
这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
```c++
MulOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
```
还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
- 1). 做检查, 尽早报错:检查输入数据维度、类型等是否合法。
- 2). 设置输出Tensor的形状。
通常`OpProtoMaker``Op`类的定义写在`.cc`文件中,和要讲到的注册函数一起放在`.cc`
### 3. 定义OpKernel类
```C++
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X");
auto* Y = context.Input<Tensor>("Y");
auto* Z = context.Output<Tensor>("Out");
Z->mutable_data<T>(context.GetPlace());
auto* device_context =
const_cast<platform::DeviceContext*>(context.device_context_);
math::matmul<Place, T>(*X, false, *Y, false, 1, Z, 0, device_context);
}
};
```
`MulKernel`继承自`framework::OpKernel`,带有模板参数:
- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
- `typename T` : 表示数据类型,如`float`, `double`等。
`MulKernel`需要重写`Compute`接口,该接口参数为`const framework::ExecutionContext& context`, `ExecutionContext`相比`InferShapeContext`增加了设备类型,同样可获取到输入输出和属性参数,`Compute`函数里写具体实现时。
注意,不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。`MulOp`的CPU、GPU实现共享同一个`Kernel``OpKernel`不共享的例子可以参考[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)
到此前向Op实现完成,需要在`.cc`文件中注册该op和kernel。反向Op类的定义和Kernel定义与前向Op类似,这里不再重复。但注意,反向Op没有`ProtoMaker`
### 4. 注册Operator
`.cc`文件中注册前向、反向Op类,注册CPU Kernel。
```c++
namespace ops = paddle::operators;
REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::CPUPlace, float>);
```
- `REGISTER_OP` : 注册`ops::MulOp`类,类型名为`mul`,该类的`ProtoMaker``ops::MulOpMaker`,注册`ops::MulOpGrad`,类型名为`mul_grad`
- `REGISTER_OP_WITHOUT_GRADIENT` : 用于注册没有反向的Op。
- `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace``float`类型,同理,注册`ops::MulKernel`类。
`.cu`文件中注册GPU Kernel。
```c++
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(mul_grad,
ops::MulGradKernel<paddle::platform::GPUPlace, float>);
```
### 5. 编译
[paddle/operators/CMakeLists.txt](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/CMakeLists.txt)文件中添加编译。
```
op_library(mul_op SRCS mul_op.cc mul_op.cu DEPS math_function)
```
下面命令可以编译:
```
make mul_op
```
## 绑定Python
- 绑定Python
[`paddle/pybind/pybind.cc
`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc)文件中添加该类:
```
USE_OP(mul);
```
如果只实现了CPU版本,则使用`USE_CPU_ONLY_OP`:
```
USE_CPU_ONLY_OP(gather);
```
使用`USE_OP`告知编译器需要链接该Op的目标文件,具体解释参考[代码注释](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h#L81)。
- 生成库
[`paddle/pybind/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/CMakeLists.txt)文件添加类到`DEPS`中,使得该Op可以链接到生成的lib库中。
```
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
mul_op
minus_op)
endif(WITH_PYTHON)
```
## 实现单元测试
单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单测](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)
### 前向Operator单测
前向Op单测继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`,具体单测流程在`OpTestMeta`里完成。需在`setUp`函数定义输入输出和属性参数,以及Python对比的输出值。
```
import unittest
import numpy as np
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta
class TestMulOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "mul"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
```
首先需要`import`必要的包,下面详细解释其他值:
- `self.type = "mul" ` : 定义类型,和注册的类型一致。
- `self.inputs` : 定义输入,类型为Numpy.array,并初始化。
- `self.outputs` : 定义输出,并得到Python结算结果。
### 反向Operator单测
反向Op单测继承自`GradientChecker`,而`GradientChecker`集成自`unittest.TestCase`,所以反向单测函数需要`test_`开头。
```
class MulGradOpTest(GradientChecker):
def test_mul(self):
op = create_op("mul")
inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32")
}
self.compare_grad(op, inputs)
# mul op will enlarge the relative error
self.check_grad(
op, inputs, set(["X", "Y"]), "Out", max_relative_error=0.5)
```
- 调用`create_op("mul")`创建反向Op对应的前向Op。
- 定义输入`inputs`
- 调用`compare_grad`函数对比CPU、GPU计算结果。
- 调用`check_grad`检查梯度稳定性,这里采用数值法检测梯度正确性。
- 第一个参数`op` : 前向op。
- 第二个参数`inputs` : 输入词典,词典的Key和`ProtoMaker`定义保持一致。
- 第三个参数`set(["X", "Y"])` : 指定对输入变量`X``Y`做梯度检测。
- 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
### 编译和执行
单测完成之后,在[`python/paddle/v2/framework/tests/CMakeLists.txt`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/CMakeLists.txt)里添加编译:
```
py_test(test_mul_op SRCS test_mul_op.py)
```
编译时需要打开`WITH_TESTING`, 即 `cmake paddle_dir -DWITH_TESTING=ON`,编译成功之后执行单测命令为:
```
make test ARGS="-R test_mul_op -V"
```
或者:
```
ctest -R test_mul_op
```
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_cn.rst
dev/write_docs_cn.rst dev/write_docs_cn.rst
dev/contribute_to_paddle_cn.md dev/contribute_to_paddle_cn.md
......
...@@ -18,6 +18,7 @@ Development ...@@ -18,6 +18,7 @@ Development
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
dev/build_en.rst
dev/new_layer_en.rst dev/new_layer_en.rst
dev/contribute_to_paddle_en.md dev/contribute_to_paddle_en.md
......
...@@ -7,4 +7,3 @@ PaddlePaddle Documentation ...@@ -7,4 +7,3 @@ PaddlePaddle Documentation
getstarted/index_en.rst getstarted/index_en.rst
howto/index_en.rst howto/index_en.rst
api/index_en.rst api/index_en.rst
about/index_en.rst
...@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error { ...@@ -63,13 +63,24 @@ func WithAddr(addr string) func(c *Client) error {
// WithEtcd sets the client to use etcd for master discovery. // WithEtcd sets the client to use etcd for master discovery.
func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error { func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
return func(c *Client) error { return func(c *Client) error {
cli, err := clientv3.New(clientv3.Config{ var cli *clientv3.Client
Endpoints: endpoints, f := func() error {
DialTimeout: timeout, var err error
}) cli, err = clientv3.New(clientv3.Config{
if err != nil { Endpoints: endpoints,
DialTimeout: timeout,
})
return err return err
} }
for {
err := f()
if err != nil {
log.Warningln(err)
} else {
break
}
time.Sleep(time.Second)
}
ch := make(chan string, 1) ch := make(chan string, 1)
a, err := GetKey(cli, DefaultAddrPath, timeout) a, err := GetKey(cli, DefaultAddrPath, timeout)
...@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) { ...@@ -101,9 +112,6 @@ func NewClient(opts ...func(*Client) error) (*Client, error) {
} }
} }
c.ch = make(chan record, c.bufSize) c.ch = make(chan record, c.bufSize)
// FIXME: connection is created asyncrosly in monitorMaster go routine,
// ensure the connection is ready for use before calling c.addClient.
time.Sleep(time.Second)
return c, nil return c, nil
} }
......
...@@ -19,6 +19,7 @@ if(Boost_FOUND) ...@@ -19,6 +19,7 @@ if(Boost_FOUND)
add_subdirectory(platform) add_subdirectory(platform)
add_subdirectory(framework) add_subdirectory(framework)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(pybind)
endif() endif()
if(WITH_C_API) if(WITH_C_API)
......
...@@ -28,7 +28,6 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} ...@@ -28,7 +28,6 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
add_dependencies(paddle_capi paddle_proto) add_dependencies(paddle_capi paddle_proto)
# combine all paddle static libraries together, into libpaddle_capi_whole.a # combine all paddle static libraries together, into libpaddle_capi_whole.a
# user should use PaddleCAPI as -lpaddle_capi_whole # user should use PaddleCAPI as -lpaddle_capi_whole
set(PADDLE_INFER_LIBS set(PADDLE_INFER_LIBS
...@@ -47,7 +46,10 @@ cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_INFER_LIBS}) ...@@ -47,7 +46,10 @@ cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_INFER_LIBS})
# No shared library for iOS # No shared library for iOS
if(NOT IOS) if(NOT IOS)
set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
# TODO: merge mkl into paddle_capi_shared
add_library(paddle_capi_shared SHARED ${CAPI_SOURCES}) add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
link_paddle_exe(paddle_capi_shared) link_paddle_exe(paddle_capi_shared)
endif() endif()
......
{
global:
paddle_*;
local:
*;
};
...@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo, int* convBwdDataAlgo,
size_t* bwdDataLimitBytes, size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo, int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes); size_t* bwdFilterLimitBytes,
bool useDilation);
/** /**
* @brief destroy filter descriptor. * @brief destroy filter descriptor.
...@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, ...@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width); int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/** /**
* @brief reset convolution descriptor. * @brief reset convolution descriptor.
...@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width); int stride_width,
int dilation_h = 1,
int dilation_w = 1);
/** /**
* @brief destroy convolution descriptor. * @brief destroy convolution descriptor.
......
...@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, ...@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) {} int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
hl_tensor_descriptor image, hl_tensor_descriptor image,
...@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) {} int stride_width,
int dilation_h,
int dilation_w) {}
inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {} inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
...@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo, int* convBwdDataAlgo,
size_t* bwdDataLimitBytes, size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo, int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) {} size_t* bwdFilterLimitBytes,
bool useDilation) {}
inline void hl_convolution_forward(hl_tensor_descriptor input, inline void hl_convolution_forward(hl_tensor_descriptor input,
real* input_data, real* input_data,
......
...@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
int* convBwdDataAlgo, int* convBwdDataAlgo,
size_t* bwdDataLimitBytes, size_t* bwdDataLimitBytes,
int* convBwdFilterAlgo, int* convBwdFilterAlgo,
size_t* bwdFilterLimitBytes) { size_t* bwdFilterLimitBytes,
bool useDilation) {
#if CUDNN_VERSION >= 4000 #if CUDNN_VERSION >= 4000
CHECK_NOTNULL(input); CHECK_NOTNULL(input);
...@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
size_t memoryLimitBytes = size_t memoryLimitBytes =
(1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// For dilation
int algo = 0;
// cudnn convolution forward configuration // cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( if (useDilation) {
t_resource.cudnn_handle, convFwdAlgo = &algo;
fwd_src_desc, convBwdDataAlgo = &algo;
fwd_filter_desc, convBwdFilterAlgo = &algo;
fwd_conv_desc, } else {
fwd_dest_desc, CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, t_resource.cudnn_handle,
memoryLimitBytes, fwd_src_desc,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo))); fwd_filter_desc,
fwd_conv_desc,
fwd_dest_desc,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
}
CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
t_resource.cudnn_handle, t_resource.cudnn_handle,
...@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo), static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
fwdLimitBytes)); fwdLimitBytes));
// cudnn convolution backward data configuration
cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnConvolutionDescriptor_t bwd_data_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
t_resource.cudnn_handle,
bwd_data_filter_desc,
bwd_data_diff_desc,
bwd_data_conv_desc,
bwd_data_grad_desc,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
t_resource.cudnn_handle, t_resource.cudnn_handle,
bwd_data_filter_desc, bwd_data_filter_desc,
...@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input, ...@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo), static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
bwdDataLimitBytes)); bwdDataLimitBytes));
// cudnn convolution backward filter configuration
cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
GET_CONVOLUTION_DESCRIPTOR(conv);
cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
t_resource.cudnn_handle,
bwd_filter_src_desc,
bwd_filter_diff_desc,
bwd_filter_conv_desc,
bwd_filter_grad_desc,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
memoryLimitBytes,
reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
t_resource.cudnn_handle, t_resource.cudnn_handle,
bwd_filter_src_desc, bwd_filter_src_desc,
...@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, ...@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) { int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc( cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
...@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, ...@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
padding_width, padding_width,
stride_height, stride_height,
stride_width, stride_width,
1, dilation_h,
1, dilation_w,
mode, mode,
data_type)); data_type));
#else #else
if (dilation_h > 1 || dilation_w > 1) {
LOG(FATAL)
<< "Current cuDNN version does't support for dilation convolution. "
<< "The dilation convolution requires cuDNN >= v6.0.";
}
CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
padding_height, padding_height,
padding_width, padding_width,
stride_height, stride_height,
stride_width, stride_width,
1, dilation_h,
1, dilation_w,
mode)); mode));
#endif #endif
...@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
int padding_height, int padding_height,
int padding_width, int padding_width,
int stride_height, int stride_height,
int stride_width) { int stride_width,
int dilation_h,
int dilation_w) {
CHECK_NOTNULL(conv); CHECK_NOTNULL(conv);
CHECK_NOTNULL(image); CHECK_NOTNULL(image);
CHECK_NOTNULL(filter); CHECK_NOTNULL(filter);
...@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width, padding_width,
stride_height, stride_height,
stride_width, stride_width,
1, dilation_h,
1, dilation_w,
mode, mode,
data_type)); data_type));
#else #else
...@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, ...@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
padding_width, padding_width,
stride_height, stride_height,
stride_width, stride_width,
1, dilation_h,
1, dilation_w,
mode)); mode));
#endif #endif
......
...@@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) ...@@ -18,8 +18,8 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(operator SRCS operator.cc DEPS framework_proto device_context tensor scope attribute) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator) cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
...@@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD ...@@ -39,21 +39,3 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
cc_library(backward SRCS backward.cc DEPS net_op) cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
sgd_op
add_op
mul_op
rowwise_add_op
sigmoid_op
softmax_op
mean_op
cross_entropy_op
recurrent_op
uniform_random_op
gaussian_random_op
fill_zeros_like_op)
endif(WITH_PYTHON)
...@@ -110,7 +110,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive( ...@@ -110,7 +110,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
dup_output_ops[out].emplace_back(local_op_id); dup_output_ops[out].emplace_back(local_op_id);
return false; return false;
}); });
net->AddOp(std::move(bwd)); net->AppendOp(std::move(bwd));
} }
// Get unique ID for this method. // Get unique ID for this method.
auto uid = uniq_id++; auto uid = uniq_id++;
...@@ -124,6 +124,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive( ...@@ -124,6 +124,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
std::list<Pos> insert_position; std::list<Pos> insert_position;
for (auto& dup_output_op : dup_output_ops) { for (auto& dup_output_op : dup_output_ops) {
const std::string& name = dup_output_op.first; const std::string& name = dup_output_op.first;
// duplicate @Empty@ don't need to be added
if (name == kEmptyVarName) continue;
auto& dup_op = dup_output_op.second; auto& dup_op = dup_output_op.second;
// no duplicate output // no duplicate output
if (dup_op.size() == 1) continue; if (dup_op.size() == 1) continue;
...@@ -163,8 +166,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive( ...@@ -163,8 +166,9 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// If part of input gradient of that operator is not calculated, fill // If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient. // zero variables to that input gradient.
net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {{"Src", {prefix}}}, net->AppendOp(OpRegistry::CreateOp("fill_zeros_like",
{{"Dst", {grad_input}}}, {})); {{"Src", {prefix}}},
{{"Dst", {grad_input}}}, {}));
} }
return false; return false;
}); });
...@@ -195,7 +199,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive( ...@@ -195,7 +199,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
if (net->ops_.empty()) { // Current no aux op is added to network if (net->ops_.empty()) { // Current no aux op is added to network
return grad_op; return grad_op;
} }
net->AddOp(std::move(grad_op)); net->AppendOp(std::move(grad_op));
} }
net->SetType("@GENERATED_BACKWARD@"); net->SetType("@GENERATED_BACKWARD@");
net->CompleteAddOp(); net->CompleteAddOp();
...@@ -208,7 +212,7 @@ std::unique_ptr<OperatorBase> Backward( ...@@ -208,7 +212,7 @@ std::unique_ptr<OperatorBase> Backward(
const OperatorBase& forwardOp, const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars) { const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_names; std::unordered_set<std::string> no_grad_names;
no_grad_names.reserve(no_grad_vars.size()); no_grad_names.reserve(no_grad_vars.size() + 1);
no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix); no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
......
## Operator/expression 's Backward # Operator/expression 's Backward
### Motivation ## Motivation
In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass. In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation graph, the operator/expression's backward pass will be generated respect to forward pass.
## Backward Operator Registry
### Implement : gradient operator registry A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs and output gradients and then calculate its input gradients.
| | forward operator | backward operator | | | forward operator | backward operator
| ---------------------- | ---------------- | -------------------------------- | | ---------------------- | ---------------- |------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients | | **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients | | **Operator::outputs_** | Outputs | InputGradients |
Inputs/Outputs means the input/output of the operator, InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute. In most cases, there is a one-to-one correspondence between forward and backward operators. These correspondences are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and make operators pluggable, the registry mechanism is introduced.
We use a global hash map record the gradient operators available, follow the philosophy of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. For example, we have got a `mul_op`, and we can register it's information and corresponding backward operator by the following macro:
grad_op_builder(fengjiayi) ```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
```
### Implement : Backward network `mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
## Backward Opeartor Creating
Given a certain forward operator, we can get its corresponding backward opeartor by calling:
```cpp
OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
```
The function `BuildGradOp` will sequentially execute following processes:
1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporary storage backward operator's inputs and outputs. Copy forward operator's `inputs_` and `outputs_` to map `inputs`, except these are not necessary for gradient computing.
3. Add forward inputs' gradient variables into map `output`, adding forward outputs' gradient variables into map `input`.
4. Building backward operator with `inputs`, `outputs` and forward operator's attributes.
## Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and put them together.
In our design, the network itself is also a kind of operator. So the operators contained by a big network may be some small network.
given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`. given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
1. bla bla bla (yuyang) 1. Op
when the input forward network is a Op, return its gradient Operator Immediately.
2. NetOp 2. NetOp
when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name. when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to forward NetOp.
**shared variable**. As illustrated in the pictures, two operator's `Output` `Gradient` will overwirte their shared input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="70%" ><br/>
1. shared variable in two operators.
</p>
Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator replace the overwirte links.
<p align="center">
<img src="images/duplicate_op2.png" width="90%" ><br/>
We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable. 2. replace shared variable gradient with `Add` Operator
![./images/duplicate_op]() </p>
Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead.
![./images/duplicate_op2]()
​ Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it. ​ Then collect the sub graph `OutputGradients`/`InputGradients` as the NetOp's and return it.
...@@ -72,16 +72,16 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker { ...@@ -72,16 +72,16 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
class FcOp : public operators::NetOp { class FcOp : public operators::NetOp {
public: public:
FcOp(const std::string &type, const VarNameMap &inputs, FcOp(const std::string &type, const VariableNameMap &inputs,
const VarNameMap &outputs, const AttributeMap &attrs) const VariableNameMap &outputs, const AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) { : NetOp(type, inputs, outputs, attrs) {
AddOp(OpRegistry::CreateOp("mul", AppendOp(OpRegistry::CreateOp("mul",
{{"X", {Input("X")}}, {"Y", {Input("W")}}}, {{"X", {Input("X")}}, {"Y", {Input("W")}}},
{{"Out", {Output("mul_result")}}}, {})); {{"Out", {Output("mul_result")}}}, {}));
auto input_b = Inputs("b"); auto input_b = Inputs("b");
std::string before_act = "mul_result"; std::string before_act = "mul_result";
if (input_b.size() != 0) { if (input_b.size() != 0) {
AddOp(OpRegistry::CreateOp( AppendOp(OpRegistry::CreateOp(
"rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
{{"Out", {Output("add_result")}}}, {})); {{"Out", {Output("add_result")}}}, {}));
before_act = "add_result"; before_act = "add_result";
...@@ -92,8 +92,8 @@ class FcOp : public operators::NetOp { ...@@ -92,8 +92,8 @@ class FcOp : public operators::NetOp {
} }
} }
AddOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
{{"Out", {Output("Out")}}}, {})); {{"Out", {Output("Out")}}}, {}));
CompleteAddOp(false); CompleteAddOp(false);
} }
}; };
...@@ -234,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) { ...@@ -234,13 +234,13 @@ TEST(Backward, net_fc_backward_not_have_b) {
TEST(Backward, net_input_of_network_not_need_grad) { TEST(Backward, net_input_of_network_not_need_grad) {
ops::NetOp net; ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp( net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}}, "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_tmp_0"}}, {{"mul_result", {"mul_tmp_0"}},
{"add_result", {"add_tmp_0"}}, {"add_result", {"add_tmp_0"}},
{"Out", {"hidden0"}}}, {"Out", {"hidden0"}}},
{})); {}));
net.AddOp(f::OpRegistry::CreateOp( net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_tmp_1"}}, {{"mul_result", {"mul_tmp_1"}},
{"add_result", {"add_tmp_1"}}, {"add_result", {"add_tmp_1"}},
...@@ -273,10 +273,10 @@ TEST(Backward, net_input_of_network_not_need_grad) { ...@@ -273,10 +273,10 @@ TEST(Backward, net_input_of_network_not_need_grad) {
TEST(Backward, net_shared_weight) { TEST(Backward, net_shared_weight) {
ops::NetOp net; ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
{{"Out", {"out"}}}, {})); {{"Out", {"out"}}}, {}));
net.AddOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
{{"Out", {"FinalOut"}}}, {})); {{"Out", {"FinalOut"}}}, {}));
net.CompleteAddOp(); net.CompleteAddOp();
auto bwd = f::Backward(net, {}); auto bwd = f::Backward(net, {});
...@@ -357,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) { ...@@ -357,19 +357,19 @@ TEST(Backward, op_part_of_input_are_not_need) {
TEST(Backward, linear_net_intermediate_variable_has_no_grad) { TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
ops::NetOp net; ops::NetOp net;
net.AddOp(f::OpRegistry::CreateOp( net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}}, "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"mul_result", {"mul_out1"}}, {{"mul_result", {"mul_out1"}},
{"add_result", {"add_out1"}}, {"add_result", {"add_out1"}},
{"Out", {"out1"}}}, {"Out", {"out1"}}},
{})); {}));
net.AddOp(f::OpRegistry::CreateOp( net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"mul_result", {"mul_out2"}}, {{"mul_result", {"mul_out2"}},
{"add_result", {"tmp_out2"}}, {"add_result", {"tmp_out2"}},
{"Out", {"out2"}}}, {"Out", {"out2"}}},
{})); {}));
net.AddOp(f::OpRegistry::CreateOp( net.AppendOp(f::OpRegistry::CreateOp(
"fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
{{"mul_result", {"mul_out3"}}, {{"mul_result", {"mul_out3"}},
{"add_result", {"tmp_out3"}}, {"add_result", {"tmp_out3"}},
......
...@@ -20,13 +20,13 @@ namespace framework { ...@@ -20,13 +20,13 @@ namespace framework {
enum class OpArgType { IN, OUT }; enum class OpArgType { IN, OUT };
static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type, static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
bool is_grad, OperatorBase::VarNameMap* vars) { bool is_grad, VariableNameMap* vars) {
const auto& src_inout = const auto& src_inout =
src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs(); src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
auto& dst_inout = *vars; auto& dst_inout = *vars;
const OpProto* proto = OpRegistry::op_info_map().at(src_op->Type()).proto_; auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
const auto& src_arg_list = const auto& src_arg_list =
src_type == OpArgType::IN ? proto->inputs() : proto->outputs(); src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
for (const auto& arg : src_arg_list) { for (const auto& arg : src_arg_list) {
if (arg.not_in_gradient() && !is_grad) continue; if (arg.not_in_gradient() && !is_grad) continue;
const std::string src_name = arg.name(); const std::string src_name = arg.name();
...@@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type, ...@@ -40,26 +40,18 @@ static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
} }
OperatorBase* BuildGradOp(const OperatorBase* op) { OperatorBase* BuildGradOp(const OperatorBase* op) {
auto it = OpRegistry::op_info_map().find(op->Type()); auto& info = OpInfoMap::Instance().Get(op->Type());
PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), PADDLE_ENFORCE(info.HasGradientOp());
"'%s' has not been registered.", op->Type());
PADDLE_ENFORCE(it->second.proto_ != nullptr, "'%s' has no OpProto.",
op->Type());
std::string grad_op_type = it->second.grad_op_type_;
PADDLE_ENFORCE(!grad_op_type.empty(), "'%s' has no gradient operator.",
op->Type());
OperatorBase::VarNameMap inputs; VariableNameMap inputs;
OperatorBase::VarNameMap outputs; VariableNameMap outputs;
TransOpArg(op, OpArgType::IN, false, &inputs); // I TransOpArg(op, OpArgType::IN, false, &inputs); // I
TransOpArg(op, OpArgType::OUT, false, &inputs); // O TransOpArg(op, OpArgType::OUT, false, &inputs); // O
TransOpArg(op, OpArgType::OUT, true, &inputs); // OG TransOpArg(op, OpArgType::OUT, true, &inputs); // OG
TransOpArg(op, OpArgType::IN, true, &outputs); // IG TransOpArg(op, OpArgType::IN, true, &outputs); // IG
it = OpRegistry::op_info_map().find(grad_op_type); auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
PADDLE_ENFORCE(it != OpRegistry::op_info_map().end(), return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
"'%s' has not been registered.", grad_op_type);
return it->second.creator_(grad_op_type, inputs, outputs, op->Attrs());
} }
} // namespace framework } // namespace framework
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_info.h"
namespace paddle {
namespace framework {
static OpInfoMap* g_op_info_map = nullptr;
OpInfoMap& OpInfoMap::Instance() {
if (g_op_info_map == nullptr) {
g_op_info_map = new OpInfoMap();
}
return *g_op_info_map;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <map>
#include <string>
#include <unordered_map>
#include "paddle/framework/attribute.h"
namespace paddle {
namespace framework {
class OperatorBase;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
struct OpInfo {
OpCreator creator_;
std::string grad_op_type_;
OpProto* proto_;
OpAttrChecker* checker_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
}
const OpProto& Proto() const {
PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
PADDLE_ENFORCE(proto_->IsInitialized(),
"Operator Proto must be initialized in op info");
return *proto_;
}
const OpAttrChecker& Checker() const {
PADDLE_ENFORCE_NOT_NULL(checker_,
"Operator Checker has not been registered");
return *checker_;
}
const OpCreator& Creator() const {
PADDLE_ENFORCE_NOT_NULL(creator_,
"Operator Creator has not been registered");
return creator_;
}
bool HasGradientOp() const { return !grad_op_type_.empty(); }
};
class OpInfoMap {
public:
static OpInfoMap& Instance();
OpInfoMap(const OpInfoMap& o) = delete;
OpInfoMap(OpInfoMap&& o) = delete;
OpInfoMap& operator=(const OpInfoMap& o) = delete;
OpInfoMap& operator=(OpInfoMap&& o) = delete;
bool Has(const std::string& op_type) const {
return map_.find(op_type) != map_.end();
}
void Insert(const std::string& type, const OpInfo& info) {
PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
map_.insert({type, info});
}
const OpInfo& Get(const std::string& type) const {
auto it = map_.find(type);
PADDLE_ENFORCE(it != map_.end(), "Operator %s are not found", type);
return it->second;
}
template <typename Callback>
void IterAllInfo(Callback callback) {
for (auto& it : map_) {
callback(it.first, it.second);
}
}
private:
OpInfoMap() = default;
std::unordered_map<std::string, const OpInfo> map_;
};
} // namespace framework
} // namespace paddle
...@@ -19,32 +19,18 @@ limitations under the License. */ ...@@ -19,32 +19,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const std::string& type, std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const VarNameMap& inputs, const std::string& type, const VariableNameMap& inputs,
const VarNameMap& outputs, const VariableNameMap& outputs, AttributeMap attrs) {
AttributeMap attrs) { auto& info = OpInfoMap::Instance().Get(type);
auto it = op_info_map().find(type); info.Checker().Check(attrs);
PADDLE_ENFORCE(it != op_info_map().end(), auto op = info.Creator()(type, inputs, outputs, attrs);
"Operator '%s' has not been registered.", type);
it->second.checker_->Check(attrs);
auto op = it->second.creator_(type, inputs, outputs, attrs);
return std::unique_ptr<OperatorBase>(op); return std::unique_ptr<OperatorBase>(op);
} }
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) { static VariableNameMap ConvertOpDescVarsToVarNameMap(
VarNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
VarNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
AttributeMap attrs;
for (auto& attr : op_desc.attrs()) {
attrs[attr.name()] = GetAttrValue(attr);
}
return CreateOp(op_desc.type(), inputs, outputs, attrs);
}
OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) { const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
VarNameMap ret_val; VariableNameMap ret_val;
for (auto& var : op_desc_vars) { for (auto& var : op_desc_vars) {
auto& var_names = ret_val[var.parameter()]; auto& var_names = ret_val[var.parameter()];
auto& var_names_in_proto = var.arguments(); auto& var_names_in_proto = var.arguments();
...@@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap( ...@@ -55,6 +41,17 @@ OperatorBase::VarNameMap OpRegistry::ConvertOpDescVarsToVarNameMap(
return ret_val; return ret_val;
} }
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
AttributeMap attrs;
for (auto& attr : op_desc.attrs()) {
attrs[attr.name()] = GetAttrValue(attr);
}
return CreateOp(op_desc.type(), inputs, outputs, attrs);
}
std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) { std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops"); PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
return std::unique_ptr<OperatorBase>(BuildGradOp(&op)); return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
......
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/grad_op_builder.h" #include "paddle/framework/grad_op_builder.h"
#include "paddle/framework/op_info.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
...@@ -30,28 +31,16 @@ namespace paddle { ...@@ -30,28 +31,16 @@ namespace paddle {
namespace framework { namespace framework {
class OpRegistry { class OpRegistry {
using VarNameMap = OperatorBase::VarNameMap;
using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VarNameMap& /*inputs*/,
const VarNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
public: public:
struct OpInfo {
OpCreator creator_;
std::string grad_op_type_;
OpProto* proto_;
OpAttrChecker* checker_;
};
template <typename OpType, typename ProtoMakerType, typename GradOpType> template <typename OpType, typename ProtoMakerType, typename GradOpType>
static void RegisterOp(const std::string& op_type, static void RegisterOp(const std::string& op_type,
const std::string& grad_op_type) { const std::string& grad_op_type) {
PADDLE_ENFORCE(op_info_map().count(op_type) == 0, PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
"'%s' is registered more than once.", op_type); "'%s' is registered more than once.", op_type);
OpInfo op_info; OpInfo op_info;
op_info.creator_ = [](const std::string& type, const VarNameMap& inputs, op_info.creator_ = [](
const VarNameMap& outputs, const std::string& type, const VariableNameMap& inputs,
const AttributeMap& attrs) { const VariableNameMap& outputs, const AttributeMap& attrs) {
return new OpType(type, inputs, outputs, attrs); return new OpType(type, inputs, outputs, attrs);
}; };
op_info.grad_op_type_ = grad_op_type; op_info.grad_op_type_ = grad_op_type;
...@@ -70,7 +59,7 @@ class OpRegistry { ...@@ -70,7 +59,7 @@ class OpRegistry {
op_info.proto_ = nullptr; op_info.proto_ = nullptr;
op_info.checker_ = nullptr; op_info.checker_ = nullptr;
} }
op_info_map().insert(std::make_pair(op_type, op_info)); OpInfoMap::Instance().Insert(op_type, op_info);
// register gradient op // register gradient op
if (!grad_op_type.empty()) { if (!grad_op_type.empty()) {
RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, ""); RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
...@@ -78,21 +67,13 @@ class OpRegistry { ...@@ -78,21 +67,13 @@ class OpRegistry {
} }
static std::unique_ptr<OperatorBase> CreateOp(const std::string& type, static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
const VarNameMap& inputs, const VariableNameMap& inputs,
const VarNameMap& outputs, const VariableNameMap& outputs,
AttributeMap attrs); AttributeMap attrs);
static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc); static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
static VarNameMap ConvertOpDescVarsToVarNameMap(
const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars);
static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op); static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
static std::unordered_map<std::string, const OpInfo>& op_info_map() {
static std::unordered_map<std::string, const OpInfo> op_info_map_;
return op_info_map_;
}
}; };
class Registrar { class Registrar {
......
...@@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name, ...@@ -115,8 +115,8 @@ void OperatorBase::Rename(const std::string& old_name,
} }
OperatorBase::OperatorBase(const std::string& type, OperatorBase::OperatorBase(const std::string& type,
const OperatorBase::VarNameMap& inputs, const VariableNameMap& inputs,
const OperatorBase::VarNameMap& outputs, const VariableNameMap& outputs,
const AttributeMap& attrs) const AttributeMap& attrs)
: type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) { : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
static std::atomic<size_t> gUniqId(0UL); static std::atomic<size_t> gUniqId(0UL);
...@@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const { ...@@ -141,18 +141,10 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
} }
return ret_val; return ret_val;
} }
auto it = OpRegistry::op_info_map().find(type_); auto& info = OpInfoMap::Instance().Get(Type());
PADDLE_ENFORCE(
it != OpRegistry::op_info_map().end(),
"Operator %s not registered, cannot figure out intermediate outputs",
type_);
PADDLE_ENFORCE(
it->second.proto_ != nullptr,
"Operator %s has no OpProto, cannot figure out intermediate outputs",
type_);
// get all OpProto::Var for outputs // get all OpProto::Var for outputs
for (auto& o : it->second.proto_->outputs()) { for (auto& o : info.Proto().outputs()) {
// ignore all intermediate output // ignore all intermediate output
if (o.intermediate()) continue; if (o.intermediate()) continue;
auto out = outputs_.find(o.name()); auto out = outputs_.find(o.name());
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "op_info.h"
#include "paddle/framework/attribute.h" #include "paddle/framework/attribute.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
...@@ -62,10 +63,8 @@ class ExecutionContext; ...@@ -62,10 +63,8 @@ class ExecutionContext;
*/ */
class OperatorBase { class OperatorBase {
public: public:
using VarNameMap = std::map<std::string, std::vector<std::string>>; OperatorBase(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs);
OperatorBase(const std::string& type, const VarNameMap& inputs,
const VarNameMap& outputs, const AttributeMap& attrs);
virtual ~OperatorBase() {} virtual ~OperatorBase() {}
...@@ -93,8 +92,8 @@ class OperatorBase { ...@@ -93,8 +92,8 @@ class OperatorBase {
/// rename inputs outputs name /// rename inputs outputs name
void Rename(const std::string& old_name, const std::string& new_name); void Rename(const std::string& old_name, const std::string& new_name);
const VarNameMap& Inputs() const { return inputs_; } const VariableNameMap& Inputs() const { return inputs_; }
const VarNameMap& Outputs() const { return outputs_; } const VariableNameMap& Outputs() const { return outputs_; }
//! Get a input with argument's name described in `op_proto` //! Get a input with argument's name described in `op_proto`
const std::string& Input(const std::string& name) const; const std::string& Input(const std::string& name) const;
//! Get a input which has multiple variables. //! Get a input which has multiple variables.
...@@ -122,30 +121,32 @@ class OperatorBase { ...@@ -122,30 +121,32 @@ class OperatorBase {
// I (Inputs)opear // I (Inputs)opear
// O (Outputs) // O (Outputs)
// OG (Output Gradients) // OG (Output Gradients)
VarNameMap inputs_; VariableNameMap inputs_;
// NOTE: in case of OpGrad, outputs_ contains // NOTE: in case of OpGrad, outputs_ contains
// IG (Inputs Gradients) // IG (Inputs Gradients)
VarNameMap outputs_; VariableNameMap outputs_;
AttributeMap attrs_; AttributeMap attrs_;
}; };
// Macro for define a clone method. // Macro for define a clone method.
// If you are writing an kernel operator, `Clone` will be defined when you // If you are writing an kernel operator, `Clone` will be defined when you
// register it. i.e. `Clone` method is not needed to define by yourself. // register it. i.e. `Clone` method is not needed to define by yourself.
#define DEFINE_OP_CLONE_METHOD(CLS) \ #define DEFINE_OP_CLONE_METHOD(cls) \
std::unique_ptr<OperatorBase> Clone() const final { \ std::unique_ptr<OperatorBase> Clone() const final { \
return std::unique_ptr<OperatorBase>(new CLS(*this)); \ return std::unique_ptr<OperatorBase>(new cls(*this)); \
} }
// Macro for define a default constructor for Operator. // Macro for define a default constructor for Operator.
// You can also use // You can also use
// using PARENT_CLASS::PARENT_CLASS; // using PARENT_CLASS::PARENT_CLASS;
// to use parent's constructor. // to use parent's constructor.
#define DEFINE_OP_CONSTRUCTOR(CLS, PARENT_CLS) \ #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \
CLS(const std::string& type, const VarNameMap& inputs, \ cls(const std::string& type, \
const VarNameMap& outputs, const paddle::framework::AttributeMap& attrs) \ const ::paddle::framework::VariableNameMap& inputs, \
: PARENT_CLS(type, inputs, outputs, attrs) {} const ::paddle::framework::VariableNameMap& outputs, \
const paddle::framework::AttributeMap& attrs) \
: parent_cls(type, inputs, outputs, attrs) {}
class NOP : public OperatorBase { class NOP : public OperatorBase {
public: public:
...@@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase { ...@@ -389,8 +390,8 @@ class OperatorWithKernel : public OperatorBase {
using OpKernelMap = using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>; std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
OperatorWithKernel(const std::string& type, const VarNameMap& inputs, OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VarNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(const Scope& scope) const override { void InferShape(const Scope& scope) const override {
......
...@@ -23,8 +23,8 @@ static int op_run_num = 0; ...@@ -23,8 +23,8 @@ static int op_run_num = 0;
class OpWithoutKernelTest : public OperatorBase { class OpWithoutKernelTest : public OperatorBase {
public: public:
OpWithoutKernelTest(const std::string& type, const VarNameMap& inputs, OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
const VarNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), x(1) {} : OperatorBase(type, inputs, outputs, attrs), x(1) {}
void InferShape(const Scope& scope) const override {} void InferShape(const Scope& scope) const override {}
void Run(const Scope& scope, void Run(const Scope& scope,
...@@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) { ...@@ -249,8 +249,9 @@ TEST(OpKernel, multi_inputs) {
class OperatorClone : public paddle::framework::OperatorBase { class OperatorClone : public paddle::framework::OperatorBase {
public: public:
DEFINE_OP_CLONE_METHOD(OperatorClone); DEFINE_OP_CLONE_METHOD(OperatorClone);
OperatorClone(const std::string& type, const VarNameMap& inputs, OperatorClone(const std::string& type,
const VarNameMap& outputs, const paddle::framework::VariableNameMap& inputs,
const paddle::framework::VariableNameMap& outputs,
const paddle::framework::AttributeMap& attrs) const paddle::framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {} : OperatorBase(type, inputs, outputs, attrs) {}
void InferShape(const paddle::framework::Scope& scope) const override {} void InferShape(const paddle::framework::Scope& scope) const override {}
......
...@@ -105,7 +105,10 @@ class Tensor { ...@@ -105,7 +105,10 @@ class Tensor {
template <typename T> template <typename T>
inline Tensor Slice(const int& begin_idx, const int& end_idx) const; inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
platform::Place place() const { return holder_->place(); } platform::Place place() const {
PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder");
return holder_->place();
}
private: private:
template <typename T> template <typename T>
......
...@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp) ...@@ -4,6 +4,10 @@ file(GLOB cpp_files . *Op.cpp)
list(APPEND h_files Function.h) list(APPEND h_files Function.h)
list(APPEND cpp_files Function.cpp) list(APPEND cpp_files Function.cpp)
list(APPEND cpp_files BufferArg.cpp) list(APPEND cpp_files BufferArg.cpp)
list(APPEND cpp_files GemmFunctor.cpp)
if(USE_EIGEN_FOR_BLAS)
list(APPEND cpp_files EigenGemm.cpp)
endif(USE_EIGEN_FOR_BLAS)
if(WITH_GPU) if(WITH_GPU)
file(GLOB cu_files . *OpGpu.cu) file(GLOB cu_files . *OpGpu.cu)
...@@ -17,6 +21,8 @@ if(USE_NNPACK) ...@@ -17,6 +21,8 @@ if(USE_NNPACK)
endif() endif()
endif() endif()
list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
add_dependencies(paddle_function ${external_project_dependencies}) add_dependencies(paddle_function ${external_project_dependencies})
add_dependencies(paddle_function paddle_proto) add_dependencies(paddle_function paddle_proto)
...@@ -38,11 +44,11 @@ if(WITH_GPU) ...@@ -38,11 +44,11 @@ if(WITH_GPU)
add_simple_unittest(RowConvOpTest) add_simple_unittest(RowConvOpTest)
add_simple_unittest(BlockExpandOpTest) add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest) add_simple_unittest(CropOpTest)
add_simple_unittest(DepthwiseConvOpTest)
endif() endif()
add_simple_unittest(Im2ColTest) add_simple_unittest(Im2ColTest)
add_simple_unittest(GemmConvOpTest) add_simple_unittest(GemmConvOpTest)
add_simple_unittest(DepthwiseConvOpTest)
endif() endif()
add_style_check_target(paddle_function ${h_files}) add_style_check_target(paddle_function ${h_files})
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "DepthwiseConvOp.h" #include "DepthwiseConvOp.h"
#include "ConvOp.h" #include "ConvOp.h"
#include "GemmFunctor.h"
namespace paddle { namespace paddle {
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "DepthwiseConvOp.h" #include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/BaseMatrix.h" #include "paddle/math/BaseMatrix.h"
namespace paddle { namespace paddle {
......
...@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) { ...@@ -34,4 +34,13 @@ TEST(DepthwiseConv, BackwardFilter) {
} }
#endif #endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
TEST(DepthwiseConv, Forward) {
DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
"GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
}
#endif
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
template <class T>
struct EigenBlasGemm {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
Eigen::Aligned>
Matrix;
static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
Eigen::array<int, 2> sizeA;
if (transA) {
sizeA[0] = K;
sizeA[1] = M;
CHECK_EQ(M, lda);
} else {
sizeA[0] = M;
sizeA[1] = K;
CHECK_EQ(K, lda);
}
Eigen::array<int, 2> sizeB;
if (transB) {
sizeB[0] = N;
sizeB[1] = K;
CHECK_EQ(K, ldb);
} else {
sizeB[0] = K;
sizeB[1] = N;
CHECK_EQ(N, ldb);
}
Eigen::array<int, 2> sizeC;
sizeC[0] = M;
sizeC[1] = N;
CHECK_EQ(N, ldc);
const Matrix a(const_cast<T*>(A), sizeA);
const Matrix b(const_cast<T*>(B), sizeB);
Matrix c(C, sizeC);
typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims;
dims[0] = DimPair(1, 0);
dims[0].first = transA ? 0 : 1;
dims[0].second = transB ? 1 : 0;
Eigen::DefaultDevice device;
if (alpha == T(1) && beta == T(0)) {
c.device(device) = a.contract(b, dims);
} else if (alpha == T(1) && beta == T(1)) {
c.device(device) += a.contract(b, dims);
} else {
c.device(device) = alpha * a.contract(b, dims) + beta * c;
}
}
};
#ifdef PADDLE_TYPE_DOUBLE
template class EigenBlasGemm<double>;
#else
template class EigenBlasGemm<float>;
#endif
} // namespace paddle
...@@ -85,7 +85,6 @@ public: ...@@ -85,7 +85,6 @@ public:
} }
Im2ColFunctor<kCFO, Device, real> im2col; Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements(); size_t inputOffset = imShape.getElements();
size_t outputOffset = size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
...@@ -108,19 +107,19 @@ public: ...@@ -108,19 +107,19 @@ public:
int M = outputChannels / groups_; int M = outputChannels / groups_;
int N = outputHeight * outputWidth; int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth; int K = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans, BlasGemm<Device, real>::compute(false,
CblasNoTrans, false,
M, M,
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset, filterData + g * filterOffset,
K, K,
colData, colData,
N, N,
beta, beta,
outputData + g * outputOffset, outputData + g * outputOffset,
N); N);
} }
inputData += inputChannels * inputHeight * inputWidth; inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth; outputData += outputChannels * outputHeight * outputWidth;
...@@ -188,8 +187,6 @@ public: ...@@ -188,8 +187,6 @@ public:
} }
Col2ImFunctor<kCFO, Device, real> col2im; Col2ImFunctor<kCFO, Device, real> col2im;
GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements(); size_t inputOffset = imShape.getElements();
size_t outputOffset = size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
...@@ -205,19 +202,19 @@ public: ...@@ -205,19 +202,19 @@ public:
colData = inputGrad + g * inputOffset; colData = inputGrad + g * inputOffset;
scale = 1.0f; scale = 1.0f;
} }
gemm(CblasTrans, BlasGemm<Device, real>::compute(true,
CblasNoTrans, false,
M, M,
N, N,
K, K,
1.0f, 1.0f,
filterData + g * filterOffset, filterData + g * filterOffset,
M, M,
outputGrad + g * outputOffset, outputGrad + g * outputOffset,
N, N,
scale, scale,
colData, colData,
N); N);
if (needIm2col) { if (needIm2col) {
col2im(inputGrad + g * inputOffset, col2im(inputGrad + g * inputOffset,
imShape, imShape,
...@@ -299,7 +296,6 @@ public: ...@@ -299,7 +296,6 @@ public:
} }
Im2ColFunctor<kCFO, Device, real> im2col; Im2ColFunctor<kCFO, Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = imShape.getElements(); size_t inputOffset = imShape.getElements();
size_t outputOffset = size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth; (outputChannels / groups_) * outputHeight * outputWidth;
...@@ -321,19 +317,19 @@ public: ...@@ -321,19 +317,19 @@ public:
int M = outputChannels / groups_; int M = outputChannels / groups_;
int K = outputHeight * outputWidth; int K = outputHeight * outputWidth;
int N = inputChannels / groups_ * filterHeight * filterWidth; int N = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans, BlasGemm<Device, real>::compute(false,
CblasTrans, true,
M, M,
N, N,
K, K,
1.0f, 1.0f,
outputGrad + g * outputOffset, outputGrad + g * outputOffset,
K, K,
colData, colData,
K, K,
i == 0 ? beta : 1.0f, i == 0 ? beta : 1.0f,
filterGrad + g * filterOffset, filterGrad + g * filterOffset,
N); N);
} }
inputData += inputChannels * inputHeight * inputWidth; inputData += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth; outputGrad += outputChannels * outputHeight * outputWidth;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "GemmFunctor.h"
#include "paddle/math/MathFunctions.h"
namespace paddle {
template <class T>
struct BlasGemm<DEVICE_TYPE_CPU, T> {
static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
#ifdef PADDLE_USE_EIGEN_FOR_BLAS
EigenBlasGemm<T>::compute(
transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
#else
gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans,
M,
N,
K,
alpha,
A,
lda,
B,
ldb,
beta,
C,
ldc);
#endif
}
};
template <class T>
struct BlasGemm<DEVICE_TYPE_GPU, T> {
static void compute(const bool transA,
const bool transB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
hl_matrix_mul((T*)A,
transA == false ? HPPL_OP_N : HPPL_OP_T,
(T*)B,
transB == false ? HPPL_OP_N : HPPL_OP_T,
C,
M,
N,
K,
alpha,
beta,
lda,
ldb,
ldc);
}
};
template struct BlasGemm<DEVICE_TYPE_CPU, real>;
template struct BlasGemm<DEVICE_TYPE_GPU, real>;
} // namespace paddle
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/math/MathFunctions.h" #include "TensorType.h"
namespace paddle { namespace paddle {
...@@ -24,73 +24,42 @@ namespace paddle { ...@@ -24,73 +24,42 @@ namespace paddle {
// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul // of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
// interface. // interface.
template <DeviceType Device, class T> template <DeviceType Device, class T>
class GemmFunctor { struct BlasGemm {
public: static void compute(const bool transA,
void operator()(const CBLAS_TRANSPOSE transA, const bool transB,
const CBLAS_TRANSPOSE TransB, const int M,
const int M, const int N,
const int N, const int K,
const int K, const T alpha,
const T alpha, const T* A,
const T* A, const int lda,
const int lda, const T* B,
const T* B, const int ldb,
const int ldb, const T beta,
const T beta, T* C,
T* C, const int ldc);
const int ldc);
}; };
// TODO(hedaoyuan): Since the definition of the real type in the Paddle
// conflicts with the Eigen library, so compile the Eigen code can not
// include the Paddle header file. And need an EigenBlasGemm template class
// that does not contain the DeviceType parameter.
// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
template <class T> template <class T>
class GemmFunctor<DEVICE_TYPE_CPU, T> { struct EigenBlasGemm {
public: static void compute(const bool transA,
void operator()(const CBLAS_TRANSPOSE transA, const bool transB,
const CBLAS_TRANSPOSE TransB, const int M,
const int M, const int N,
const int N, const int K,
const int K, const T alpha,
const T alpha, const T* A,
const T* A, const int lda,
const int lda, const T* B,
const T* B, const int ldb,
const int ldb, const T beta,
const T beta, T* C,
T* C, const int ldc);
const int ldc) {
gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
};
template <class T>
class GemmFunctor<DEVICE_TYPE_GPU, T> {
public:
void operator()(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
hl_matrix_mul((T*)A,
transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
(T*)B,
TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
C,
M,
N,
K,
alpha,
beta,
lda,
ldb,
ldc);
}
}; };
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "TensorShape.h" #include "TensorShape.h"
#include "TensorType.h" #include "TensorType.h"
#include "neon/neon_util.h"
namespace paddle { namespace paddle {
...@@ -93,4 +94,95 @@ public: ...@@ -93,4 +94,95 @@ public:
int paddingWidth); int paddingWidth);
}; };
template <class T>
struct Padding {
static void run(const T* src,
T* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
memcpy(dest, src, inputWidth * sizeof(T));
dest += inputWidth;
src += inputWidth;
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = T(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(T));
dest += destWidth * paddingHeight;
}
}
}
};
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <>
struct Padding<float> {
static void run(const float* src,
float* dest,
int channels,
int inputHeight,
int inputWidth,
int paddingHeight,
int paddingWidth) {
const int destWidth = inputWidth + 2 * paddingWidth;
for (int c = 0; c < channels; c++) {
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
for (int i = 0; i < inputHeight; i++) {
// padding head
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
int step = inputWidth >> 2;
int remain = inputWidth & 3;
for (int s = 0; s < step; s++) {
float32x4_t s0 = vld1q_f32(src);
vst1q_f32(dest, s0);
src += 4;
dest += 4;
}
for (int r = 0; r < remain; r++) {
*dest++ = *src++;
}
// padding tail
for (int j = 0; j < paddingWidth; j++) {
*dest++ = float(0);
}
}
if (paddingHeight > 0) {
memset(dest, 0, destWidth * paddingHeight * sizeof(float));
dest += destWidth * paddingHeight;
}
}
}
};
#endif
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "neon_util.h"
#include "paddle/function/ConvOp.h"
#include "paddle/function/Im2Col.h"
namespace paddle {
namespace neon {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <int filterSize, int stride>
struct DepthwiseConvKernel {};
inline float32_t conv3x3(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
return vaddvq_f32(tmp);
}
inline float32_t conv4x4(float32x4_t r0,
float32x4_t r1,
float32x4_t r2,
float32x4_t r3,
float32x4_t k0,
float32x4_t k1,
float32x4_t k2,
float32x4_t k3) {
float32x4_t tmp;
tmp = vmulq_f32(r0, k0);
tmp = vmlaq_f32(tmp, r1, k1);
tmp = vmlaq_f32(tmp, r2, k2);
tmp = vmlaq_f32(tmp, r3, k3);
return vaddvq_f32(tmp);
}
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 1, 2, 3...] * K[0][0]
* R0[1, 2, 3, 4...] * K[0][1]
* R0[2, 3, 4, 5...] * K[0][2]
* R1[0, 1, 2, 3...] * K[1][0]
* R1[1, 2, 3, 4...] * K[1][1]
* R1[2, 3, 4, 5...] * K[1][2]
* R2[0, 1, 2, 3...] * K[2][0]
* R2[1, 2, 3, 4...] * K[2][1]
* + R2[2, 3, 4, 5...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0++;
r1++;
r2++;
outputData++;
}
r0 += 2;
r1 += 2;
r2 += 2;
}
}
}
};
/**
* Each step calculates four elements of the output.
* First step:
* R0[0, 2, 4, 6...] * K[0][0]
* R0[1, 3, 5, 7...] * K[0][1]
* R0[2, 4, 6, 8...] * K[0][2]
* R1[0, 2, 4, 6...] * K[1][0]
* R1[1, 3, 5, 7...] * K[1][1]
* R1[2, 4, 6, 8...] * K[1][2]
* R2[0, 2, 4, 6...] * K[2][0]
* R2[1, 3, 5, 7...] * K[2][1]
* R2[2, 4, 6, 8...] * K[2][2]
* ------------------------------
* Output[0, 1, 2, 3]
*/
template <>
struct DepthwiseConvKernel<3, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 9) {
// Load the filters
float32x4_t k[3];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 3);
k[2] = vld1q_f32(filterData + 6);
k[0] = vsetq_lane_f32(0.f, k[0], 3);
k[1] = vsetq_lane_f32(0.f, k[1], 3);
k[2] = vsetq_lane_f32(0.f, k[2], 3);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[3][3];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t data1;
float32x4x2_t data2;
data2 = vld2q_f32(r0);
input[0][0] = data2.val[0];
input[0][1] = data2.val[1];
data1 = vld1q_f32(r0 + 8);
input[0][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r1);
input[1][0] = data2.val[0];
input[1][1] = data2.val[1];
data1 = vld1q_f32(r1 + 8);
input[1][2] = vextq_f32(data2.val[0], data1, 1);
data2 = vld2q_f32(r2);
input[2][0] = data2.val[0];
input[2][1] = data2.val[1];
data1 = vld1q_f32(r2 + 8);
input[2][2] = vextq_f32(data2.val[0], data1, 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
*outputData = conv3x3(i0, i1, i2, k[0], k[1], k[2]);
r0 += 2;
r1 += 2;
r2 += 2;
outputData++;
}
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 1> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* r0 =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
const float* r1 = r0 + inputWidth;
const float* r2 = r0 + inputWidth * 2;
const float* r3 = r0 + inputWidth * 3;
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4_t tmp;
input[0][0] = vld1q_f32(r0);
tmp = vld1q_f32(r0 + 4);
input[0][1] = vextq_f32(input[0][0], tmp, 1);
input[0][2] = vextq_f32(input[0][0], tmp, 2);
input[0][3] = vextq_f32(input[0][0], tmp, 3);
input[1][0] = vld1q_f32(r1);
tmp = vld1q_f32(r1 + 4);
input[1][1] = vextq_f32(input[1][0], tmp, 1);
input[1][2] = vextq_f32(input[1][0], tmp, 2);
input[1][3] = vextq_f32(input[1][0], tmp, 3);
input[2][0] = vld1q_f32(r2);
tmp = vld1q_f32(r2 + 4);
input[2][1] = vextq_f32(input[2][0], tmp, 1);
input[2][2] = vextq_f32(input[2][0], tmp, 2);
input[2][3] = vextq_f32(input[2][0], tmp, 3);
input[3][0] = vld1q_f32(r3);
tmp = vld1q_f32(r3 + 4);
input[3][1] = vextq_f32(input[3][0], tmp, 1);
input[3][2] = vextq_f32(input[3][0], tmp, 2);
input[3][3] = vextq_f32(input[3][0], tmp, 3);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 4;
r1 += 4;
r2 += 4;
r3 += 4;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0++;
r1++;
r2++;
r3++;
outputData++;
}
r0 += 3;
r1 += 3;
r2 += 3;
r3 += 3;
}
}
}
};
/**
* Each step calculates four elements of the output.
*/
template <>
struct DepthwiseConvKernel<4, 2> {
static void run(const float* inputData,
const float* filterData,
int inputHeight,
int inputWidth,
int outputChannels,
int outputHeight,
int outputWidth,
int filterMultiplier,
float* outputData) {
const int steps = outputWidth >> 2;
const int remain = outputWidth & 3;
for (int c = 0; c < outputChannels; c++, filterData += 16) {
// Load the filters
float32x4_t k[4];
k[0] = vld1q_f32(filterData);
k[1] = vld1q_f32(filterData + 4);
k[2] = vld1q_f32(filterData + 8);
k[3] = vld1q_f32(filterData + 12);
const float* start =
inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
float32x4_t input[4][4];
for (int h = 0; h < outputHeight; h++) {
const float* r0 = start + 2 * h * inputWidth;
const float* r1 = start + (2 * h + 1) * inputWidth;
const float* r2 = start + (2 * h + 2) * inputWidth;
const float* r3 = start + (2 * h + 3) * inputWidth;
for (int s = 0; s < steps; s++) {
// Load the inputs
float32x4x2_t data1;
float32x4x2_t data2;
data1 = vld2q_f32(r0);
data2 = vld2q_f32(r0 + 8);
input[0][0] = data1.val[0];
input[0][1] = data1.val[1];
input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r1);
data2 = vld2q_f32(r1 + 8);
input[1][0] = data1.val[0];
input[1][1] = data1.val[1];
input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r2);
data2 = vld2q_f32(r2 + 8);
input[2][0] = data1.val[0];
input[2][1] = data1.val[1];
input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
data1 = vld2q_f32(r3);
data2 = vld2q_f32(r3 + 8);
input[3][0] = data1.val[0];
input[3][1] = data1.val[1];
input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
float32x4_t tmp1 = vdupq_n_f32(0.f);
float32x4_t tmp2 = vdupq_n_f32(0.f);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
tmp1 = vaddq_f32(tmp1, tmp2);
vst1q_f32(outputData, tmp1);
r0 += 8;
r1 += 8;
r2 += 8;
r3 += 8;
outputData += 4;
}
for (int r = 0; r < remain; r++) {
float32x4_t i0 = vld1q_f32(r0);
float32x4_t i1 = vld1q_f32(r1);
float32x4_t i2 = vld1q_f32(r2);
float32x4_t i3 = vld1q_f32(r3);
*outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
r0 += 2;
r1 += 2;
r2 += 2;
r3 += 2;
outputData++;
}
}
}
}
};
template <DeviceType Device>
class NeonDepthwiseConvFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
size_t filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
// only support strideH() == strideW() and filterHeight == filterWidth.
CHECK_EQ(strideH(), strideW());
CHECK_EQ(filterHeight, filterWidth);
float* inputData = inputs[0].data<float>();
float* filterData = inputs[1].data<float>();
float* outputData = outputs[0].data<float>();
// padding the input
float* inputPadding = inputData;
if (paddingH() > 0 || paddingW() > 0) {
int newSize = batchSize * inputChannels * (inputHeight + 2 * paddingH()) *
(inputWidth + 2 * paddingW());
resizeBuffer<Device>(newSize);
inputPadding = reinterpret_cast<float*>(memory_->getBuf());
Padding<float>::run(inputData,
inputPadding,
batchSize * inputChannels,
inputHeight,
inputWidth,
paddingH(),
paddingW());
// height and width of padding data
inputHeight += 2 * paddingH();
inputWidth += 2 * paddingW();
}
std::function<void(
const float*, const float*, int, int, int, int, int, int, float*)>
DepthWiseConv;
if (filterWidth == 3 && strideW() == 1) {
DepthWiseConv = DepthwiseConvKernel<3, 1>::run;
} else if (filterWidth == 3 && strideW() == 2) {
DepthWiseConv = DepthwiseConvKernel<3, 2>::run;
} else if (filterWidth == 4 && strideW() == 1) {
DepthWiseConv = DepthwiseConvKernel<4, 1>::run;
} else if (filterWidth == 4 && strideW() == 2) {
DepthWiseConv = DepthwiseConvKernel<4, 2>::run;
} else {
LOG(FATAL) << "Not supported";
}
for (size_t i = 0; i < batchSize; i++) {
DepthWiseConv(inputPadding,
filterData,
inputHeight,
inputWidth,
outputChannels,
outputHeight,
outputWidth,
filterMultiplier,
outputData);
inputPadding += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
}
};
REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
#endif
} // namespace neon
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
namespace paddle {
namespace neon {
inline float32x4_t vld1q_f32_aligned(const float* p) {
return vld1q_f32(
(const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
}
#ifndef __aarch64__
inline float32_t vaddvq_f32(float32x4_t a) {
float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
return vget_lane_f32(vpadd_f32(v, v), 0);
}
inline float32x4_t vmlaq_laneq_f32(float32x4_t a,
float32x4_t b,
float32x4_t v,
const int lane) {
return vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane));
}
#endif
} // namespace neon
} // namespace paddle
#endif
...@@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) { ...@@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>( auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
para->getMat(PARAMETER_VALUE).get()); para->getMat(PARAMETER_VALUE).get());
para->clearGradient(); para->clearGradient();
mat->clearIndices(); if (mat) mat->clearIndices();
} }
} }
} }
......
...@@ -184,7 +184,7 @@ public: ...@@ -184,7 +184,7 @@ public:
} }
void backward(const UpdateCallback& callback) override { void backward(const UpdateCallback& callback) override {
if (biases_) { if (biases_ && biases_->getWGrad()) {
backwardActivation(); backwardActivation();
biases_->getWGrad()->collectBias(*getOutputGrad(), 1); biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
biases_->getParameterPtr()->incUpdate(callback); biases_->getParameterPtr()->incUpdate(callback);
...@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() { ...@@ -1012,11 +1012,6 @@ void RecurrentGradientMachine::generateSequence() {
/* width */ resultNum, /* width */ resultNum,
false, false,
/* useGpu */ false); /* useGpu */ false);
Matrix::resizeOrCreate(generator_.outArg.value,
/* height */ maxGenWordCount,
/* width */ 1,
false,
/* useGpu */ false);
} }
ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
numSequences + 1, numSequences + 1,
...@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() { ...@@ -1026,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
} else { } else {
oneWaySearch(numSequences); oneWaySearch(numSequences);
} }
if (dataArgsSize_) createDataOutlink(batchMachineIdVec_); if (dataArgsSize_) createDataOutlink();
size_t size = generator_.ids.size(); size_t size = generator_.ids.size();
generator_.outArg.ids->resize(size); generator_.outArg.ids->resize(size);
...@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { ...@@ -1106,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
} }
batchMachineIdVec_.clear(); batchMachineIdVec_.clear();
batchMachineStartPos_.clear();
int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
starts[0] = 0; starts[0] = 0;
generator_.ids.clear(); generator_.ids.clear();
...@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() { ...@@ -1312,13 +1308,20 @@ void RecurrentGradientMachine::fillGenOutputs() {
finalPaths_[i].resize(minFinalPathsSize); finalPaths_[i].resize(minFinalPathsSize);
} }
batchMachineIdVec_.clear();
generator_.ids.clear(); generator_.ids.clear();
int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
starts[0] = 0; starts[0] = 0;
if (numResults > 1) { if (numResults > 1) {
real* probs = generator_.outArg.in->getData(); int idsProbSaveSize = 0;
for (auto inSeq : finalPaths_) {
for (auto path : inSeq) idsProbSaveSize += path.ids.size();
idsProbSaveSize += inSeq.size();
}
Matrix::resizeOrCreate(
generator_.outArg.value, idsProbSaveSize, 1, false, false);
real* idsProb = generator_.outArg.value->getData(); real* idsProb = generator_.outArg.value->getData();
real* probs = generator_.outArg.in->getData();
size_t curPos = 0; size_t curPos = 0;
for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t i = 0; i < finalPaths_.size(); ++i) {
for (size_t j = 0; j < finalPaths_[i].size(); ++j) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
...@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() { ...@@ -1333,24 +1336,16 @@ void RecurrentGradientMachine::fillGenOutputs() {
curPos += genLen; curPos += genLen;
idsProb[curPos++] = -1.0; idsProb[curPos++] = -1.0;
probs[i * numResults + j] = path.logProb; probs[i * numResults + j] = path.logProb;
if (!j && dataArgsSize_) {
// in beam search, here only reserved the top 1 generated result
// for out_links that are not the generated word indices.
batchMachineIdVec_.insert(batchMachineIdVec_.end(),
path.machineIdVec.begin(),
path.machineIdVec.end());
}
} }
starts[i + 1] = generator_.ids.size(); starts[i + 1] = generator_.ids.size();
} }
} else { } else {
for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t i = 0; i < finalPaths_.size(); ++i) {
CHECK(!finalPaths_[i].empty()); CHECK(!finalPaths_[i].empty());
generator_.ids.insert(generator_.ids.begin(), Path& path = finalPaths_[i][0];
finalPaths_[i][0].ids.begin(), generator_.ids.insert(
finalPaths_[i][0].ids.end()); generator_.ids.end(), path.ids.begin(), path.ids.end());
starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size(); starts[i + 1] = starts[i] + path.ids.size();
} }
} }
} }
...@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) { ...@@ -1364,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
} }
} }
void RecurrentGradientMachine::createDataOutlink( void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
std::vector<int>& machineIdVec) { bool isSeq, std::vector<Argument>& outArgs) {
size_t seqNum = batchMachineIdVec_.clear();
getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
std::vector<int> starts(seqNum + 1, 0); size_t seqIdx = 0;
for (size_t i = 0; i < seqNum; ++i) { for (size_t i = 0; i < finalPaths_.size(); ++i) {
size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size() for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
: finalPaths_[0][i].ids.size(); std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
starts[i + 1] = starts[i] + seqLen; if (isSeq) {
for (size_t i = 0; i < machineIdVec.size(); ++i) {
size_t rowId = machineIdVec[i];
int* seqPos =
outArgs[i].sequenceStartPositions->getMutableData(false);
batchMachineIdVec_.push_back(seqPos[rowId]);
}
} else {
batchMachineIdVec_.insert(
batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
}
seqIdx++;
}
}
}
void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
size_t totalSeqNum = std::accumulate(
finalPaths_.begin(),
finalPaths_.end(),
0UL,
[](size_t a, const std::vector<Path>& b) { return a + b.size(); });
copySize.resize(totalSeqNum, 1);
batchMachineStartPos_.resize(totalSeqNum + 1, 0);
if (isSeq) {
ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
int* starts = inputSeqStartPos->getMutableData(false);
int seqId = 0;
for (size_t i = 0; i < finalPaths_.size(); ++i) {
for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
: starts[j + 1] - starts[j];
batchMachineStartPos_[seqId + 1] =
batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
seqId++;
}
}
} else {
for (size_t i = 0; i < finalPaths_[0].size(); ++i)
batchMachineStartPos_[i + 1] =
batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
} }
}
void RecurrentGradientMachine::createDataOutlink() {
for (size_t i = 0; i < dataArgsSize_; i++) { for (size_t i = 0; i < dataArgsSize_; i++) {
bool isSeq = dataArgsFrame_[i][0].hasSeq();
std::vector<int> copySize;
createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
dataArgs_[i].concat(dataArgsFrame_[i], dataArgs_[i].concat(dataArgsFrame_[i],
machineIdVec, batchMachineIdVec_,
starts, batchMachineStartPos_,
copySize,
useGpu_, useGpu_,
HPPL_STREAM_1, HPPL_STREAM_1,
PASS_TEST); PASS_TEST);
auto dataAgent = auto dataAgent =
dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get()); dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
CHECK_NOTNULL(dataAgent); CHECK_NOTNULL(dataAgent);
......
...@@ -190,7 +190,7 @@ public: ...@@ -190,7 +190,7 @@ public:
std::vector<int> ids; std::vector<int> ids;
/** /**
* @brief idsProb, log probability of each generated words. * @brief idsProb, log probability of each generated word.
*/ */
std::vector<real> idsProb; std::vector<real> idsProb;
...@@ -472,15 +472,43 @@ private: ...@@ -472,15 +472,43 @@ private:
void copyDataOutlinkFrame(size_t machineCur); void copyDataOutlinkFrame(size_t machineCur);
/* /*
* @brief In generation, if the layer group has more than 1 outlink, outlinks * @brief In generation, if the layer group has more than 1 outlink, outlink
* except the first one are data outlinks. This function creates the data * except the first one is a data outlink. In RecurrentLayerGroup, each time
* outlinks. * step is a separate Network, outputs of a layer inside the
* @note In beam search, only one generated sequence with the hightest log * RecurrentLayerGroup are stored in separate Arguments. If one layer is
* probabilites are retained. * specified as an outlink of RecurrentLayerGroup. This function will
* @param machineIdVec : select a row of output matrix in each frame * collect outputs in each time step of each generated sequence which are
* that the generation process expanded. * dispersed in separate Arguments to form a new single Argument as output of
* RecurrentLayerGroup.
*/ */
void createDataOutlink(std::vector<int>& machineIdVec); void createDataOutlink();
/*
* @brief decide to select how many rows from the Matrix stored the forward
* pass results from a start position.
*
* @param isSeq: a flag indicating whetehr the layer to be output of the
* RecurrentGradientMachine is a sequence or not
* @param outArgs: all of the the returned Arguments of the forward pass
* during the generation process.
* @param copySize: the returned result, number of rows to select from the
* Matrix stored the forward pass results from a start position.
*/
void createDataOutlinkCopySizeInfo(bool isSeq,
std::vector<Argument>& outArgs,
std::vector<int>& copySize);
/*
* @brief decide index of the start row for each time step of a generated
* sequence in Matrix stored the entire beam search batch's forward pass
* results.
*
* @param isSeq: a flag indicating whether the layer to be output of the
* RecurrentGradientMachine is a sequence or not
* @param outArgs: all of the returned Arguments of the forward pass
* during the generation process.
*/
void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
/* /*
* @brief used in beam search, connect previous frame to form recurrent link * @brief used in beam search, connect previous frame to form recurrent link
...@@ -543,6 +571,7 @@ private: ...@@ -543,6 +571,7 @@ private:
std::vector<int> topIds_; std::vector<int> topIds_;
std::vector<int> seqIds_; std::vector<int> seqIds_;
std::vector<int> batchMachineIdVec_; std::vector<int> batchMachineIdVec_;
std::vector<int> batchMachineStartPos_;
std::vector<std::vector<Path>> finalPaths_; std::vector<std::vector<Path>> finalPaths_;
std::vector<real> minFinalPathLogProb_; std::vector<real> minFinalPathLogProb_;
BeamSearchControlCallbacks* beamSearchCtrlCallbacks_; BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
......
...@@ -32,9 +32,11 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, ...@@ -32,9 +32,11 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
const ConvConfig& conf = inputConfig.conv_conf(); const ConvConfig& conf = inputConfig.conv_conf();
padding_.push_back(conf.padding()); padding_.push_back(conf.padding());
stride_.push_back(conf.stride()); stride_.push_back(conf.stride());
dilation_.push_back(conf.dilation());
filterSize_.push_back(conf.filter_size()); filterSize_.push_back(conf.filter_size());
paddingY_.push_back(conf.padding_y()); paddingY_.push_back(conf.padding_y());
strideY_.push_back(conf.stride_y()); strideY_.push_back(conf.stride_y());
dilationY_.push_back(conf.dilation_y());
filterSizeY_.push_back(conf.filter_size_y()); filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels()); channels_.push_back(conf.channels());
...@@ -89,7 +91,11 @@ size_t ConvBaseLayer::calOutputSize() { ...@@ -89,7 +91,11 @@ size_t ConvBaseLayer::calOutputSize() {
size_t layerSize = 0; size_t layerSize = 0;
auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
size_t filterSizeY;
size_t filterSize;
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
const ConvConfig& conf = config_.inputs(i).conv_conf(); const ConvConfig& conf = config_.inputs(i).conv_conf();
...@@ -98,17 +104,17 @@ size_t ConvBaseLayer::calOutputSize() { ...@@ -98,17 +104,17 @@ size_t ConvBaseLayer::calOutputSize() {
inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x(); inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
if (inW[i] == 0) inW[i] = conf.output_x(); if (inW[i] == 0) inW[i] = conf.output_x();
outH.push_back(imageSize( outH.push_back(imageSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(imageSize( outW.push_back(
inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
} else { } else {
if (inH[i] == 0) if (inH[i] == 0)
inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
if (inW[i] == 0) inW[i] = conf.img_size(); if (inW[i] == 0) inW[i] = conf.img_size();
outH.push_back(outputSize( outH.push_back(outputSize(
inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
outW.push_back(outputSize( outW.push_back(outputSize(
inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
} }
CHECK_EQ(outH[i], outH[0]); CHECK_EQ(outH[i], outH[0]);
CHECK_EQ(outW[i], outW[0]); CHECK_EQ(outW[i], outW[0]);
......
...@@ -40,6 +40,10 @@ protected: ...@@ -40,6 +40,10 @@ protected:
IntV stride_; IntV stride_;
/// The y dimension of the stride. /// The y dimension of the stride.
IntV strideY_; IntV strideY_;
/// The x dimension of the dilation.
IntV dilation_;
/// The y dimension of the dilation.
IntV dilationY_;
/// The x dimension of a filter kernel. /// The x dimension of a filter kernel.
IntV filterSize_; IntV filterSize_;
/// The y dimension of a filter kernel. /// The y dimension of a filter kernel.
......
...@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() { ...@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() {
&bwdDataAlgo_, &bwdDataAlgo_,
&bwdDataLimitBytes_, &bwdDataLimitBytes_,
&bwdFilterAlgo_, &bwdFilterAlgo_,
&bwdFilterLimitBytes_); &bwdFilterLimitBytes_,
/*useDilation*/ false);
size_t maxWorkSpace = 0; size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
......
...@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() { ...@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() {
strideH_ = conf.stride_y(); strideH_ = conf.stride_y();
strideW_ = conf.stride(); strideW_ = conf.stride();
dilationH_ = conf.dilation_y();
dilationW_ = conf.dilation();
CHECK_GT(dilationH_, 0);
CHECK_GT(dilationW_, 0);
filterH_ = conf.filter_size_y(); filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size(); filterW_ = conf.filter_size();
...@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() { ...@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() {
paddingH_, paddingH_,
paddingW_, paddingW_,
strideH_, strideH_,
strideW_); strideW_,
dilationH_,
dilationW_);
// initialize all to default algorithms // initialize all to default algorithms
fwdAlgo_ = 0; fwdAlgo_ = 0;
...@@ -131,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) { ...@@ -131,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
paddingH_, paddingH_,
paddingW_, paddingW_,
strideH_, strideH_,
strideW_); strideW_,
dilationH_,
dilationW_);
} }
void ConvBaseProjection::reshape(int batchSize) { void ConvBaseProjection::reshape(int batchSize) {
...@@ -140,6 +149,10 @@ void ConvBaseProjection::reshape(int batchSize) { ...@@ -140,6 +149,10 @@ void ConvBaseProjection::reshape(int batchSize) {
CHECK_EQ(calInputSize(), in_->value->getWidth()); CHECK_EQ(calInputSize(), in_->value->getWidth());
reshapeTensorDesc(batchSize); reshapeTensorDesc(batchSize);
bool useDilation = false;
if (dilationH_ > 1 || dilationW_ > 1) {
useDilation = true;
}
hl_conv_workspace(imageDesc_, hl_conv_workspace(imageDesc_,
outputDesc_, outputDesc_,
filterDesc_, filterDesc_,
...@@ -149,7 +162,8 @@ void ConvBaseProjection::reshape(int batchSize) { ...@@ -149,7 +162,8 @@ void ConvBaseProjection::reshape(int batchSize) {
&bwdDataAlgo_, &bwdDataAlgo_,
&bwdDataLimitBytes_, &bwdDataLimitBytes_,
&bwdFilterAlgo_, &bwdFilterAlgo_,
&bwdFilterLimitBytes_); &bwdFilterLimitBytes_,
useDilation);
size_t maxWorkSpace = 0; size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
......
...@@ -63,6 +63,7 @@ protected: ...@@ -63,6 +63,7 @@ protected:
int configChannels_, configNumFilters_; int configChannels_, configNumFilters_;
int paddingH_, paddingW_; int paddingH_, paddingW_;
int strideH_, strideW_; int strideH_, strideW_;
int dilationH_, dilationW_;
int filterH_, filterW_; int filterH_, filterW_;
/// One group offset of input data. /// One group offset of input data.
int inputOffset_; int inputOffset_;
......
...@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() { ...@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() {
if (imageH_ == 0) imageH_ = configImgH_; if (imageH_ == 0) imageH_ = configImgH_;
if (imageW_ == 0) imageW_ = configImgW_; if (imageW_ == 0) imageW_ = configImgW_;
outputH_ = outputSize(imageH_, outputH_ = outputSize(imageH_,
filterH_, (filterH_ - 1) * dilationH_ + 1,
paddingH_, paddingH_,
strideH_, strideH_,
/* caffeMode */ true); /* caffeMode */ true);
outputW_ = outputSize(imageW_, outputW_ = outputSize(imageW_,
filterW_, (filterW_ - 1) * dilationW_ + 1,
paddingW_, paddingW_,
strideW_, strideW_,
/* caffeMode */ true); /* caffeMode */ true);
......
...@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, ...@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
} }
} }
// bool HuberCost::init(const LayerMap& layerMap,
// Huber loss for robust 2-classes classification const ParameterMap& parameterMap) {
//
REGISTER_LAYER(huber, HuberTwoClass);
bool HuberTwoClass::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CostLayer::init(layerMap, parameterMap); CostLayer::init(layerMap, parameterMap);
if (useGpu_) { if (useGpu_) {
tmpCpuInput_.reserve(inputLayers_.size()); tmpCpuInput_.reserve(inputLayers_.size());
...@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap, ...@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
return true; return true;
} }
void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) { void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
if (useGpu_) { if (useGpu_) {
for (size_t i = 0; i < inputLayers_.size(); i++) { for (size_t i = 0; i < inputLayers_.size(); i++) {
tmpCpuInput_[i].resizeAndCopyFrom( tmpCpuInput_[i].resizeAndCopyFrom(
...@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) { ...@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
} }
hl_stream_synchronize(HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT);
} }
forwardImpIn(output, label, cost);
} }
void HuberTwoClass::forwardImpIn(Matrix& output, //
Argument& label, // Huber loss for robust regression.
Matrix& target) { //
REGISTER_LAYER(huber_regression, HuberRegressionLoss);
bool HuberRegressionLoss::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
HuberCost::init(layerMap, parameterMap);
delta_ = config_.delta();
return true;
}
void HuberRegressionLoss::forwardImp(Matrix& output,
Argument& label,
Matrix& target) {
HuberCost::forwardImp(output, label, target);
size_t numSamples = target.getHeight();
size_t dim = output.getWidth();
CHECK(label.value);
CHECK_EQ((*label.value).getHeight(), numSamples);
CHECK_EQ(output.getHeight(), numSamples);
CHECK_EQ(dim, (*label.value).getWidth());
CHECK_EQ(target.getWidth(), (size_t)1);
real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
real* lbl =
useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
std::vector<real> cost(numSamples, 0);
for (size_t i = 0; i < numSamples; ++i) {
for (size_t j = 0; j < dim; ++j) {
int index = i * dim + j;
real a = std::abs(lbl[index] - out[index]);
if (a <= delta_)
cost[i] += a * a / 2;
else
cost[i] += delta_ * (a - delta_ / 2);
}
}
target.copyFrom(cost.data(), numSamples);
}
void HuberRegressionLoss::backwardImp(Matrix& output,
Argument& label,
Matrix& outputG) {
size_t numSamples = output.getHeight();
size_t dim = output.getWidth();
real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
real* lbl =
useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
for (size_t i = 0; i < numSamples; ++i) {
for (size_t j = 0; j < dim; ++j) {
int index = i * dim + j;
real a = lbl[index] - out[index];
if (std::abs(a) <= delta_)
grad[index] += -a;
else
grad[index] += a > 0 ? -delta_ : delta_;
}
}
if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
}
//
// Huber loss for robust 2-classes classification
//
REGISTER_LAYER(huber_classification, HuberTwoClassification);
bool HuberTwoClassification::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
return HuberCost::init(layerMap, parameterMap);
}
void HuberTwoClassification::forwardImp(Matrix& output,
Argument& label,
Matrix& target) {
HuberCost::forwardImp(output, label, target);
size_t numSamples = target.getHeight(); size_t numSamples = target.getHeight();
CHECK(label.ids);
CHECK_EQ((*label.ids).getSize(), numSamples); CHECK_EQ((*label.ids).getSize(), numSamples);
CHECK_EQ(output.getHeight(), numSamples); CHECK_EQ(output.getHeight(), numSamples);
CHECK_EQ(output.getWidth(), (size_t)1); CHECK_EQ(output.getWidth(), (size_t)1);
...@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output, ...@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,
real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData(); int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
std::vector<real> cost(numSamples); std::vector<real> cost(numSamples, 0);
for (size_t i = 0; i < numSamples; ++i) { for (size_t i = 0; i < numSamples; ++i) {
int y = 2 * lbl[i] - 1; int y = 2 * lbl[i] - 1;
if (out[i] * y < -1) real a = out[i] * y;
cost[i] = -4 * out[i] * y; if (a < -1)
else if (out[i] * y < 1) cost[i] = -4 * a;
cost[i] = (1 - out[i] * y) * (1 - out[i] * y); else if (a < 1)
else cost[i] = (1 - a) * (1 - a);
cost[i] = 0;
} }
target.copyFrom(cost.data(), numSamples); target.copyFrom(cost.data(), numSamples);
} }
void HuberTwoClass::backwardImp(Matrix& outputValue, void HuberTwoClassification::backwardImp(Matrix& output,
Argument& label, Argument& label,
Matrix& outputGrad) { Matrix& outputG) {
if (useGpu_) {
backwardImpIn(
*tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
outputGrad.copyFrom(*tmpCpuInput_[0].grad);
} else {
backwardImpIn(outputValue, label, outputGrad);
}
}
void HuberTwoClass::backwardImpIn(Matrix& output,
Argument& label,
Matrix& outputG) {
size_t numSamples = output.getHeight(); size_t numSamples = output.getHeight();
real* out = output.getData(); real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
real* grad = outputG.getData(); int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
int* lbl = (*label.ids).getData(); real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
for (size_t i = 0; i < numSamples; ++i) { for (size_t i = 0; i < numSamples; ++i) {
int y = 2 * lbl[i] - 1; int y = 2 * lbl[i] - 1;
if (y * out[i] < -1) real a = out[i] * y;
if (a < -1)
grad[i] += -4 * y; grad[i] += -4 * y;
else if (y * out[i] < 1) else if (a < 1)
grad[i] += -2 * (1 - y * out[i]) * y; grad[i] += -2 * (1 - a) * y;
} }
if (useGpu_) outputG.copyFrom(grad, numSamples);
} }
/** /**
* This cost layer compute the sum of its input as loss. * This cost layer compute the sum of its input as loss.
* \f[ * \f[
......
...@@ -304,37 +304,68 @@ public: ...@@ -304,37 +304,68 @@ public:
Matrix& outputGrad) override; Matrix& outputGrad) override;
}; };
/** /*
* Huber loss for robust 2-classes classification. * A base layer for HuberRegressionLoss and HuberTwoClassification.
*
* For label={0, 1}, let y=2*label-1. Given output f, the loss is:
* \f[
* Loss =
* \left\{\begin{matrix}
* 4 * y * f & \textit{if} \ \ y* f < -1 \\
* (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
* 0 & \textit{otherwise}
* \end{matrix}\right.
* \f]
*/ */
class HuberTwoClass : public CostLayer { class HuberCost : public CostLayer {
public:
std::vector<Argument> tmpCpuInput_; std::vector<Argument> tmpCpuInput_;
public: explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
bool init(const LayerMap& layerMap, bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override; const ParameterMap& parameterMap) override;
void forwardImp(Matrix& output, Argument& label, Matrix& cost) override; void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
void forwardImpIn(Matrix& output, Argument& label, Matrix& cost); void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
};
/**
* Huber loss for robust regression.
*
* Given output f(x), label y and delta, the loss is:
* Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
* Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
*/
class HuberRegressionLoss : public HuberCost {
public:
explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
void backwardImp(Matrix& outputValue, void backwardImp(Matrix& outputValue,
Argument& label, Argument& label,
Matrix& outputGrad) override; Matrix& outputGrad) override;
void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad); protected:
real delta_;
};
/**
* Huber loss for robust 2-classes classification.
*
* For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
* Loss = 4 * y * f, if y* f < -1 \\
* Loss = (1 - y * f)^2, if -1 < y * f < 1 \\
* Loss = 0, otherwise
*/
class HuberTwoClassification : public HuberCost {
public:
explicit HuberTwoClassification(const LayerConfig& config)
: HuberCost(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
void backwardImp(Matrix& outputValue,
Argument& label,
Matrix& outputGrad) override;
}; };
typedef std::shared_ptr<CostLayer> CostLayerPtr; typedef std::shared_ptr<CostLayer> CostLayerPtr;
......
...@@ -29,6 +29,10 @@ namespace paddle { ...@@ -29,6 +29,10 @@ namespace paddle {
REGISTER_LAYER(exconv, ExpandConvLayer); REGISTER_LAYER(exconv, ExpandConvLayer);
REGISTER_LAYER(exconvt, ExpandConvLayer); REGISTER_LAYER(exconvt, ExpandConvLayer);
inline bool isDepthwiseConv(int channels, int groups) {
return channels == groups;
}
bool ExpandConvLayer::init(const LayerMap &layerMap, bool ExpandConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) { const ParameterMap &parameterMap) {
/* Initialize the basic convolutional parent class */ /* Initialize the basic convolutional parent class */
...@@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -47,14 +51,27 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]}; std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) { // Convolution Layer uses the GemmConv function by default.
convType = "GemmConv";
convGradInputType = "GemmConvGradInput";
convGradFilterType = "GemmConvGradFilter";
// If depth wise convolution and useGpu == true
if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
convType = "DepthwiseConv"; convType = "DepthwiseConv";
convGradInputType = "DepthwiseConvGradInput"; convGradInputType = "DepthwiseConvGradInput";
convGradFilterType = "DepthwiseConvGradFilter"; convGradFilterType = "DepthwiseConvGradFilter";
} else { }
convType = "GemmConv";
convGradInputType = "GemmConvGradInput"; // If depth wise convolution and useGpu == false and ARM-NEON
convGradFilterType = "GemmConvGradFilter"; if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
if ((filterSize_[i] == filterSizeY_[i]) &&
(filterSize_[i] == 3 || filterSize_[i] == 4) &&
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
convType = "NeonDepthwiseConv";
}
#endif
} }
if (FLAGS_use_nnpack && !isDeconv_) { if (FLAGS_use_nnpack && !isDeconv_) {
......
...@@ -80,13 +80,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) { ...@@ -80,13 +80,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
<< "input of " << getName() << "input of " << getName()
<< " must be a sequence or a nested sequence."; << " must be a sequence or a nested sequence.";
CHECK_EQ(input.value->getWidth(), 1UL) CHECK_EQ(input.value->getWidth(), 1UL)
<< "input of " << getName() << "input of " << getName() << " are scores over a sequence or "
<< " is score over a sequence or a nested sequence, so its width " << "a nested sequence, so its width must be 1.";
<< " must be 1.";
if (useGpu_) { if (useGpu_) {
// this Layer runs only in CPU, if the model is runing on GPU, /*
// then copy the input to this layer from GPU to CPU. * currently, this Layer only runs in CPU, if the other part of the model is
* runing on GPU, then copy the input to this layer from GPU to CPU.
*/
Matrix::resizeOrCreate(scores_, Matrix::resizeOrCreate(scores_,
inputScore->getHeight(), inputScore->getHeight(),
1, 1,
...@@ -97,6 +98,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) { ...@@ -97,6 +98,14 @@ void KmaxSeqScoreLayer::forward(PassType passType) {
scores_ = inputScore; scores_ = inputScore;
} }
/*
* TODO(caoying)
* In PaddePaddle, currently all matrices are real number types,
* but output of this layer which is some selected indices of the give
* sequence are actually filled with int types so that storing int types
* information in a real number matrix is dangerous, since real numbers will
* be convered to int types.
*/
Matrix::resizeOrCreate( Matrix::resizeOrCreate(
output_.value, output_.value,
input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(), input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
......
...@@ -41,7 +41,7 @@ namespace paddle { ...@@ -41,7 +41,7 @@ namespace paddle {
Layer::Layer(const LayerConfig& config, bool useGpu) Layer::Layer(const LayerConfig& config, bool useGpu)
: config_(config), : config_(config),
useGpu_(useGpu), useGpu_(useGpu),
deviceId_(-1), deviceId_(CPU_DEVICE),
needSequenceInfo_(true) {} needSequenceInfo_(true) {}
bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
......
...@@ -59,7 +59,12 @@ protected: ...@@ -59,7 +59,12 @@ protected:
LayerConfig config_; LayerConfig config_;
/// whether to use GPU /// whether to use GPU
bool useGpu_; bool useGpu_;
/// Device Id. CPU is -1, and GPU is 0, 1, 2 ... /// Paddle device ID, MKLDNN is -2, CPU is -1
enum PADDLE_DEVICE_ID {
MKLDNN_DEVICE = -2,
CPU_DEVICE = -1,
};
/// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
int deviceId_; int deviceId_;
/// Input layers /// Input layers
std::vector<LayerPtr> inputLayers_; std::vector<LayerPtr> inputLayers_;
...@@ -77,6 +82,7 @@ protected: ...@@ -77,6 +82,7 @@ protected:
Argument output_; Argument output_;
/// Several outputs stored on different devices, used in 'parallel_nn' case, /// Several outputs stored on different devices, used in 'parallel_nn' case,
/// and record them by deviceId_. /// and record them by deviceId_.
/// Also used in 'use_mkldnn' case.
std::vector<Argument> outputOtherDevice_; std::vector<Argument> outputOtherDevice_;
/// If there are several outputs, map them by each name. /// If there are several outputs, map them by each name.
std::map<std::string, Argument*> outputMap_; std::map<std::string, Argument*> outputMap_;
...@@ -172,6 +178,13 @@ protected: ...@@ -172,6 +178,13 @@ protected:
return inputLayer.getOutput(deviceId_); return inputLayer.getOutput(deviceId_);
} }
/**
* Get the argument of input layer with deviceId.
*/
const Argument& getInput(size_t inputIndex, int deviceId) const {
return inputLayers_[inputIndex]->getOutput(deviceId);
}
/** /**
* Get the forward-input value. * Get the forward-input value.
*/ */
...@@ -186,6 +199,13 @@ protected: ...@@ -186,6 +199,13 @@ protected:
return inputLayer.getOutput(deviceId_).value; return inputLayer.getOutput(deviceId_).value;
} }
/**
* Get the forward-input value with deviceId.
*/
const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
return inputLayers_[inputIndex]->getOutput(deviceId).value;
}
/** /**
* Get the forward-input grad. * Get the forward-input grad.
*/ */
...@@ -200,6 +220,13 @@ protected: ...@@ -200,6 +220,13 @@ protected:
return inputLayer.getOutput(deviceId_).grad; return inputLayer.getOutput(deviceId_).grad;
} }
/**
* Get the forward-input grad.
*/
const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
return inputLayers_[inputIndex]->getOutput(deviceId).grad;
}
/** /**
* Get the forward-input label. * Get the forward-input label.
*/ */
......
...@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { ...@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
return; return;
} }
// TODO(TJ): dst format should get from wgtVal_ CHECK(wgtVal_) << "should have been initialized";
int dstFmt = PARAM_FORMAT_MKLDNN_OI; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
int srcFmt = weight_->getParameterPtr()->getHeaderFormat(); auto targetDim = wgtVal_->getDims();
if (srcFmt == dstFmt) { auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
return; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
}
// The weight_ is transposed from initial paddle weight
MatrixPtr paddleWgt = Matrix::create(
weight_->getW()->getData(), iLayerSize_, oc_, false, false);
// TODO(TJ): remove this print when do not need differ weights
std::ostringstream ostr;
paddleWgt->print(ostr);
VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
// The mkldnn weight is transposed from initial paddle matrix
MatrixPtr paddleWgtT;
paddleWgt->transpose(paddleWgtT, true);
weight_->getW()->copyFrom(*paddleWgtT);
weight_->getParameterPtr()->setHeaderFormat(dstFmt);
hasInitedWgt_ = true; hasInitedWgt_ = true;
} }
void MKLDNNFcLayer::convertWeightsToPaddle() { void MKLDNNFcLayer::convertWeightsToPaddle() {
MatrixPtr dnnWgt = weight_->getW(); CHECK(wgtVal_) << "should have been initialized";
MatrixPtr paddleWgt; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
dnnWgt->transpose(paddleWgt, true); auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
// copy paddle weight and override on weight_ wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
MatrixPtr dnnWgtT = Matrix::create( }
dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
dnnWgtT->copyFrom(*paddleWgt); void MKLDNNFcLayer::convertOutputToOtherDevice() {
copyOutputInfoToOtherDevice();
// find other cpu device and reorder output to cpu device
int cnt = 0;
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
// fc cpu output value do not need convert
// just share point
outputOtherDevice_[i].value = output_.value;
++cnt;
}
}
if (cnt > 1) {
LOG(WARNING) << "should not have more than one CPU devie";
}
} }
void MKLDNNFcLayer::reshape() { void MKLDNNFcLayer::reshape() {
const Argument& input = getInput(0); const Argument& input = getInput(0, getPrev(0)->getDeviceId());
int batchSize = input.getBatchSize(); int batchSize = input.getBatchSize();
if (bs_ == batchSize) { if (bs_ == batchSize) {
return; return;
...@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() { ...@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
if (iw_ == 0) { if (iw_ == 0) {
iw_ = 1; iw_ = 1;
} }
hasSpatial_ = true;
if (ih_ == 1 && iw_ == 1) {
hasSpatial_ = false;
}
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
ic_ = iLayerSize_ / (ih_ * iw_); ic_ = iLayerSize_ / (ih_ * iw_);
CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible"; CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
...@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() { ...@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {
void MKLDNNFcLayer::resetFwd() { void MKLDNNFcLayer::resetFwd() {
bool hasBias = biases_ && biases_->getW(); bool hasBias = biases_ && biases_->getW();
real* iData = getInputValue(0)->getData(); const MatrixPtr& wgt = weight_->getW();
real* oData = getOutputValue()->getData(); const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
real* wData = weight_->getW()->getData(); const MatrixPtr& out = output_.value;
real* bData = hasBias ? biases_->getW()->getData() : NULL;
if (inputIsOnlyMKLDNN()) {
// TODO(TJ): below create should be covered in MkldnnMatrix const MatrixPtr& in = getInputValue(0);
// create memory desc inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) CHECK(inVal_) << "Input should be MKLDNNMatrix";
: createMD({bs_, ic_}, format::nc); } else {
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw) CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
: createMD({oc_, ic_}, format::oi); const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
memory::desc bMD = bData != NULL ? createMD({oc_}, format::x) inVal_ = MKLDNNMatrix::create(
: createMD({}, format::format_undef); in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
memory::desc oMD = createMD({bs_, oc_}, format::nc); }
inVal_->downSpatial();
// create memory primitive desc and memory self wgtVal_ = MKLDNNMatrix::create(
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData)); wgtVal_->downSpatial();
outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData)); biasVal_ =
hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
// change original output value to mkldnn output value
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
if (!outputIsOnlyMKLDNN()) {
convertOutputToOtherDevice();
}
// create forward handle
prop_kind pk = prop_kind::forward; prop_kind pk = prop_kind::forward;
fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD) fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
: fc_fwd::desc(pk, iMD, wMD, oMD); inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
biasVal_->getMemoryDesc(),
outVal_->getMemoryDesc())
: fc_fwd::desc(pk,
inVal_->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
outVal_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
if (hasBias) {
if (bData != NULL) {
biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_)); fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
} else { } else {
fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_)); fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
} }
printValueFormatFlow();
pipelineFwd_.clear(); pipelineFwd_.clear();
pipelineFwd_.push_back(*fwd_); pipelineFwd_.push_back(*fwd_);
} }
...@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() { ...@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
return; return;
} }
needResetBwd_ = false; needResetBwd_ = false;
bool hasBias = biases_ && biases_->getWGrad(); bool hasBias = biases_ && biases_->getWGrad();
real* iData = getInputValue(0)->getData();
real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
real* oDiff = getOutputGrad()->getData();
real* wDiff = weight_->getWGrad()->getData();
real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
/// backward weight /// backward weight
// create memory desc for backward memory CHECK(inVal_) << "Should have input value";
memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw) const MatrixPtr& wgt = weight_->getWGrad();
: createMD({bs_, ic_}, format::nc); const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
: createMD({oc_, ic_}, format::oi); // TODO(TJ): merge outgrad
memory::desc oMD = createMD({bs_, oc_}, format::nc); int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x) // for MKLDNN device:
: createMD({}, format::format_undef); // can not directly cast outputgrad to mkldnnmatrix,
// since each layer can not write the inputgrad to mkldnn inputgrad.
if (inVal_) { // So just create from matrix with outputvalue format.
// update data // for CPU device:
inVal_->set_data_handle(iData); // fc do not need to convert from cpu device since output is always nc format
} else { // only need create from cpu device
inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData)); const MatrixPtr& out = getOutput(device).grad;
} outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
// create memory primitive desc and memory self biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff)); : nullptr;
outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
// create memory primitive desc
fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD); fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_); fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL fc_bwdWgt::desc bwdWgtDesc = hasBias
? fc_bwdWgt::desc(iMD, wMD, bMD, oMD) ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
: fc_bwdWgt::desc(iMD, wMD, oMD); wgtGrad_->getMemoryDesc(),
biasGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc())
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdWgt::primitive_desc bwdWgtPD = fc_bwdWgt::primitive_desc bwdWgtPD =
fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD); fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
if (bDiff != NULL) { if (hasBias) {
biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
bwdWgt_.reset( bwdWgt_.reset(
new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_)); new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
} else { } else {
...@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() { ...@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
pipelineBwd_.push_back(*bwdWgt_); pipelineBwd_.push_back(*bwdWgt_);
/// backward data /// backward data
if (iDiff == NULL) { device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
const MatrixPtr& in = getInputGrad(0, device);
if (in == nullptr) {
return; return;
} }
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD); if (getInput(0, device).getAllCount() > 1) {
// TODO(TJ): use outputMaps_ ways when merge outgrad done
} else {
inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
}
fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
wgtGrad_->getMemoryDesc(),
outGrad_->getMemoryDesc());
fc_bwdData::primitive_desc bwdDataPD = fc_bwdData::primitive_desc bwdDataPD =
fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD); fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
CHECK(wgtVal_) << "Should have weight memory"; CHECK(wgtVal_) << "Should have weight memory";
bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_)); bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
printGradFormatFlow();
pipelineBwd_.push_back(*bwdData_); pipelineBwd_.push_back(*bwdData_);
} }
...@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) { ...@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
{ {
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
syncInputValue();
// update input data
// since it might be changed if this is after data layer
real* iData = getInputValue(0)->getData();
inVal_->set_data_handle(iData);
// just submit forward pipeline // just submit forward pipeline
stream_->submit(pipelineFwd_); stream_->submit(pipelineFwd_);
...@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) { ...@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str()); REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
resetBwd(); resetBwd();
// update diff syncOutputGrad();
real* oDiff = getOutputGrad()->getData();
outGrad_->set_data_handle(oDiff);
// just sumbmit backward pipeline // just sumbmit backward pipeline
stream_->submit(pipelineBwd_); stream_->submit(pipelineBwd_);
} }
......
...@@ -32,16 +32,13 @@ protected: ...@@ -32,16 +32,13 @@ protected:
// if has already init the weight // if has already init the weight
bool hasInitedWgt_; bool hasInitedWgt_;
// if input layer has image size info (ih>1 && iw>1)
bool hasSpatial_;
// fc weight and bias // fc weight and bias
std::unique_ptr<Weight> weight_; std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> biases_; std::unique_ptr<Weight> biases_;
public: public:
explicit MKLDNNFcLayer(const LayerConfig& config) explicit MKLDNNFcLayer(const LayerConfig& config)
: MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {} : MKLDNNLayer(config), hasInitedWgt_(false) {}
~MKLDNNFcLayer() {} ~MKLDNNFcLayer() {}
...@@ -75,6 +72,8 @@ protected: ...@@ -75,6 +72,8 @@ protected:
* only would be called when needed * only would be called when needed
*/ */
void resetBwd(); void resetBwd();
void convertOutputToOtherDevice() override;
}; };
} // namespace paddle } // namespace paddle
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include "Layer.h" #include "Layer.h"
#include "MKLDNNBase.h" #include "MKLDNNBase.h"
#include "mkldnn.hpp" #include "mkldnn.hpp"
#include "paddle/math/MKLDNNMatrix.h"
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_mkldnn_wgt);
namespace paddle { namespace paddle {
...@@ -52,15 +52,15 @@ protected: ...@@ -52,15 +52,15 @@ protected:
std::vector<mkldnn::primitive> pipelineFwd_; std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_; std::vector<mkldnn::primitive> pipelineBwd_;
// TODO(TJ): change below memory as MKLDNNMatrixPtr type // MKLDNNMatrixPtr
std::shared_ptr<mkldnn::memory> inVal_; MKLDNNMatrixPtr inVal_;
std::shared_ptr<mkldnn::memory> inGrad_; MKLDNNMatrixPtr inGrad_;
std::shared_ptr<mkldnn::memory> outVal_; MKLDNNMatrixPtr outVal_;
std::shared_ptr<mkldnn::memory> outGrad_; MKLDNNMatrixPtr outGrad_;
std::shared_ptr<mkldnn::memory> wgtVal_; MKLDNNMatrixPtr wgtVal_;
std::shared_ptr<mkldnn::memory> wgtGrad_; MKLDNNMatrixPtr wgtGrad_;
std::shared_ptr<mkldnn::memory> biasVal_; MKLDNNMatrixPtr biasVal_;
std::shared_ptr<mkldnn::memory> biasGrad_; MKLDNNMatrixPtr biasGrad_;
public: public:
explicit MKLDNNLayer(const LayerConfig& config) explicit MKLDNNLayer(const LayerConfig& config)
...@@ -83,17 +83,21 @@ public: ...@@ -83,17 +83,21 @@ public:
virtual bool init(const LayerMap& layerMap, virtual bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) { if (!Layer::init(layerMap, parameterMap)) {
return false; return false;
} }
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
stream_.reset(new MKLDNNStream()); stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine(); engine_ = CPUEngine::Instance().getEngine();
// TODO(TJ): deivecId
return true; return true;
} }
...@@ -109,6 +113,12 @@ public: ...@@ -109,6 +113,12 @@ public:
*/ */
virtual void convertWeightsToPaddle() {} virtual void convertWeightsToPaddle() {}
/**
* convert MKLDNN output to other device.
* only support CPU device yet
*/
virtual void convertOutputToOtherDevice() {}
/** /**
* print info about sizes * print info about sizes
*/ */
...@@ -118,14 +128,124 @@ public: ...@@ -118,14 +128,124 @@ public:
<< ", oh: " << oh_ << ", ow: " << ow_; << ", oh: " << oh_ << ", ow: " << ow_;
} }
// TODO(TJ): move to MkldnnMatrix /**
// create memory desc * Print the mkldnn memory format flow of value
inline mkldnn::memory::desc createMD( */
mkldnn::memory::dims dims, virtual void printValueFormatFlow() {
mkldnn::memory::format fmt, if (inVal_ && outVal_) {
mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) { VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
// TODO(TJ): isFmtSuppoted(fmt) << " >>> " << outVal_->getFormat();
return mkldnn::memory::desc(dims, type, fmt); }
}
/**
* Print the mkldnn memory format flow of grad
*/
virtual void printGradFormatFlow() {
if (inGrad_ && outGrad_) {
VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
<< " <<< " << outGrad_->getFormat();
}
}
protected:
/**
* copy image size and sequence info to other device
* @note: can not directly use Layer::copyOutputToOtherDevice since here only
* copy base info and do not copy data value
*/
void copyOutputInfoToOtherDevice() {
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
outputOtherDevice_[i].sequenceStartPositions =
output_.sequenceStartPositions;
outputOtherDevice_[i].subSequenceStartPositions =
output_.subSequenceStartPositions;
outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
}
}
/**
* If input only has MKLDNN device.
* Otherwise, only support the previous layer using CPU device.
*/
bool inputIsOnlyMKLDNN(int index = 0) {
int prevDevice = getPrev(index)->getDeviceId();
if (prevDevice == MKLDNN_DEVICE) {
return true;
} else {
// do not support GPU yet
CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
return false;
}
}
/**
* If output only has MKLDNN device.
* Otherwise, other devices should only using CPU device.
*/
bool outputIsOnlyMKLDNN() {
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
<< "Only support other device is CPU yet";
}
return outputOtherDevice_.size() == 0;
}
/**
* Sync input value data
*/
void syncInputValue() {
if (inputIsOnlyMKLDNN()) {
return;
}
real* iData = getInputValue(0, CPU_DEVICE)->getData();
// update input data
// since it might be changed if this is after data layer
inVal_->updateData(iData);
}
/**
* Sync output grad data
*/
void syncOutputGrad() {
if (outputIsOnlyMKLDNN()) {
return;
}
// update diff
real* oDiff = getOutput(CPU_DEVICE).grad->getData();
outGrad_->updateData(oDiff);
}
/**
* Set deviceId of this layer.
*/
void setDevice(int id) { deviceId_ = id; }
/**
* Set deviceId of the params used in this layer.
*/
void setParamsDevice(int id, const ParameterMap& parameterMap) {
for (auto& inputConfig : config_.inputs()) {
if (inputConfig.has_input_parameter_name()) {
ParameterPtr parameter;
std::string name = inputConfig.input_parameter_name();
CHECK(mapGet(name, parameterMap, &parameter))
<< "Cannot find input parameter " << name << " for layer "
<< getName();
parameter->setDevice(id);
}
}
if (config_.has_bias_parameter_name()) {
ParameterPtr parameter;
std::string name = config_.bias_parameter_name();
CHECK(mapGet(name, parameterMap, &parameter))
<< "Cannot find bias parameter " << name << " for layer "
<< getName();
parameter->setDevice(id);
}
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
namespace paddle {
/**
* A layer applies a linear transformation to each element in each row of
* the input matrix. For each element, the layer first re-scale it and then
* adds a bias to it.
*
* \f[
* y = wx + b
* \f]
*
* Here, w is the scale and b is the bias. Both w and b are trainable scalars.
*
*/
class ScaleShiftLayer : public Layer {
protected:
std::unique_ptr<Weight> scale_;
std::unique_ptr<Weight> offset_;
public:
explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(scale_shift, ScaleShiftLayer);
bool ScaleShiftLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 1U);
scale_.reset(new Weight(1, 1, parameters_[0]));
if (biasParameter_.get() != NULL) {
offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
}
return true;
}
void ScaleShiftLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV = getInputValue(0);
resetOutput(inV->getHeight(), inV->getWidth());
MatrixPtr outV = getOutputValue();
real scaleValue = scale_->getW()->getElement(0, 0);
outV->mulScalar(*inV, scaleValue);
if (offset_) {
real offsetValue = offset_->getW()->getElement(0, 0);
outV->add(offsetValue);
}
}
void ScaleShiftLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV = getInputValue(0);
MatrixPtr inG = getInputGrad(0);
MatrixPtr outV = getOutputValue();
MatrixPtr outG = getOutputGrad();
/* Calculate the parameter gradient for the current layer */
if (scale_->getWGrad()) {
MatrixPtr rowSumMtx;
Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
rowSumMtx->sumOfProducts(
/* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
scale_->getWGrad()->sumCols(
/* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
scale_->getParameterPtr()->incUpdate(callback);
}
if (offset_ && offset_->getWGrad()) {
MatrixPtr rowSumMtx;
Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
rowSumMtx->sumRows(*outG, 1., 0.);
offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
offset_->getParameterPtr()->incUpdate(callback);
}
/* Calculate the input layers error */
if (inG) {
real scaleValue = scale_->getW()->getElement(0, 0);
inG->add(*outG, scaleValue);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
class SequenceSliceLayer : public Layer {
public:
explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
private:
/*
* TODO(caoying)
* In PaddePaddle, currently all matrices are real number types,
* but the second and the (optional) third input which are some
* selected indices of the give sequence to trim the sequence, are actually
* filled with int types so that storing int types information in real number
* matrices is very dangerous, since real numbers will be convered to int
* types. If a user fills this matrix himself, invalid data may occor.
*/
MatrixPtr startIdsOnCpu_;
MatrixPtr endIdsOnCpu_;
std::vector<int> selectedRows_;
IVectorPtr rowIndice_;
std::vector<std::vector<int>> inputSeqInfoVec_;
std::vector<int> outSubSeqStartPos_;
std::vector<int> outSeqStartPos_;
void checkInputs();
void copySliceIdsToCpu();
void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
};
REGISTER_LAYER(seq_slice, SequenceSliceLayer);
bool SequenceSliceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
CHECK_GE(inputLayers_.size(), 2U);
CHECK_LE(inputLayers_.size(), 3U);
setNeedSequenceInfo(false);
return true;
}
void SequenceSliceLayer::checkInputs() {
const Argument& inputSeq = getInput(0);
CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
<< "must be a sequence.";
const MatrixPtr indices1 = getInputValue(1);
CHECK_EQ(static_cast<size_t>(indices1->getHeight()),
inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
: inputSeq.getNumSequences())
<< "Height of the second input should be equal to number of sequence "
<< "in the first input.";
if (inputLayers_.size() == 3) {
const MatrixPtr indices2 = getInputValue(2);
CHECK_EQ(indices2->getHeight(), indices1->getHeight())
<< "start indices and end indices should have the same height.";
CHECK_EQ(indices2->getWidth(), indices1->getWidth())
<< "start indices and end indices should have the same Width.";
}
}
void SequenceSliceLayer::copySliceIdsToCpu() {
const MatrixPtr indices1 = getInputValue(1);
if (inputLayers_.size() == 2U) {
if (config_.select_first()) {
Matrix::resizeOrCreate(startIdsOnCpu_,
indices1->getHeight(),
indices1->getWidth(),
false /* trans */,
false /* useGpu */);
startIdsOnCpu_->copyFrom(*indices1);
endIdsOnCpu_ = nullptr;
} else {
Matrix::resizeOrCreate(endIdsOnCpu_,
indices1->getHeight(),
indices1->getWidth(),
false /* trans */,
false /* useGpu */);
endIdsOnCpu_->copyFrom(*indices1);
startIdsOnCpu_ = nullptr;
}
} else if (inputLayers_.size() == 3U) {
Matrix::resizeOrCreate(startIdsOnCpu_,
indices1->getHeight(),
indices1->getWidth(),
false /* trans */,
false /* useGpu */);
startIdsOnCpu_->copyFrom(*indices1);
const MatrixPtr indices2 = getInputValue(2);
Matrix::resizeOrCreate(endIdsOnCpu_,
indices2->getHeight(),
indices2->getWidth(),
false /* trans */,
false /* useGpu */);
endIdsOnCpu_->copyFrom(*indices2);
}
}
void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
const MatrixPtr ends) {
CHECK(starts || ends) << "At least one of the start or end indices "
<< "should be given.";
bool hasSubseq = getInput(0).hasSubseq();
outSeqStartPos_.resize(1, 0);
outSubSeqStartPos_.resize(1, 0);
selectedRows_.clear();
size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
size_t rowIdx = 0;
for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
for (size_t k = 0; k < beamSize; ++k) {
if (starts && starts->getElement(rowIdx, k) == -1.) break;
if (ends && ends->getElement(rowIdx, k) == -1.) break;
int begPos = inputSeqInfoVec_[i][j];
if (starts) begPos += starts->getElement(rowIdx, k);
int endPos = inputSeqInfoVec_[i][j + 1] - 1;
if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
int seqLen = endPos - begPos + 1;
CHECK_GT(seqLen, 0U);
for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
hasSubseq
? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
: outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
}
rowIdx++;
}
if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
}
if (useGpu_) {
rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
} else {
rowIndice_ =
IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
}
// create the sequence information for the output.
ICpuGpuVector::resizeOrCreate(
output_.sequenceStartPositions, outSeqStartPos_.size(), false);
output_.sequenceStartPositions->copyFrom(
outSeqStartPos_.data(), outSeqStartPos_.size(), false);
if (hasSubseq) {
ICpuGpuVector::resizeOrCreate(
output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
output_.subSequenceStartPositions->copyFrom(
outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
}
}
void SequenceSliceLayer::forward(PassType passType) {
Layer::forward(passType);
checkInputs();
const Argument& inputSeq = getInput(0);
inputSeqInfoVec_.clear();
Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
inputSeq.subSequenceStartPositions,
inputSeqInfoVec_);
if (!useGpu_) {
if (inputLayers_.size() == 2U) {
startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
} else if (inputLayers_.size() == 3U) {
startIdsOnCpu_ = getInputValue(1);
endIdsOnCpu_ = getInputValue(2);
}
} else {
copySliceIdsToCpu();
}
/*
* calculate the selected row indices in a batch, and build the output
* sequence information.
*/
calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
resetOutput(selectedRows_.size(), getSize());
getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
}
void SequenceSliceLayer::backward(const UpdateCallback& callback) {
getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
}
} // namespace paddle
...@@ -52,23 +52,34 @@ private: ...@@ -52,23 +52,34 @@ private:
* ] * ]
* *
* ths output is saved to private member rowIndice_; * ths output is saved to private member rowIndice_;
* [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
* 16,17,18,19,20,21,22,23,24,25,26,27]
*/ */
void calSelectedCols(const MatrixPtr selectedIndices, void calSelectedRows(const MatrixPtr selectedIndices,
const std::vector<std::vector<int>>& inputSeqInfo); const std::vector<std::vector<int>>& inputSeqInfo);
// if the second input of this layer is on GPU memory, copy it to CPU memory. /*
* TODO(caoying)
* In PaddePaddle, currently all matrices are real number types,
* but the second is some selected indices of the give sequence to trim
* the nested sequence, are actually filled with int types so that storing
* int types information in real number matrices is very dangerous, since
* real numbers will be convered to int types. If a user fills this matrix
* himself, invalid data may occor.
*
* if the second input of this layer is on GPU memory, copy it to CPU memory.
*/
MatrixPtr selIdsCpu_; MatrixPtr selIdsCpu_;
// reorganized sequenceStartPositions and subSequenceStartPositions /*
// into a 2d vector to facilitate the sequence selection process. * reorganize sequenceStartPositions and subSequenceStartPositions
* into a 2d vector to facilitate the sequence selection process.
*/
std::vector<std::vector<int>> inputSeqInfoVec_; std::vector<std::vector<int>> inputSeqInfoVec_;
// the final selected row indices in a batch, /* store the final selected row indices in a batch */
// rowIdx_ and selectedRows_ actually share a same memory.
IVectorPtr rowIndice_; IVectorPtr rowIndice_;
/* rowIndice_ and selectedRows_ actually share a same memory. */
std::vector<int> selectedRows_; std::vector<int> selectedRows_;
}; };
...@@ -83,7 +94,7 @@ bool SubNestedSequenceLayer::init(const LayerMap& layerMap, ...@@ -83,7 +94,7 @@ bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
return true; return true;
} }
void SubNestedSequenceLayer::calSelectedCols( void SubNestedSequenceLayer::calSelectedRows(
const MatrixPtr selectedIndices, const MatrixPtr selectedIndices,
const std::vector<std::vector<int>>& inputSeqInfo) { const std::vector<std::vector<int>>& inputSeqInfo) {
selectedRows_.clear(); selectedRows_.clear();
...@@ -160,7 +171,7 @@ void SubNestedSequenceLayer::forward(PassType passType) { ...@@ -160,7 +171,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions, Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
inputSeq.subSequenceStartPositions, inputSeq.subSequenceStartPositions,
inputSeqInfoVec_); inputSeqInfoVec_);
calSelectedCols(selIdsCpu_, inputSeqInfoVec_); calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
resetOutput(selectedRows_.size(), getSize()); resetOutput(selectedRows_.size(), getSize());
getOutputValue()->selectRows(*getInputValue(0), *rowIndice_); getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
......
...@@ -34,6 +34,12 @@ add_unittest_without_exec(test_CRFLayerGrad ...@@ -34,6 +34,12 @@ add_unittest_without_exec(test_CRFLayerGrad
add_test(NAME test_CRFLayerGrad add_test(NAME test_CRFLayerGrad
COMMAND test_CRFLayerGrad) COMMAND test_CRFLayerGrad)
################ test_SeqSliceLayerGrad ####################
add_unittest_without_exec(test_SeqSliceLayerGrad
test_SeqSliceLayerGrad.cpp
LayerGradUtil.cpp)
add_test(NAME test_SeqSliceLayerGrad
COMMAND test_SeqSliceLayerGrad)
add_unittest_without_exec(test_ActivationGrad add_unittest_without_exec(test_ActivationGrad
test_ActivationGrad.cpp test_ActivationGrad.cpp
......
...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_ONLY_CPU
#include <cudnn.h>
#endif
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -189,10 +192,16 @@ TEST(Projection, scaling) { ...@@ -189,10 +192,16 @@ TEST(Projection, scaling) {
void testProjectionConv(size_t groups, bool isDeconv) { void testProjectionConv(size_t groups, bool isDeconv) {
const int NUM_FILTERS = 18; const int NUM_FILTERS = 18;
const int FILTER_SIZE = 2; const int FILTER_SIZE = 2;
const int FILTER_SIZE_Y = 4; const int FILTER_SIZE_Y = 2;
const int CHANNELS = 3; const int CHANNELS = 3;
const int IMAGE_SIZE = 16; const int IMAGE_SIZE = 16;
#if CUDNN_VERSION >= 6000
const int DILATION = 2;
#else
const int DILATION = 1;
#endif
ProjectionConfig conf; ProjectionConfig conf;
if (isDeconv) { if (isDeconv) {
conf.set_type("convt"); conf.set_type("convt");
...@@ -209,6 +218,8 @@ void testProjectionConv(size_t groups, bool isDeconv) { ...@@ -209,6 +218,8 @@ void testProjectionConv(size_t groups, bool isDeconv) {
conv->set_padding_y(1); conv->set_padding_y(1);
conv->set_stride(2); conv->set_stride(2);
conv->set_stride_y(2); conv->set_stride_y(2);
conv->set_dilation(DILATION);
conv->set_dilation_y(DILATION);
conv->set_groups(groups); conv->set_groups(groups);
if (isDeconv) { if (isDeconv) {
conv->set_filter_channels(NUM_FILTERS / conv->groups()); conv->set_filter_channels(NUM_FILTERS / conv->groups());
...@@ -217,12 +228,12 @@ void testProjectionConv(size_t groups, bool isDeconv) { ...@@ -217,12 +228,12 @@ void testProjectionConv(size_t groups, bool isDeconv) {
} }
conv->set_img_size(IMAGE_SIZE); conv->set_img_size(IMAGE_SIZE);
int output_x = outputSize(conv->img_size(), int output_x = outputSize(conv->img_size(),
conv->filter_size(), (conv->filter_size() - 1) * DILATION + 1,
conv->padding(), conv->padding(),
conv->stride(), conv->stride(),
/* caffeMode */ true); /* caffeMode */ true);
int output_y = outputSize(conv->img_size(), int output_y = outputSize(conv->img_size(),
conv->filter_size_y(), (conv->filter_size_y() - 1) * DILATION + 1,
conv->padding_y(), conv->padding_y(),
conv->stride_y(), conv->stride_y(),
/* caffeMode */ true); /* caffeMode */ true);
...@@ -424,27 +435,38 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -424,27 +435,38 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1); config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true); config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288}); int dilation = 1;
if (type == "cudnn_conv") {
#if CUDNN_VERSION >= 6000
dilation = 2;
#else
dilation = 1;
#endif
}
config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
LayerInputConfig* input = config.layerConfig.add_inputs(); LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf(); ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(2); conv->set_filter_size(2);
conv->set_filter_size_y(3); conv->set_filter_size_y(2);
conv->set_channels(3); conv->set_channels(3);
conv->set_padding(0); conv->set_padding(0);
conv->set_padding_y(1); conv->set_padding_y(1);
conv->set_stride(2); conv->set_stride(2);
conv->set_stride_y(2); conv->set_stride_y(2);
conv->set_dilation(dilation);
conv->set_dilation_y(dilation);
conv->set_groups(1); conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(16); conv->set_img_size(16);
conv->set_img_size_y(8); conv->set_img_size_y(16);
conv->set_output_x(outputSize(conv->img_size(), conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(), (conv->filter_size() - 1) * dilation + 1,
conv->padding(), conv->padding(),
conv->stride(), conv->stride(),
/* caffeMode */ true)); /* caffeMode */ true));
conv->set_output_y(outputSize(conv->img_size_y(), conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(), (conv->filter_size_y() - 1) * dilation + 1,
conv->padding_y(), conv->padding_y(),
conv->stride_y(), conv->stride_y(),
/* caffeMode */ true)); /* caffeMode */ true));
...@@ -828,9 +850,27 @@ TEST(Layer, square_error_weighted) { ...@@ -828,9 +850,27 @@ TEST(Layer, square_error_weighted) {
} }
} }
TEST(Layer, huber_regression_loss) {
TestConfig config;
config.layerConfig.set_type("huber_regression");
config.biasSize = 0;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
for (auto delta : {1, 3, 5}) {
config.layerConfig.set_delta(delta);
testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
}
}
}
TEST(Layer, huber_two_class) { TEST(Layer, huber_two_class) {
TestConfig config; TestConfig config;
config.layerConfig.set_type("huber"); config.layerConfig.set_type("huber_classification");
config.biasSize = 0; config.biasSize = 0;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
...@@ -839,7 +879,7 @@ TEST(Layer, huber_two_class) { ...@@ -839,7 +879,7 @@ TEST(Layer, huber_two_class) {
config.layerConfig.add_inputs(); config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) { for (auto useGpu : {false, true}) {
testLayerGrad(config, "huber", 100, /* trans */ false, useGpu); testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
} }
} }
...@@ -2007,6 +2047,21 @@ TEST(Layer, RowL2NormLayer) { ...@@ -2007,6 +2047,21 @@ TEST(Layer, RowL2NormLayer) {
} }
} }
TEST(Layer, ScaleShiftLayer) {
const size_t batchSize = 16;
const size_t size = 32;
TestConfig config;
config.layerConfig.set_type("scale_shift");
config.layerConfig.set_size(size);
config.biasSize = 1;
config.inputDefs.push_back(
{INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
}
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -269,7 +269,8 @@ TEST(Compare, img_conv2) { ...@@ -269,7 +269,8 @@ TEST(Compare, img_conv2) {
bool useGpu = FLAGS_use_gpu; bool useGpu = FLAGS_use_gpu;
double eps = FLAGS_checkgrad_eps; double eps = FLAGS_checkgrad_eps;
FLAGS_use_gpu = true; FLAGS_use_gpu = true;
FLAGS_checkgrad_eps = 1e-2; // Sometimes, this unit test will fail with 1e-2
FLAGS_checkgrad_eps = 4e-2;
compareNetwork(config_file_a, config_file_b); compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu; FLAGS_use_gpu = useGpu;
FLAGS_checkgrad_eps = eps; FLAGS_checkgrad_eps = eps;
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/trainer/Trainer.h"
#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(thread_local_rand_use_global_seed);
const int MAX_SEQ_NUM = 17;
const int MAX_SEQ_LEN = 23;
const int MAX_BEAM_SIZE = 13;
const size_t SEED = (size_t)(time(NULL));
vector<real> randSampling(real range, int n) {
CHECK_GE(range, n);
vector<real> num(range);
iota(begin(num), end(num), 0.);
if (range == n) return num;
random_shuffle(begin(num), end(num));
num.resize(n);
sort(begin(num), end(num));
return num;
}
void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
seqStartPos.resize(1, 0);
subSeqStartPos.resize(1, 0);
srand(SEED);
int seqNum = 1 + (rand() % MAX_SEQ_NUM);
for (int i = 0; i < seqNum; ++i) {
int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
for (int j = 0; j < subSeqNum; ++j)
subSeqStartPos.push_back(subSeqStartPos.back() +
(1 + (rand() % MAX_SEQ_LEN)));
seqStartPos.push_back(subSeqStartPos.back());
}
}
/*
generate start indices according to sequence start positions.
*/
void genStarts(vector<int>& seqStartPos,
vector<vector<real>>& starts,
size_t beamSize) {
starts.clear();
starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
int seqLen = seqStartPos[i + 1] - seqStartPos[i];
vector<real> randStarts =
randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
copy(begin(randStarts), end(randStarts), begin(starts[i]));
}
}
/*
generate end indices according to sequence start positions and start indices.
*/
void genEnds(vector<int>& seqStartPos,
vector<vector<real>>& starts,
vector<vector<real>>& ends,
size_t beamSize) {
CHECK_EQ(seqStartPos.size() - 1, starts.size());
ends.clear();
ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
for (size_t i = 0; i < starts.size(); ++i) {
for (size_t j = 0; j < starts[i].size(); ++j) {
int seqLen = seqStartPos[i + 1] - seqStartPos[i];
CHECK_GE(seqLen - 1, starts[i][j]);
if (starts[i][j] == -1.) break;
if (starts[i][j] == (seqLen - 1)) {
ends[i][j] = starts[i][j];
} else {
ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
}
}
}
}
void genTestData(vector<int>& seqStartPos,
vector<int>& subSeqStartPos,
vector<vector<real>>& starts,
vector<vector<real>>& ends,
bool hasSubseq) {
size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
genSeqInfo(seqStartPos, subSeqStartPos);
genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
}
template <typename T>
void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
size_t totalSize{0};
for (auto const& items : inVec) totalSize += items.size();
outVec.reserve(totalSize);
for (auto& items : inVec)
move(items.begin(), items.end(), back_inserter(outVec));
}
void testSeqSliceLayer(bool hasSubseq,
bool useGpu,
vector<int>& seqStartPos,
vector<int>& subSeqStartPos,
vector<vector<real>>& starts,
vector<vector<real>>& ends) {
// layer size is not crutial for this layer,
// so here use a small layer size in the unittest.
const size_t layerSize{4};
TestConfig config;
config.layerConfig.set_type("seq_slice");
config.layerConfig.set_size(layerSize);
// add the first input
MatrixPtr seqInputPtr =
Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
layerSize,
false,
false);
seqInputPtr->randomizeUniform();
if (hasSubseq) {
config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
"seq_input",
seqInputPtr,
seqStartPos,
subSeqStartPos});
} else {
config.inputDefs.push_back(
{INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
}
config.layerConfig.add_inputs();
// add start indices
if (starts.size()) {
vector<real> startsToVec;
flatten2dVector(starts, startsToVec);
MatrixPtr startMatrixPtr =
Matrix::create(starts.size(), starts[0].size(), false, false);
startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
config.inputDefs.push_back(
{INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
config.layerConfig.add_inputs();
config.layerConfig.set_select_first(true);
}
// add end indices
if (ends.size()) {
vector<real> endsToVec;
flatten2dVector(ends, endsToVec);
MatrixPtr endMatrixPtr =
Matrix::create(ends.size(), ends[0].size(), false, false);
endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
config.layerConfig.add_inputs();
config.layerConfig.set_select_first(false);
}
testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
}
TEST(Layer, SeqSliceLayer) {
vector<int> seqStartPos;
vector<int> subSeqStartPos;
vector<vector<real>> starts;
vector<vector<real>> ends;
std::vector<bool> mode = {false};
#ifndef PADDLE_ONLY_CPU
mode.push_back(true);
#endif
genSeqInfo(seqStartPos, subSeqStartPos);
for (bool hasSubseq : {true, false}) {
LOG(INFO) << "hasSubSeq : " << hasSubseq;
genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
for (bool useGpu : mode) {
vector<vector<real>> tmp;
testSeqSliceLayer(
hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
testSeqSliceLayer(
hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
testSeqSliceLayer(
hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
}
}
}
int main(int argc, char** argv) {
initMain(argc, argv);
hl_start();
hl_init(FLAGS_gpu_id);
FLAGS_thread_local_rand_use_global_seed = true;
srand(1);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
...@@ -48,7 +48,13 @@ public: ...@@ -48,7 +48,13 @@ public:
*/ */
virtual void* alloc(size_t size) { virtual void* alloc(size_t size) {
void* ptr; void* ptr;
#ifdef PADDLE_USE_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
#else
CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0); CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
#endif
CHECK(ptr) << "Fail to allocate CPU memory: size=" << size; CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
return ptr; return ptr;
} }
......
...@@ -14,6 +14,17 @@ ...@@ -14,6 +14,17 @@
# #
file(GLOB MATH_HEADERS . *.h) file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp) file(GLOB MATH_SOURCES . *.cpp)
if(NOT WITH_MKLDNN)
set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
message(STATUS "Skip compiling with MKLDNNMatrix")
else()
message(STATUS "Compile with MKLDNNMatrix")
endif()
set(MATH_SOURCES set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNMatrix.h"
using namespace mkldnn; // NOLINT
namespace paddle {
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
memory::desc md = pd.desc();
size_t ndims = md.data.ndims;
int* dims = md.data.dims;
CHECK(ndims > 0) << "Input dims should not be empty";
size_t cnts = 1;
for (size_t i = 0; i < ndims; ++i) {
cnts *= dims[i];
}
if (m == nullptr) {
size_t height = dims[0];
size_t width = cnts / dims[0];
m = Matrix::create(height, width, false, false);
}
CHECK(m) << " Matrix should not be empty";
CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
return std::make_shared<MKLDNNMatrix>(
m->getData(), m->getHeight(), m->getWidth(), pd);
}
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
memory::dims dims,
memory::format fmt,
engine& eg,
mkldnn::memory::data_type dtype) {
return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
}
void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
memory::format srcFmt,
memory::dims targetDim) {
memory::format dstFmt = getFormat();
if (srcFmt == dstFmt) {
return;
}
CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
}
void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
memory::format dstFmt,
memory::dims targetDim) {
memory::format srcFmt = getFormat();
if (srcFmt == dstFmt) {
return;
}
CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
}
void MKLDNNMatrix::reorderOnce(void* srcData,
void* dstData,
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm) {
CHECK(srcData);
CHECK(dstData);
MatrixPtr tmpSrc;
if (dstData == srcData) {
// inplace data
size_t sz = 1;
for (size_t i = 0; i < dm.size(); ++i) {
sz *= dm[i];
}
tmpSrc = Matrix::create(sz, 1, false, false);
tmpSrc->copyFrom((real*)srcData, sz);
srcData = tmpSrc->getData();
}
auto dtype = this->getDtype();
auto srcMD = memory::desc(dm, dtype, srcFmt);
auto dstMD = memory::desc(dm, dtype, dstFmt);
auto eg = this->getEngine();
auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
auto r = reorder(src, dst);
stream(stream::kind::eager).submit({r}).wait();
}
void MKLDNNMatrix::downSpatial() {
int fmt = getFormat();
if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
// only support nchw and oihw yet, later can support more like nhwc, ihwo
return;
}
// TODO(TJ): change H(height) and W(width) if support nhwc or more
const int H = 2, W = 3;
memory::dims srcDims = getDims();
if (srcDims[H] != 1 || srcDims[W] != 1) {
// can not down spatial
return;
}
memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
memory::format dstFmt;
switch (fmt) {
case memory::format::nchw:
dstFmt = memory::format::nc;
break;
case memory::format::oihw:
dstFmt = memory::format::oi;
break;
default:
LOG(FATAL) << "unsupported format";
}
memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
mkldnn_primitive_t result;
mkldnn::error::wrap_c_api(
mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
"could not create a memory primitive");
reset(result);
set_data_handle(getData());
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "Matrix.h"
#include "mkldnn.hpp"
#include "paddle/parameter/Parameter.h"
namespace paddle {
class MKLDNNMatrix;
typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
/**
* @brief MKLDNN Matrix.
*
*/
class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
public:
MKLDNNMatrix(real* data,
size_t height,
size_t width,
mkldnn::memory::primitive_desc pd)
: CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
~MKLDNNMatrix() {}
/**
* Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
*/
static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
/**
* Create MKLDNNMatrix from a MatrixPtr and memory details info
*/
static MKLDNNMatrixPtr create(
MatrixPtr m,
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::engine& eg,
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
public:
/**
* Reorder this MKLDNNMatrix from other format.
* Support inplace reorder.
* @note: this function would only reorder the data layout.
* will NOT change this original dim or format info
*/
void reorderDataFrom(const MKLDNNMatrixPtr& m,
memory::format srcFmt,
memory::dims targetDim);
/**
* Reorder this MKLDNNMatrix to other format.
* Support inplace reorder.
* @note: this function would only reorder the data layout.
* will NOT change the dst dim or format info
*/
void reorderDataTo(const MKLDNNMatrixPtr& m,
memory::format dstFmt,
memory::dims targetDim);
/**
* Dimensionality reduction.
* Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
*/
void downSpatial();
/**
* Update the memory data handle.
* Caution: This will not check the buffer size of the data,
* it should be coverd by user.
*/
void updateData(void* data) { set_data_handle(data); }
/**
* Get primitive descriptor.
*/
mkldnn::memory::primitive_desc getPrimitiveDesc() {
return this->get_primitive_desc();
}
/**
* Get memory descriptor.
*/
mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
/**
* Get dimensions.
*/
mkldnn::memory::dims getDims() {
mkldnn::memory::desc md = getMemoryDesc();
const int* src = md.data.dims;
int ndims = md.data.ndims;
mkldnn::memory::dims dst;
dst.resize(ndims);
for (int i = 0; i < ndims; ++i) {
dst[i] = src[i];
}
return dst;
}
/**
* Get format.
*/
mkldnn::memory::format getFormat() {
return (mkldnn::memory::format)(getMemoryDesc().data.format);
}
/**
* Get memory data type.
*/
mkldnn::memory::data_type getDtype() {
return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
}
/**
* Get engine.
*/
mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
protected:
/**
* Do reorder once.
* Can support inplace.
*/
void reorderOnce(void* srcData,
void* dstData,
memory::format srcFmt,
memory::format dstFmt,
memory::dims dm);
};
} // namespace paddle
...@@ -42,9 +42,12 @@ function(op_library TARGET) ...@@ -42,9 +42,12 @@ function(op_library TARGET)
endfunction() endfunction()
add_subdirectory(math) add_subdirectory(math)
cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(gather_test SRCS gather_test.cc DEPS tensor)
op_library(gather_op SRCS gather_op.cc gather_op.cu)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
op_library(scatter_op SRCS scatter_op.cc scatter_op.cu)
cc_library(net_op SRCS net_op.cc DEPS op_registry) cc_library(net_op SRCS net_op.cc DEPS op_registry)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
...@@ -66,5 +69,7 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu) ...@@ -66,5 +69,7 @@ op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
DEPS framework_proto tensor op_registry operator net_op) DEPS framework_proto tensor op_registry operator net_op)
op_library(uniform_random_op op_library(uniform_random_op SRCS uniform_random_op.cc uniform_random_op.cu)
SRCS uniform_random_op.cc uniform_random_op.cu) op_library(lookup_table_op SRCS lookup_table_op.cc lookup_table_op.cu)
op_library(scale_op SRCS scale_op.cc scale_op.cu DEPS net_op)
op_library(minus_op SRCS minus_op.cc minus_op.cu DEPS scale_op)
...@@ -39,11 +39,10 @@ class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel { ...@@ -39,11 +39,10 @@ class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(const framework::InferShapeContext &ctx) const override { void InferShape(const framework::InferShapeContext &ctx) const override {
auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X")); auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto X = ctx.Input<Tensor>("X"); auto X = ctx.Input<Tensor>("X");
// TODO(superjom) add enforce here after helper functions ready dX->Resize(X->dims());
X_grad->Resize(X->dims());
} }
}; };
...@@ -70,9 +69,7 @@ namespace ops = paddle::operators; ...@@ -70,9 +69,7 @@ namespace ops = paddle::operators;
REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp, REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad, ops::OnehotCrossEntropyOpMaker, onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOp); ops::OnehotCrossEntropyGradientOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
onehot_cross_entropy, ops::OnehotCrossEntropyOpKernel<float>);
ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>); REGISTER_OP_CPU_KERNEL(onehot_cross_entropy_grad,
REGISTER_OP_CPU_KERNEL( ops::OnehotCrossEntropyGradientOpKernel<float>);
onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
...@@ -12,10 +12,122 @@ ...@@ -12,10 +12,122 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#define EIGEN_USE_GPU #include "paddle/framework/op_registry.h"
#include "paddle/operators/cross_entropy_op.h" #include "paddle/platform/assert.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__host__ __device__ T clipping_log(const T x) {
PADDLE_ASSERT(std::is_floating_point<T>::value);
const T kApproInf = 1e20;
T v = log(x);
if (v == INFINITY) {
return kApproInf;
}
if (v == -INFINITY) {
return -kApproInf;
}
return v;
}
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
const int N, const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
Y[i] = -clipping_log(X[i * D + label[i]]);
}
}
// TODO(qingqing): make zero setting an common function.
template <typename T>
__global__ void zero(T* X, const int N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
X[i] = 0.0;
}
}
template <typename T>
__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
const int* label, const int N,
const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
int idx = i * D + label[i];
dX[idx] = -dY[i] / X[idx];
}
}
template <typename T>
class OnehotCrossEntropyOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto X = ctx.Input<Tensor>("X");
const T* Xdata = X->data<T>();
const int* label_data = ctx.Input<Tensor>("label")->data<int>();
auto Y = ctx.Output<Tensor>("Y");
Y->mutable_data<T>(ctx.GetPlace());
T* Ydata = Y->data<T>();
int N = X->dims()[0];
int D = X->dims()[1];
int block = 512;
int grid = (N + block - 1) / block;
// TODO(qingqing) launch kernel on specified stream
// base on ExecutionContext.
CrossEntropyKernel<T><<<grid, block>>>(Ydata, Xdata, label_data, N, D);
}
};
template <typename T>
class OnehotCrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
auto X = ctx.Input<Tensor>("X");
auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
auto label = ctx.Input<Tensor>("label");
auto* dXdata = dX->template mutable_data<T>(ctx.GetPlace());
auto* dYdata = dY->template data<T>();
auto* Xdata = X->template data<T>();
auto* label_data = label->data<int>();
int N = X->dims()[0];
int D = X->dims()[1];
int block = 512;
int grid = (N * D + block - 1) / block;
zero<T><<<grid, block>>>(dXdata, N * D);
grid = (N + block - 1) / block;
// TODO(qingqing): launch kernel on specified stream
// base on ExecutionContext.
CrossEntropyGradientKernel<T><<<grid, block>>>(dXdata, dYdata, Xdata,
label_data, N, D);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
onehot_cross_entropy, ops::OnehotCrossEntropyOpCUDAKernel<float>);
ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>); REGISTER_OP_GPU_KERNEL(onehot_cross_entropy_grad,
ops::OnehotCrossEntropyGradientOpCUDAKernel<float>);
...@@ -21,7 +21,7 @@ namespace operators { ...@@ -21,7 +21,7 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T> template <typename T>
T tolerable_value(T x) { inline T tolerable_value(const T x) {
static_assert(std::is_floating_point<T>::value, static_assert(std::is_floating_point<T>::value,
"tolerable_value works only on float, " "tolerable_value works only on float, "
"double and double double."); "double and double double.");
...@@ -39,10 +39,13 @@ T tolerable_value(T x) { ...@@ -39,10 +39,13 @@ T tolerable_value(T x) {
return x; return x;
} }
template <typename Place, typename T> template <typename T>
class OnehotCrossEntropyOpKernel : public framework::OpKernel { class OnehotCrossEntropyOpKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto X = ctx.Input<Tensor>("X"); auto X = ctx.Input<Tensor>("X");
const T* Xdata = X->data<T>(); const T* Xdata = X->data<T>();
const int* label_data = ctx.Input<Tensor>("label")->data<int>(); const int* label_data = ctx.Input<Tensor>("label")->data<int>();
...@@ -62,10 +65,13 @@ class OnehotCrossEntropyOpKernel : public framework::OpKernel { ...@@ -62,10 +65,13 @@ class OnehotCrossEntropyOpKernel : public framework::OpKernel {
} }
}; };
template <typename Place, typename T> template <typename T>
class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel { class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto X = ctx.Input<Tensor>("X"); auto X = ctx.Input<Tensor>("X");
auto dX = ctx.Output<Tensor>(framework::GradVarName("X")); auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto dY = ctx.Input<Tensor>(framework::GradVarName("Y")); auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));
...@@ -79,6 +85,8 @@ class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel { ...@@ -79,6 +85,8 @@ class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
const int batch_size = X->dims()[0]; const int batch_size = X->dims()[0];
const int class_num = X->dims()[1]; const int class_num = X->dims()[1];
// TODO(qingqing): make zero setting an common function.
memset(dXdata, 0, sizeof(T) * batch_size * class_num);
for (int i = 0; i < batch_size; ++i) { for (int i = 0; i < batch_size; ++i) {
int index = i * class_num + label_data[i]; int index = i * class_num + label_data[i];
dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]); dXdata[index] = -tolerable_value(dYdata[i] / Xdata[index]);
......
...@@ -26,7 +26,7 @@ class FillZerosLikeKernel : public framework::OpKernel { ...@@ -26,7 +26,7 @@ class FillZerosLikeKernel : public framework::OpKernel {
auto* output = context.Output<framework::Tensor>("Dst"); auto* output = context.Output<framework::Tensor>("Dst");
output->mutable_data<T>(context.GetPlace()); output->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*output); auto t = framework::EigenVector<T>::Flatten(*output);
t.device(context.GetEigenDevice<Place>()) = t.constant(T(0)); t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
} }
}; };
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <cstring> #include <cstring>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
...@@ -25,13 +26,13 @@ namespace operators { ...@@ -25,13 +26,13 @@ namespace operators {
// Implementation of CPU copy // Implementation of CPU copy
template <typename T> template <typename T>
void CPUGather(const T* params, const int* indices, const int slice_size, void CPUGather(const T* src, const int* indices, const int slice_size,
const int index_size, T* output) { const int index_size, T* output) {
const size_t slice_bytes = slice_size * sizeof(T); const size_t slice_bytes = slice_size * sizeof(T);
for (int i = 0; i < index_size; ++i) { for (int i = 0; i < index_size; ++i) {
int index_ = indices[i]; int index_ = indices[i];
memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes); memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
} }
} }
...@@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src, ...@@ -55,7 +56,7 @@ void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
int index_size = index->dims()[0]; int index_size = index->dims()[0];
auto src_dims = src->dims(); auto src_dims = src->dims();
paddle::framework::DDim output_dims(src_dims); framework::DDim output_dims(src_dims);
output_dims[0] = index_size; output_dims[0] = index_size;
// slice size // slice size
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/gather_op.h"
#include "paddle/framework/ddim.h"
namespace paddle {
namespace operators {
class GatherOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
int batch_size = ctx.Input<Tensor>("Index")->dims()[0];
PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
framework::DDim output_dims(ctx.Input<Tensor>("X")->dims());
output_dims[0] = batch_size;
ctx.Output<Tensor>("Out")->Resize(output_dims);
}
};
class GatherGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto X = ctx.Input<Tensor>("X");
X_grad->Resize(X->dims());
}
};
class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
public:
GatherOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The source input of gather op");
AddInput("Index", "The index input of gather op");
AddOutput("Out", "The output of add op");
AddComment(R"DOC(
Gather Operator by selecting from the first axis,
Out = X[Index]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
ops::GatherGradOp);
REGISTER_OP_CPU_KERNEL(gather,
ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
gather_grad,
ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/gather_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(gather,
ops::GatherOpKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "gather.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "scatter.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class GatherOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *X = ctx.Input<Tensor>("X");
auto *Index = ctx.Input<Tensor>("Index");
auto *Y = ctx.Output<Tensor>("Out");
Y->mutable_data<T>(ctx.GetPlace());
Gather<T>(ctx.GetPlace(), X, Index, Y);
}
};
template <typename Place, typename T>
class GatherGradientOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *Index = ctx.Input<Tensor>("Index");
auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
dX->mutable_data<T>(ctx.GetPlace());
ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -19,25 +16,25 @@ namespace paddle { ...@@ -19,25 +16,25 @@ namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
class GaussianRandomKernel : public framework::OpKernel { class CPUGaussianRandomKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
float mean = context.op_.GetAttr<float>("mean"); float mean = context.op_.GetAttr<float>("mean");
float std = context.op_.GetAttr<float>("std"); float std = context.op_.GetAttr<float>("std");
auto* tensor = context.Output<framework::Tensor>(0); auto* tensor = context.Output<framework::Tensor>("Out");
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
// TODO(dzh): attribute does not support unsigned int. unsigned int seed =
// And we need a global random seed configuration. static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
int seed = context.op_.GetAttr<int>("seed"); std::minstd_rand engine;
if (seed == 0) { if (seed == 0) {
seed = std::random_device()(); seed = std::random_device()();
} }
std::mt19937 g(seed); engine.seed(seed);
std::normal_distribution<T> distribution(mean, std); std::normal_distribution<T> dist(mean, std);
ssize_t size = framework::product(tensor->dims()); ssize_t size = framework::product(tensor->dims());
for (int i = 0; i < size; ++i) { for (ssize_t i = 0; i < size; ++i) {
data[i] = distribution(g); data[i] = dist(engine);
} }
} }
}; };
...@@ -48,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -48,7 +45,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
protected: protected:
void InferShape(const framework::InferShapeContext& context) const override { void InferShape(const framework::InferShapeContext& context) const override {
auto* tensor = context.Output<framework::Tensor>(0); auto* tensor = context.Output<framework::Tensor>("Out");
auto dims = GetAttr<std::vector<int>>("dims"); auto dims = GetAttr<std::vector<int>>("dims");
PADDLE_ENFORCE(dims.size() > 0UL, PADDLE_ENFORCE(dims.size() > 0UL,
"dims can be one int or array. dims must be set."); "dims can be one int or array. dims must be set.");
...@@ -68,8 +65,8 @@ Use to initialize tensor with gaussian random generator. ...@@ -68,8 +65,8 @@ Use to initialize tensor with gaussian random generator.
)DOC"); )DOC");
AddAttr<std::vector<int>>("dims", "The dimension of random tensor."); AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
AddAttr<float>("mean", "mean value of random.").SetDefault(.0f); AddAttr<float>("mean", "mean of random tensor.").SetDefault(.0f);
AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f); AddAttr<float>("std", "std of random tensor.").SetDefault(1.0f);
AddAttr<int>("seed", AddAttr<int>("seed",
"Random seed of generator." "Random seed of generator."
"0 means use system wide seed") "0 means use system wide seed")
...@@ -83,4 +80,4 @@ Use to initialize tensor with gaussian random generator. ...@@ -83,4 +80,4 @@ Use to initialize tensor with gaussian random generator.
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
ops::GaussianRandomOpMaker); ops::GaussianRandomOpMaker);
REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>); REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <memory> #include <thrust/device_ptr.h>
#include <random> #include <thrust/iterator/counting_iterator.h>
#include "paddle/platform/dynload/curand.h" #include <thrust/random.h>
#include "paddle/platform/gpu_info.h" #include <thrust/transform.h>
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
class GaussianRandomKernel : public framework::OpKernel { struct GaussianGenerator {
T mean_, std_;
unsigned int seed_;
__host__ __device__ GaussianGenerator(T mean, T std, int seed)
: mean_(mean), std_(std), seed_(seed) {}
__host__ __device__ T operator()(const unsigned int n) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::normal_distribution<T> dist(mean_, std_);
rng.discard(n);
return dist(rng);
}
};
template <typename T>
class GPUGaussianRandomKernel : public framework::OpKernel {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
float mean = context.op_.GetAttr<float>("mean"); auto* tensor = context.Output<framework::Tensor>("Out");
float std = context.op_.GetAttr<float>("std");
auto* tensor = context.Output<framework::Tensor>(0);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
unsigned int seed =
int seed = context.op_.GetAttr<int>("seed"); static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
if (seed == 0) { if (seed == 0) {
std::random_device rd; std::random_device rd;
seed = rd(); seed = rd();
} }
curandGenerator_t g; T mean = static_cast<T>(context.op_.GetAttr<float>("mean"));
PADDLE_ENFORCE(platform::dynload::curandCreateGenerator( T std = static_cast<T>(context.op_.GetAttr<float>("std"));
&g, CURAND_RNG_PSEUDO_DEFAULT)); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
PADDLE_ENFORCE( ssize_t N = framework::product(tensor->dims());
platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed)); thrust::transform(index_sequence_begin, index_sequence_begin + N,
platform::dynload::curandGenerateNormal( thrust::device_ptr<T>(data),
g, data, framework::product(tensor->dims()), mean, std); GaussianGenerator<T>(mean, std, seed));
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(gaussian_random,
REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>); paddle::operators::GPUGaussianRandomKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lookup_table_op.h"
namespace paddle {
namespace operators {
class LookupTableOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &context) const override {
auto table_t = context.Input<Tensor>("W");
auto ids_t = context.Input<Tensor>("Ids");
auto output_t = context.Output<Tensor>("Out");
output_t->Resize({ids_t->dims()[0], table_t->dims()[1]});
}
};
class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LookupTableOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("W",
"An input represents embedding tensors,"
" which is a learnable parameter.");
AddInput("Ids",
"An input with type int32 or int64"
"contains the ids to be looked up in W.");
AddOutput("Out", "The lookup results, which have the same type with W.");
AddComment(
"This operator is used to perform lookups on the parameter W,"
"then concatenated into a dense tensor.");
}
};
class LookupTableOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &context) const override {
auto table = context.Input<Tensor>("W");
auto d_table = context.Output<Tensor>(framework::GradVarName("W"));
d_table->Resize(table->dims());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
lookup_table_grad, ops::LookupTableOpGrad);
REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>);
REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/assert.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void LookupTable(T* output, const T* table, const int32_t* ids,
const int N, const int K, const int D) {
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * GridDimX;
while (idy < K) {
int id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
T* out = output + idy * D;
const T* tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
out[i] = tab[i];
}
idy += BlockDimY * GridDimX;
}
}
template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
const int N, const int K, const int D) {
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * GridDimX;
while (idy < K) {
int id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
const T* out = output + idy * D;
T* tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
}
idy += BlockDimY * GridDimX;
}
}
template <typename T>
class LookupTableCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto table_t = context.Input<Tensor>("W");
auto ids_t = context.Input<Tensor>("Ids");
auto output_t = context.Output<Tensor>("Out");
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
size_t K = product(ids_t->dims());
auto ids = ids_t->data<int32_t>();
auto table = table_t->data<T>();
auto output = output_t->mutable_data<T>(context.GetPlace());
dim3 threads(128, 8);
dim3 grids(8, 1);
LookupTable<T, 128, 8, 8><<<grids, threads>>>(output, table, ids, N, K, D);
}
};
template <typename T>
class LookupTableGradCUDAKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto ids_t = context.Input<Tensor>("Ids");
auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
int K = product(ids_t->dims());
const int32_t* ids = ids_t->data<int32_t>();
const T* d_output = d_output_t->data<T>();
T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(context.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
dim3 threads(128, 8);
dim3 grids(8, 1);
LookupTableGrad<T, 128, 8, 8><<<grids, threads>>>(d_table, d_output, ids, N,
K, D);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(lookup_table_grad,
ops::LookupTableGradCUDAKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class LookupTableKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto table_t = context.Input<Tensor>("W"); // float tensor
auto ids_t = context.Input<Tensor>("Ids"); // int tensor
auto output_t = context.Output<Tensor>("Out"); // float tensor
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
auto ids = ids_t->data<int32_t>();
auto table = table_t->data<T>();
auto output = output_t->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < product(ids_t->dims()); ++i) {
PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
}
}
};
template <typename T>
class LookupTableGradKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto ids_t = context.Input<Tensor>("Ids");
auto d_output_t = context.Input<Tensor>(framework::GradVarName("Out"));
auto d_table_t = context.Output<Tensor>(framework::GradVarName("W"));
size_t N = d_table_t->dims()[0];
size_t D = d_table_t->dims()[1];
auto ids = ids_t->data<int32_t>();
const T* d_output = d_output_t->data<T>();
T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(context.GetEigenDevice<platform::CPUPlace>()) =
t.constant(static_cast<T>(0));
for (size_t i = 0; i < product(ids_t->dims()); ++i) {
PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
for (size_t j = 0; j < D; ++j) {
d_table[ids[i] * D + j] += d_output[i * D + j];
}
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/minus_op.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class MinusOp : public framework::OperatorWithKernel {
public:
MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto *left_tensor = ctx.Input<framework::Tensor>("X");
auto *right_tensor = ctx.Input<framework::Tensor>("Y");
PADDLE_ENFORCE_EQ(
framework::product(left_tensor->dims()),
framework::product(right_tensor->dims()),
"Minus operator must take two tensor with same num of elements");
ctx.Output<framework::Tensor>("Out")->Resize(left_tensor->dims());
}
};
class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The left tensor of minus operator.").NotInGradient();
AddInput("Y", "The right tensor of minus operator.").NotInGradient();
AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
AddComment(R"DOC(Minus Operator
Equation: Out = X - Y
)DOC");
}
};
template <typename AttrType>
class MinusGradOp : public NetOp {
public:
MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
auto out_grad = Input(framework::GradVarName("Out"));
auto x_grad = Output(framework::GradVarName("X"));
auto y_grad = Output(framework::GradVarName("Y"));
// x_grad = out_grad
AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
{{"Out", {x_grad}}}, {}));
framework::AttributeMap scale_attr;
scale_attr["scale"] = static_cast<AttrType>(-1);
AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
{{"Out", {y_grad}}}, scale_attr));
CompleteAddOp(false);
}
};
} // namespace operators
} // namespace paddle
USE_OP(scale);
USE_OP_ITSELF(identity);
namespace ops = paddle::operators;
REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
ops::MinusGradOp<float>);
REGISTER_OP_CPU_KERNEL(minus,
ops::MinusKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/minus_op.h"
REGISTER_OP_GPU_KERNEL(
minus, paddle::operators::MinusKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class MinusKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* left_tensor = context.Input<framework::Tensor>("X");
auto* right_tensor = context.Input<framework::Tensor>("Y");
auto* out_tensor = context.Output<framework::Tensor>("Out");
out_tensor->mutable_data<T>(context.GetPlace());
auto& dev = context.GetEigenDevice<Place>();
framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
framework::EigenVector<T>::Flatten(*left_tensor) -
framework::EigenVector<T>::Flatten(*right_tensor);
}
};
} // namespace operators
} // namespace paddle
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
limitations under the License. */ limitations under the License. */
#include "paddle/operators/mul_op.h" #include "paddle/operators/mul_op.h"
#include "paddle/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -31,10 +31,13 @@ void NetOp::CompleteAddOp(bool calc) { ...@@ -31,10 +31,13 @@ void NetOp::CompleteAddOp(bool calc) {
for (auto& op : ops_) { for (auto& op : ops_) {
for (auto& ipt : op->Inputs()) { for (auto& ipt : op->Inputs()) {
for (auto& var_name : ipt.second) { for (auto& var_name : ipt.second) {
if (!Contains(output_set, var_name)) { // Not other op's output // If input variable has been in output set, then it will be
input_set.insert(var_name); // added into intermediate_outputs_. Otherwise, it will be
} else { // added into input set.
if (Contains(output_set, var_name)) {
intermediate_outputs_.insert(var_name); intermediate_outputs_.insert(var_name);
} else {
input_set.insert(var_name);
} }
} }
} }
...@@ -68,10 +71,15 @@ std::string NetOp::DebugString() const { ...@@ -68,10 +71,15 @@ std::string NetOp::DebugString() const {
bool NetOp::IsNetOp() const { return true; } bool NetOp::IsNetOp() const { return true; }
std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const { std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
std::vector<std::string> all;
for (auto& pair : this->outputs_) {
for (auto& var_name : pair.second) {
all.push_back(var_name);
}
}
if (has_intermediate) { if (has_intermediate) {
return this->outputs_.at(kAll); return all;
} }
auto& all = this->outputs_.at(kAll);
std::vector<std::string> ret_val; std::vector<std::string> ret_val;
for (auto& each : all) { for (auto& each : all) {
if (!Contains(intermediate_outputs_, each)) { if (!Contains(intermediate_outputs_, each)) {
...@@ -81,9 +89,8 @@ std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const { ...@@ -81,9 +89,8 @@ std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
return ret_val; return ret_val;
} }
NetOp::NetOp(const std::string& type, NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::OperatorBase::VarNameMap& inputs, const framework::VariableNameMap& outputs,
const framework::OperatorBase::VarNameMap& outputs,
const framework::AttributeMap& attrs) const framework::AttributeMap& attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {} : framework::OperatorBase(type, inputs, outputs, attrs) {}
......
...@@ -38,8 +38,10 @@ class NetOp : public framework::OperatorBase { ...@@ -38,8 +38,10 @@ class NetOp : public framework::OperatorBase {
public: public:
static const char kAll[]; static const char kAll[];
NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {}
NetOp(const std::string& type, const VarNameMap& inputs,
const VarNameMap& outputs, const framework::AttributeMap& attrs); NetOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs);
NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) { NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
this->ops_.reserve(o.ops_.size()); this->ops_.reserve(o.ops_.size());
...@@ -84,13 +86,14 @@ class NetOp : public framework::OperatorBase { ...@@ -84,13 +86,14 @@ class NetOp : public framework::OperatorBase {
return true; return true;
} }
void AddOp(const framework::OperatorBase& op) { AddOp(op.Clone()); } void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
/** /**
* @brief Add an operator by ptr * @brief Add an operator by ptr
*/ */
void AddOp(std::unique_ptr<framework::OperatorBase> op) { void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); PADDLE_ENFORCE(!add_op_done_,
"Cannot AppendOp when this network is sealed");
PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op"); PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
ops_.push_back(std::move(op)); ops_.push_back(std::move(op));
} }
......
...@@ -38,10 +38,10 @@ TEST(OpKernel, all) { ...@@ -38,10 +38,10 @@ TEST(OpKernel, all) {
auto net = std::make_shared<NetOp>(); auto net = std::make_shared<NetOp>();
ASSERT_NE(net, nullptr); ASSERT_NE(net, nullptr);
net->AddOp(std::unique_ptr<TestOp>( net->AppendOp(std::unique_ptr<TestOp>(
new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"Out", {"y"}}}, {}))); {{"Out", {"y"}}}, {})));
net->AddOp(std::unique_ptr<TestOp>( net->AppendOp(std::unique_ptr<TestOp>(
new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
{{"Out", {"z"}}}, {}))); {{"Out", {"z"}}}, {})));
...@@ -61,7 +61,7 @@ TEST(NetOp, insert_op) { ...@@ -61,7 +61,7 @@ TEST(NetOp, insert_op) {
auto op1 = std::unique_ptr<framework::NOP>( auto op1 = std::unique_ptr<framework::NOP>(
new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
{{"Out", {"y"}}}, {})); {{"Out", {"y"}}}, {}));
net.AddOp(*op1); net.AppendOp(*op1);
net.InsertOp(0, *op1); net.InsertOp(0, *op1);
ASSERT_EQ(2UL, net.ops_.size()); ASSERT_EQ(2UL, net.ops_.size());
net.InsertOp(2, std::move(op1)); net.InsertOp(2, std::move(op1));
...@@ -70,16 +70,16 @@ TEST(NetOp, insert_op) { ...@@ -70,16 +70,16 @@ TEST(NetOp, insert_op) {
TEST(NetOp, Clone) { TEST(NetOp, Clone) {
NetOp net; NetOp net;
net.AddOp( net.AppendOp(
std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}})); std::unique_ptr<framework::NOP>(new framework::NOP{"empty", {}, {}, {}}));
net.AddOp(std::unique_ptr<framework::NOP>( net.AppendOp(std::unique_ptr<framework::NOP>(
new framework::NOP{"empty2", {}, {}, {}})); new framework::NOP{"empty2", {}, {}, {}}));
net.CompleteAddOp(true); net.CompleteAddOp(true);
auto new_net_op = net.Clone(); auto new_net_op = net.Clone();
ASSERT_NE(new_net_op, nullptr); ASSERT_NE(new_net_op, nullptr);
ASSERT_TRUE(new_net_op->IsNetOp()); ASSERT_TRUE(new_net_op->IsNetOp());
auto* new_net = static_cast<NetOp*>(new_net_op.get()); auto* new_net = static_cast<NetOp*>(new_net_op.get());
ASSERT_EQ(2, new_net->ops_.size()); ASSERT_EQ(2UL, new_net->ops_.size());
ASSERT_EQ(new_net->ops_[0]->Type(), "empty"); ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
ASSERT_EQ(new_net->ops_[1]->Type(), "empty2"); ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
} }
......
...@@ -131,8 +131,8 @@ const rnn::ArgumentName RecurrentGradientOp::kArgName{ ...@@ -131,8 +131,8 @@ const rnn::ArgumentName RecurrentGradientOp::kArgName{
"memories", "pre_memories", "boot_memories@grad"}; "memories", "pre_memories", "boot_memories@grad"};
RecurrentOp::RecurrentOp(const std::string& type, RecurrentOp::RecurrentOp(const std::string& type,
const framework::OperatorBase::VarNameMap& inputs, const framework::VariableNameMap& inputs,
const framework::OperatorBase::VarNameMap& outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs) const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) { : OperatorBase(type, inputs, outputs, attrs) {
rnn::InitArgument(kArgName, &arg_, *this); rnn::InitArgument(kArgName, &arg_, *this);
...@@ -223,8 +223,8 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const { ...@@ -223,8 +223,8 @@ void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
} }
RecurrentGradientOp::RecurrentGradientOp( RecurrentGradientOp::RecurrentGradientOp(
const std::string& type, const framework::OperatorBase::VarNameMap& inputs, const std::string& type, const framework::VariableNameMap& inputs,
const framework::OperatorBase::VarNameMap& outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs) const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) { : OperatorBase(type, inputs, outputs, attrs) {
rnn::InitArgument(kArgName, &arg_, *this); rnn::InitArgument(kArgName, &arg_, *this);
......
...@@ -114,8 +114,9 @@ class RecurrentGradientAlgorithm { ...@@ -114,8 +114,9 @@ class RecurrentGradientAlgorithm {
class RecurrentOp : public framework::OperatorBase { class RecurrentOp : public framework::OperatorBase {
public: public:
RecurrentOp(const std::string& type, const VarNameMap& inputs, RecurrentOp(const std::string& type, const framework::VariableNameMap& inputs,
const VarNameMap& outputs, const framework::AttributeMap& attrs); const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs);
RecurrentOp(const RecurrentOp& o) RecurrentOp(const RecurrentOp& o)
: framework::OperatorBase( : framework::OperatorBase(
...@@ -150,8 +151,9 @@ class RecurrentOp : public framework::OperatorBase { ...@@ -150,8 +151,9 @@ class RecurrentOp : public framework::OperatorBase {
class RecurrentGradientOp : public framework::OperatorBase { class RecurrentGradientOp : public framework::OperatorBase {
public: public:
RecurrentGradientOp(const std::string& type, const VarNameMap& inputs, RecurrentGradientOp(const std::string& type,
const VarNameMap& outputs, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs); const framework::AttributeMap& attrs);
RecurrentGradientOp(const RecurrentGradientOp& o) RecurrentGradientOp(const RecurrentGradientOp& o)
......
...@@ -18,3 +18,6 @@ ...@@ -18,3 +18,6 @@
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>); rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
rowwise_add_grad,
ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
...@@ -63,7 +63,7 @@ class RowwiseAddGradKernel : public framework::OpKernel { ...@@ -63,7 +63,7 @@ class RowwiseAddGradKernel : public framework::OpKernel {
// https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
// colwise add // colwise add
Eigen::array<int, 1> dims{{1}}; /* dimension to reduce */ Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims); EigenVector<T>::Flatten(*db).device(place) = OutGrad.sum(dims);
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/scale_op.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class ScaleOp : public framework::OperatorWithKernel {
public:
ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto *in = ctx.Input<framework::Tensor>("X");
auto *out = ctx.Output<framework::Tensor>("Out");
out->Resize(in->dims());
}
};
template <typename AttrType>
class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of scale operator.").NotInGradient();
AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
AddComment(R"DOC(Scale operator
The equation is: Out = scale*X
)DOC");
AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
}
};
// Identity Op's gradient is identity op, too.
// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
template <typename AttrType>
class ScaleGradOp : public NetOp {
public:
ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input(framework::GradVarName("Out"))}}},
{{"Out", {Output(framework::GradVarName("X"))}}},
{{"scale", GetAttr<AttrType>("scale")}}));
CompleteAddOp(false);
}
};
// identity is a alias of scale op. This is also a example for creating a alias
// operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IdentityOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "input tensor of identity op");
AddOutput("Out", "output tensor of identity op");
AddComment("identity operator. Just a alias of scale op which scale = 1.0");
}
};
template <typename AttrType>
class IdentityOp : public NetOp {
public:
IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Out")}}},
{{"scale", static_cast<AttrType>(1)}}));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
ops::ScaleGradOp<float>);
REGISTER_OP_CPU_KERNEL(scale,
ops::ScaleKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
ops::IdentityOpMaker<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/scale_op.h"
REGISTER_OP_GPU_KERNEL(
scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T, typename AttrType = T>
class ScaleKernel : public framework::OpKernel {
public:
virtual void Compute(const framework::ExecutionContext& context) const {
auto* tensor = context.Output<framework::Tensor>("Out");
auto* in = context.Input<framework::Tensor>("X");
tensor->mutable_data<T>(in->place());
auto scale = static_cast<T>(context.op_.GetAttr<AttrType>("scale"));
auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev = context.GetEigenDevice<Place>();
eigen_out.device(dev) = scale * eigen_in;
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/scatter_op.h"
#include "paddle/framework/ddim.h"
namespace paddle {
namespace operators {
class ScatterOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Index")->dims().size(), 1,
"Update Index should be 1-D.");
PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Ref")->dims().size(),
ctx.Input<Tensor>("Updates")->dims().size(),
"Reference and Updates should have the same shape size");
PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("Updates")->dims()[0],
ctx.Input<Tensor>("Index")->dims()[0],
"Updates and Index should have same batch-size.");
framework::DDim data_dim(ctx.Input<Tensor>("Updates")->dims());
for (int i = 1; i < data_dim.size(); ++i)
PADDLE_ENFORCE_EQ(data_dim[i], ctx.Input<Tensor>("Updates")->dims()[i]);
ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("Ref")->dims());
}
};
class ScatterGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(const framework::InferShapeContext &ctx) const override {
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Updates = ctx.Input<Tensor>("Updates");
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *Ref = ctx.Input<Tensor>("Ref");
dRef->Resize(Ref->dims());
dUpdates->Resize(Updates->dims());
}
};
class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ScatterOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ref", "The source input of scatter op");
AddInput("Index",
"The index input of scatter op where Ref will be updated");
AddInput("Updates", "The updated value of updates op");
AddOutput("Out", "The output of add op");
AddComment(R"DOC(
Scatter Operator by selecting from the first axis,
Out = Ref
Out[Index] = Ref[Index] + Updates
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
ops::ScatterGradOp);
REGISTER_OP_CPU_KERNEL(scatter,
ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
scatter_grad,
ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/scatter_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(scatter,
ops::ScatterOpKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "gather.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "scatter.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename Place, typename T>
class ScatterOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *Ref = ctx.Input<Tensor>("Ref");
auto *Index = ctx.Input<Tensor>("Index");
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
// In place output: Out = Ref, Out[Index] += Updates
Out->ShareDataWith<T>(*Ref);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
}
};
template <typename Place, typename T>
class ScatterGradientOpKernel : public framework::OpKernel {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
auto *Index = ctx.Input<Tensor>("Index");
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index]
Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -39,7 +36,8 @@ class CPUUniformRandomKernel : public framework::OpKernel { ...@@ -39,7 +36,8 @@ class CPUUniformRandomKernel : public framework::OpKernel {
std::uniform_real_distribution<T> dist( std::uniform_real_distribution<T> dist(
static_cast<T>(context.op_.GetAttr<float>("min")), static_cast<T>(context.op_.GetAttr<float>("min")),
static_cast<T>(context.op_.GetAttr<float>("max"))); static_cast<T>(context.op_.GetAttr<float>("max")));
for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) { ssize_t size = framework::product(tensor->dims());
for (ssize_t i = 0; i < size; ++i) {
data[i] = dist(engine); data[i] = dist(engine);
} }
} }
...@@ -66,7 +64,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,7 +64,6 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("Out", "The output tensor of uniform random op"); AddOutput("Out", "The output tensor of uniform random op");
AddComment(R"DOC(Uniform random operator. AddComment(R"DOC(Uniform random operator.
Used to initialize tensor with uniform random generator. Used to initialize tensor with uniform random generator.
)DOC"); )DOC");
AddAttr<std::vector<int>>("dims", "the dimension of random tensor"); AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
...@@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, ...@@ -276,17 +276,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
void Argument::concat(const std::vector<Argument>& args, void Argument::concat(const std::vector<Argument>& args,
const std::vector<int>& selectRows, const std::vector<int>& selectRows,
const std::vector<int>& seqStartPos, const std::vector<int>& seqStartPos,
const std::vector<int>& copySize,
bool useGpu, bool useGpu,
hl_stream_t stream, hl_stream_t stream,
PassType passType) { PassType passType) {
CHECK(!subSequenceStartPositions) CHECK(!subSequenceStartPositions)
<< "undefined behavior for subsequence positions"; << "undefined behavior for subsequence positions";
size_t batchSize = selectRows.size(); size_t batchSize = 0;
for (size_t i = 0; i < copySize.size(); ++i)
batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
auto copyArg = [batchSize, stream](MatrixPtr& dst, auto copyArg = [batchSize, stream](MatrixPtr& dst,
MatrixPtr src, MatrixPtr src,
int startRow, int desStartRow,
int pos, int srcStartRow,
int size, int size,
bool useGpu) { bool useGpu) {
if (!src) { if (!src) {
...@@ -300,14 +304,14 @@ void Argument::concat(const std::vector<Argument>& args, ...@@ -300,14 +304,14 @@ void Argument::concat(const std::vector<Argument>& args,
dst->resize(batchSize, width); dst->resize(batchSize, width);
} }
MatrixPtr tmpMatrix = dst->subMatrix(startRow, size); MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream); tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
}; };
auto copyIds = [batchSize, stream](IVectorPtr& dst, auto copyIds = [batchSize, stream](IVectorPtr& dst,
const IVectorPtr& src, const IVectorPtr& src,
int startRow, int desStartRow,
int pos, int srcStartRow,
int size, int size,
bool useGpu) { bool useGpu) {
if (!src) { if (!src) {
...@@ -315,13 +319,14 @@ void Argument::concat(const std::vector<Argument>& args, ...@@ -315,13 +319,14 @@ void Argument::concat(const std::vector<Argument>& args,
return; return;
} }
IVector::resizeOrCreate(dst, batchSize, useGpu); IVector::resizeOrCreate(dst, batchSize, useGpu);
dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream); dst->subVec(desStartRow, size)
->copyFrom(*src->subVec(srcStartRow, size), stream);
}; };
auto copyStrs = [batchSize, stream](SVectorPtr& dst, auto copyStrs = [batchSize, stream](SVectorPtr& dst,
const SVectorPtr& src, const SVectorPtr& src,
int startRow, int desStartRow,
int pos, int srcStartRow,
int size, int size,
bool useGpu) { bool useGpu) {
if (!src) { if (!src) {
...@@ -333,30 +338,31 @@ void Argument::concat(const std::vector<Argument>& args, ...@@ -333,30 +338,31 @@ void Argument::concat(const std::vector<Argument>& args,
} else { } else {
dst->resize(batchSize); dst->resize(batchSize);
} }
std::copy( std::copy(src->begin() + srcStartRow,
src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow); src->begin() + srcStartRow + size,
dst->begin() + desStartRow);
}; };
dataId = args[0].dataId; dataId = args[0].dataId;
CHECK_NE(seqStartPos.size(), 0UL); CHECK_NE(seqStartPos.size(), 0UL);
size_t sampleNum = seqStartPos.size() - 1; int desStartRow = 0;
for (size_t i = 0; i < sampleNum; ++i) { for (size_t i = 0; i < copySize.size(); ++i) {
int startPos = seqStartPos[i]; int startPos = seqStartPos[i];
int endPos = seqStartPos[i + 1]; int endPos = seqStartPos[i + 1];
CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos)); CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
for (int j = startPos; j < endPos; ++j) { for (int j = startPos; j < endPos; ++j) {
const Argument& arg = args[j - startPos]; const Argument& arg = args[j - startPos];
CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have" CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
<< " same dataId"; << "the same dataId.";
const int copySize = 1; const int srcStartRow = selectRows[j];
const int rowIdx = selectRows[j]; copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
copyArg(in, arg.in, j, rowIdx, copySize, useGpu); copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
copyArg(value, arg.value, j, rowIdx, copySize, useGpu);
if (passType != PASS_TEST) { if (passType != PASS_TEST) {
copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu); copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
} }
copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu); copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu); copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
desStartRow += copySize[i];
} }
} }
ICpuGpuVector::resizeOrCreate( ICpuGpuVector::resizeOrCreate(
...@@ -670,19 +676,28 @@ void Argument::reorganizeSeqInfo( ...@@ -670,19 +676,28 @@ void Argument::reorganizeSeqInfo(
const ICpuGpuVectorPtr seqStartPos, const ICpuGpuVectorPtr seqStartPos,
const ICpuGpuVectorPtr subSeqStartPos, const ICpuGpuVectorPtr subSeqStartPos,
std::vector<std::vector<int>>& reorganizedSeqInfo) { std::vector<std::vector<int>>& reorganizedSeqInfo) {
int* seqStarts = seqStartPos->getMutableData(false); CHECK(seqStartPos);
int* subSeqStarts = subSeqStartPos->getMutableData(false);
int seqNum = seqStartPos->getSize() - 1; int seqNum = seqStartPos->getSize() - 1;
reorganizedSeqInfo.resize(seqNum, std::vector<int>()); int* seqStarts = seqStartPos->getMutableData(false);
int seqIdx = 0;
for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) { if (subSeqStartPos) {
reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); int* subSeqStarts = subSeqStartPos->getMutableData(false);
if (subSeqStarts[i] == seqStarts[seqIdx + 1]) { reorganizedSeqInfo.resize(seqNum, std::vector<int>());
seqIdx++; int seqIdx = 0;
if (seqIdx == seqNum) return; for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]); reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
seqIdx++;
if (seqIdx == seqNum) return;
reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
}
} }
} else {
reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
memcpy(reorganizedSeqInfo[0].data(),
seqStarts,
sizeof(int) * seqStartPos->getSize());
} }
} }
......
...@@ -240,6 +240,7 @@ struct Argument { ...@@ -240,6 +240,7 @@ struct Argument {
void concat(const std::vector<Argument>& args, void concat(const std::vector<Argument>& args,
const std::vector<int>& selectRows, const std::vector<int>& selectRows,
const std::vector<int>& seqStartPos, const std::vector<int>& seqStartPos,
const std::vector<int>& copySize,
bool useGpu, bool useGpu,
hl_stream_t stream, hl_stream_t stream,
PassType passType); PassType passType);
......
...@@ -65,7 +65,10 @@ public: ...@@ -65,7 +65,10 @@ public:
size_t getSize() const { return config_.size(); } size_t getSize() const { return config_.size(); }
bool isFullSize() const { bool isFullSize() const {
return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); if (bufs_[PARAMETER_VALUE]) {
return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
}
return false;
} }
inline bool useGpu() const { return useGpu_; } inline bool useGpu() const { return useGpu_; }
...@@ -278,7 +281,11 @@ public: ...@@ -278,7 +281,11 @@ public:
/** /**
* @brief Set the format in header. * @brief Set the format in header.
*/ */
void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; } void setHeaderFormat(int32_t fmt) {
CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
<< fmt;
headerFormat_ = fmt;
}
/** /**
* @brief Parameter Update Hook. * @brief Parameter Update Hook.
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cuda.h>
namespace paddle {
namespace platform {
#define CUDA_ATOMIC_WRAPPER(op, T) \
__device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
#define USE_CUDA_ATOMIC(op, T) \
CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
// For atomicAdd.
USE_CUDA_ATOMIC(Add, float);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
unsigned long long int* address_as_ull =
reinterpret_cast<unsigned long long int*>(address);
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN
} while (assumed != old);
return __longlong_as_double(old);
}
#endif
} // namespace platform
} // namespace paddle
...@@ -114,9 +114,6 @@ CUDADeviceContext::~CUDADeviceContext() { ...@@ -114,9 +114,6 @@ CUDADeviceContext::~CUDADeviceContext() {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
} }
if (curand_generator_) {
PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
}
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
...@@ -152,19 +149,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() { ...@@ -152,19 +149,6 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() {
cudaStream_t CUDADeviceContext::stream() { return stream_; } cudaStream_t CUDADeviceContext::stream() { return stream_; }
curandGenerator_t CUDADeviceContext::curand_generator() {
if (!curand_generator_) {
SetDeviceId(place_.device);
PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
CURAND_RNG_PSEUDO_DEFAULT));
PADDLE_ENFORCE(
dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_));
}
return curand_generator_;
}
#endif // PADDLE_ONLY_CPU #endif // PADDLE_ONLY_CPU
} // namespace platform } // namespace platform
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#ifndef PADDLE_ONLY_CPU #ifndef PADDLE_ONLY_CPU
#include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cublas.h"
#include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/cudnn.h"
#include "paddle/platform/dynload/curand.h"
#include "paddle/platform/gpu_info.h" #include "paddle/platform/gpu_info.h"
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#endif #endif
...@@ -40,7 +39,7 @@ class DeviceContext { ...@@ -40,7 +39,7 @@ class DeviceContext {
class CPUDeviceContext : public DeviceContext { class CPUDeviceContext : public DeviceContext {
public: public:
CPUDeviceContext(); CPUDeviceContext();
explicit CPUDeviceContext(CPUPlace); explicit CPUDeviceContext(CPUPlace place);
virtual ~CPUDeviceContext() {} virtual ~CPUDeviceContext() {}
Eigen::DefaultDevice* eigen_device() const; Eigen::DefaultDevice* eigen_device() const;
...@@ -56,7 +55,7 @@ class EigenCudaStreamDevice; ...@@ -56,7 +55,7 @@ class EigenCudaStreamDevice;
class CUDADeviceContext : public DeviceContext { class CUDADeviceContext : public DeviceContext {
public: public:
explicit CUDADeviceContext(GPUPlace); explicit CUDADeviceContext(GPUPlace place);
virtual ~CUDADeviceContext(); virtual ~CUDADeviceContext();
/*! \brief Wait for all operations completion in the stream. */ /*! \brief Wait for all operations completion in the stream. */
...@@ -75,9 +74,6 @@ class CUDADeviceContext : public DeviceContext { ...@@ -75,9 +74,6 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return cudnn handle in the device context. */ /*! \brief Return cudnn handle in the device context. */
cudnnHandle_t cudnn_handle(); cudnnHandle_t cudnn_handle();
/*! \brief Return curand handle in the device context. */
curandGenerator_t curand_generator();
/*! \brief Return cuda stream in the device context. */ /*! \brief Return cuda stream in the device context. */
cudaStream_t stream(); cudaStream_t stream();
// clang-format on // clang-format on
...@@ -85,18 +81,13 @@ class CUDADeviceContext : public DeviceContext { ...@@ -85,18 +81,13 @@ class CUDADeviceContext : public DeviceContext {
private: private:
GPUPlace place_; GPUPlace place_;
private:
std::unique_ptr<Eigen::GpuDevice> eigen_device_; std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_; std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
private:
uint64_t seed_;
// clang-format off // clang-format off
cudaStream_t stream_{nullptr}; cudaStream_t stream_{nullptr};
cudnnHandle_t cudnn_handle_{nullptr}; cudnnHandle_t cudnn_handle_{nullptr};
cublasHandle_t cublas_handle_{nullptr}; cublasHandle_t cublas_handle_{nullptr};
curandGenerator_t curand_generator_{nullptr};
// clang-format on // clang-format on
}; };
......
...@@ -43,8 +43,6 @@ TEST(Device, CUDADeviceContext) { ...@@ -43,8 +43,6 @@ TEST(Device, CUDADeviceContext) {
ASSERT_NE(nullptr, cudnn_handle); ASSERT_NE(nullptr, cudnn_handle);
cublasHandle_t cublas_handle = device_context->cublas_handle(); cublasHandle_t cublas_handle = device_context->cublas_handle();
ASSERT_NE(nullptr, cublas_handle); ASSERT_NE(nullptr, cublas_handle);
curandGenerator_t curand_handle = device_context->curand_generator();
ASSERT_NE(nullptr, curand_handle);
ASSERT_NE(nullptr, device_context->stream()); ASSERT_NE(nullptr, device_context->stream());
delete device_context; delete device_context;
} }
......
...@@ -65,7 +65,6 @@ void ParameterClient2::initThreads() { ...@@ -65,7 +65,6 @@ void ParameterClient2::initThreads() {
LOG(INFO) << "parallel_thread_num dosent need to set"; LOG(INFO) << "parallel_thread_num dosent need to set";
} }
syncThreadPool_.reset(new SyncThreadPool(threadNum_)); syncThreadPool_.reset(new SyncThreadPool(threadNum_));
startThreads(); startThreads();
} }
...@@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData( ...@@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData(
request.set_cost(cost); request.set_cost(cost);
request.set_batch_status(batchStatus); request.set_batch_status(batchStatus);
CHECK_EQ(request.blocks_size(), 0); CHECK_EQ(request.blocks_size(), 0);
VLOG(10) << "request: trainer_id: " << request.trainer_id()
<< " update_mode" << request.update_mode()
<< " send_back_parameter: " << request.send_back_parameter()
<< " send_back_parameter_type: "
<< request.send_back_parameter_type()
<< " num_samples: " << request.num_samples()
<< " cost: " << request.cost()
<< " batch_status: " << request.batch_status();
} }
for (const auto& segments : parameterSegments) { for (const auto& segments : parameterSegments) {
const auto it = parameterMap_.find(segments.id); const auto it = parameterMap_.find(segments.id);
...@@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData( ...@@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData(
CHECK(sendMat != nullptr) << "sendMat is nullptr"; CHECK(sendMat != nullptr) << "sendMat is nullptr";
syncThreadPool_->exec([&](int tid, size_t numThreads) { syncThreadPool_->exec([&](int tid, size_t numThreads) {
std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
const auto& localIndices = prefetchMat->getLocalIndices(); const auto& localIndices = prefetchMat->getLocalIndices();
/// num of sparse rows /// num of sparse rows
size_t nLocalBlocks = localIndices.size(); size_t nLocalBlocks = localIndices.size();
uint64_t beginDim = 0; uint64_t beginDim = 0;
uint64_t endDim = 0; uint64_t endDim = 0;
// FIXME(typhoonzero): let it resize first
prefetchMat->getLocalRow(nLocalBlocks + 1);
sendMat->getLocalRow(nLocalBlocks + 1);
for (size_t row = 0; row < nLocalBlocks; ++row) { for (size_t row = 0; row < nLocalBlocks; ++row) {
int64_t blockId = localIndices[row]; // local row -> sparse row int64_t blockId = localIndices[row]; // local row -> sparse row
int serverId = std::abs((blockId + nameHash) % serviceNum_); int serverId = std::abs((blockId + nameHash) % serviceNum_);
...@@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData( ...@@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData(
block->set_begin_pos(row * blockSize); block->set_begin_pos(row * blockSize);
/// block len /// block len
block->set_block_size(endDim - beginDim); block->set_block_size(endDim - beginDim);
if (sendingPara) { if (sendingPara) {
sendJob->parallelInputIovs[serverId].push_back( sendJob->parallelInputIovs[serverId].push_back(
{sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
......
...@@ -583,6 +583,7 @@ protected: ...@@ -583,6 +583,7 @@ protected:
#ifndef PADDLE_DISABLE_TIMER #ifndef PADDLE_DISABLE_TIMER
uint64_t forwardbackwordTime_; uint64_t forwardbackwordTime_;
#endif #endif
std::mutex sparseAutoGrowthMutex_;
/// map id to parameter used for decoding protobuf data /// map id to parameter used for decoding protobuf data
std::unordered_map<size_t, ParameterPtr> parameterMap_; std::unordered_map<size_t, ParameterPtr> parameterMap_;
......
if(WITH_PYTHON)
cc_library(paddle_pybind SHARED
SRCS pybind.cc
DEPS pybind python backward
sgd_op
gather_op
scatter_op
add_op
mul_op
rowwise_add_op
sigmoid_op
softmax_op
mean_op
cross_entropy_op
recurrent_op
uniform_random_op
gaussian_random_op
fill_zeros_like_op
lookup_table_op
scale_op
minus_op)
endif(WITH_PYTHON)
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#include "paddle/framework/backward.h" #include "paddle/framework/backward.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor_py.h"
#include "paddle/operators/net_op.h" #include "paddle/operators/net_op.h"
#include "paddle/operators/recurrent_op.h" #include "paddle/operators/recurrent_op.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/pybind/tensor_py.h"
#include "paddle/string/to_string.h" #include "paddle/string/to_string.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -31,7 +31,7 @@ limitations under the License. */ ...@@ -31,7 +31,7 @@ limitations under the License. */
namespace py = pybind11; namespace py = pybind11;
USE_OP(add_two); USE_OP(add_two);
USE_CPU_ONLY_OP(onehot_cross_entropy); USE_OP(onehot_cross_entropy);
USE_OP(sgd); USE_OP(sgd);
USE_OP(mul); USE_OP(mul);
USE_OP(mean); USE_OP(mean);
...@@ -42,6 +42,12 @@ USE_OP(fill_zeros_like); ...@@ -42,6 +42,12 @@ USE_OP(fill_zeros_like);
USE_OP_ITSELF(recurrent_op); USE_OP_ITSELF(recurrent_op);
USE_OP(gaussian_random); USE_OP(gaussian_random);
USE_OP(uniform_random); USE_OP(uniform_random);
USE_OP(lookup_table);
USE_OP(scale);
USE_OP_ITSELF(identity);
USE_OP(minus);
USE_CPU_ONLY_OP(gather);
USE_CPU_ONLY_OP(scatter);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -131,26 +137,24 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -131,26 +137,24 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::reference) py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
.def(py::init<>()) .def(py::init<>())
.def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, .def("new_scope",
[](Scope &self) -> Scope * { return &self.NewScope(); },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("drop_kids", &Scope::DropKids); .def("drop_kids", &Scope::DropKids);
//! @note: Be careful! PyBind will return std::string as an unicode, not //! @note: Be careful! PyBind will return std::string as an unicode, not
//! Python str. If you want a str object, you should cast them in Python. //! Python str. If you want a str object, you should cast them in Python.
m.def("get_all_op_protos", []() -> std::vector<py::bytes> { m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
auto &op_info_map = OpRegistry::op_info_map();
std::vector<py::bytes> ret_values; std::vector<py::bytes> ret_values;
for (auto it = op_info_map.begin(); it != op_info_map.end(); ++it) {
const OpProto *proto = it->second.proto_; OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type,
if (proto == nullptr) { const OpInfo &info) {
continue; if (!info.HasOpProtoAndChecker()) return;
}
PADDLE_ENFORCE(proto->IsInitialized(), "OpProto must all be initialized");
std::string str; std::string str;
PADDLE_ENFORCE(proto->SerializeToString(&str), PADDLE_ENFORCE(info.Proto().SerializeToString(&str),
"Serialize OpProto Error. This could be a bug of Paddle."); "Serialize OpProto Error. This could be a bug of Paddle.");
ret_values.push_back(py::bytes(str)); ret_values.emplace_back(str);
} });
return ret_values; return ret_values;
}); });
m.def_submodule( m.def_submodule(
...@@ -222,8 +226,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -222,8 +226,10 @@ All parameter, weight, gradient are variables in Paddle.
retv->SetType("plain_net"); retv->SetType("plain_net");
return retv; return retv;
}) })
.def("add_op", [](operators::NetOp &self, .def("append_op",
const OperatorBase &op) { self.AddOp(op); }) [](operators::NetOp &self, const OperatorBase &op) {
self.AppendOp(op);
})
.def("complete_add_op", &operators::NetOp::CompleteAddOp) .def("complete_add_op", &operators::NetOp::CompleteAddOp)
.def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) { .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
self->CompleteAddOp(); self->CompleteAddOp();
...@@ -243,10 +249,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -243,10 +249,9 @@ All parameter, weight, gradient are variables in Paddle.
auto rnn_op = OpRegistry::CreateOp(desc); auto rnn_op = OpRegistry::CreateOp(desc);
return static_cast<operators::RecurrentOp *>(rnn_op.release()); return static_cast<operators::RecurrentOp *>(rnn_op.release());
}) })
.def("set_stepnet", [](operators::RecurrentOp &self, .def("set_stepnet",
const operators::NetOp &net) -> void { [](operators::RecurrentOp &self, const operators::NetOp &net)
self.set_stepnet(net.Clone()); -> void { self.set_stepnet(net.Clone()); });
});
m.def("unique_integer", UniqueIntegerGenerator); m.def("unique_integer", UniqueIntegerGenerator);
......
...@@ -63,8 +63,11 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -63,8 +63,11 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
} }
return py::buffer_info( return py::buffer_info(
dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()), dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(), sizeof(CUR_TYPE),
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides); py::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()),
dims_outside,
strides);
} else { } else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value; constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor); return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
...@@ -107,8 +110,8 @@ void PyCUDATensorSetFromArray( ...@@ -107,8 +110,8 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), paddle::platform::GpuMemcpySync(
cudaMemcpyHostToDevice); dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
} }
#endif #endif
......
...@@ -38,7 +38,7 @@ Configuring cmake in /paddle/build ... ...@@ -38,7 +38,7 @@ Configuring cmake in /paddle/build ...
-DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-DCUDNN_ROOT=/usr/ -DCUDNN_ROOT=/usr/
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-DWITH_TESTING=${WITH_TESTING:-OFF} -DWITH_TESTING=${WITH_TESTING:-ON}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
...@@ -56,19 +56,18 @@ cmake .. \ ...@@ -56,19 +56,18 @@ cmake .. \
-DWITH_C_API=${WITH_C_API:-OFF} \ -DWITH_C_API=${WITH_C_API:-OFF} \
-DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-DCUDNN_ROOT=/usr/ \ -DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-OFF} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
cat <<EOF cat <<EOF
============================================ ============================================
Building in /paddle/build ... Building in /paddle/build ...
Build unit tests: ${WITH_TESTING:-OFF}
============================================ ============================================
EOF EOF
make -j `nproc` make -j `nproc`
if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Running unit tests ... Running unit tests ...
......
...@@ -82,6 +82,9 @@ message ConvConfig { ...@@ -82,6 +82,9 @@ message ConvConfig {
// if not set, use img_size // if not set, use img_size
optional uint32 img_size_y = 14; optional uint32 img_size_y = 14;
optional uint32 dilation = 15 [ default = 1 ];
optional uint32 dilation_y = 16 [ default = 1 ];
} }
message PoolConfig { message PoolConfig {
...@@ -496,6 +499,9 @@ message LayerConfig { ...@@ -496,6 +499,9 @@ message LayerConfig {
optional int32 axis = 54 [ default = 2 ]; optional int32 axis = 54 [ default = 2 ];
repeated uint32 offset = 55; repeated uint32 offset = 55;
repeated uint32 shape = 56; repeated uint32 shape = 56;
// for HuberRegressionLoss
optional double delta = 57 [ default = 1.0 ];
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -338,7 +338,8 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name, ...@@ -338,7 +338,8 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
in_links_count += 1 in_links_count += 1
layer_name = MakeLayerNameInParentSubmodel(name) layer_name = MakeLayerNameInParentSubmodel(name)
layer = g_layer_map[layer_name] layer = g_layer_map[layer_name]
ScatterAgentLayer(name=name, size=layer.size) ScatterAgentLayer(
name=name, size=layer.size, width=layer.width, height=layer.height)
pair = g_current_submodel.in_links.add() pair = g_current_submodel.in_links.add()
pair.layer_name = layer_name pair.layer_name = layer_name
...@@ -869,12 +870,16 @@ class Conv(Cfg): ...@@ -869,12 +870,16 @@ class Conv(Cfg):
caffe_mode=True, caffe_mode=True,
filter_size_y=None, filter_size_y=None,
padding_y=None, padding_y=None,
stride_y=None): stride_y=None,
dilation=None,
dilation_y=None):
self.add_keys(locals()) self.add_keys(locals())
if filter_size_y is None: if filter_size_y is None:
self.filter_size_y = filter_size self.filter_size_y = filter_size
if padding_y is None: if padding_y is None:
self.padding_y = padding self.padding_y = padding
if dilation_y is None:
self.dilation_y = dilation
if stride_y is None: if stride_y is None:
self.stride_y = stride self.stride_y = stride
if output_x is not None: if output_x is not None:
...@@ -2197,8 +2202,8 @@ class MaxOutLayer(LayerBase): ...@@ -2197,8 +2202,8 @@ class MaxOutLayer(LayerBase):
maxout_conf = self.config.inputs[0].maxout_conf maxout_conf = self.config.inputs[0].maxout_conf
parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf) parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
out_channels = maxout_conf.image_conf.channels / maxout_conf.groups out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
self.set_cnn_layer(name, g_layer_map[input_layer.name].height, self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
g_layer_map[input_layer.name].width, out_channels) maxout_conf.image_conf.img_size, out_channels)
@config_layer('row_conv') @config_layer('row_conv')
...@@ -2232,6 +2237,20 @@ class ClipLayer(LayerBase): ...@@ -2232,6 +2237,20 @@ class ClipLayer(LayerBase):
self.config.inputs[0].clip_conf.max = max self.config.inputs[0].clip_conf.max = max
@config_layer('scale_shift')
class ScaleShiftLayer(LayerBase):
def __init__(self, name, inputs, bias=True, **xargs):
super(ScaleShiftLayer, self).__init__(
name, 'scale_shift', 0, inputs=inputs, **xargs)
config_assert(
len(self.inputs) == 1,
'ScaleShiftLayer must have one and only one input.')
input_layer = self.get_input_layer(0)
self.set_layer_size(input_layer.size)
self.create_input_parameter(0, 1, [1, 1])
self.create_bias_parameter(bias, 1)
# key: cost type # key: cost type
# value: cost class # value: cost class
g_cost_map = {} g_cost_map = {}
...@@ -2255,7 +2274,7 @@ define_cost('PnpairValidation', 'pnpair-validation') ...@@ -2255,7 +2274,7 @@ define_cost('PnpairValidation', 'pnpair-validation')
define_cost('SumOfSquaresCostLayer', 'square_error') define_cost('SumOfSquaresCostLayer', 'square_error')
define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy') define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy') define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
define_cost('HuberTwoClass', 'huber') define_cost('HuberTwoClassification', 'huber_classification')
define_cost('SumCost', 'sum_cost') define_cost('SumCost', 'sum_cost')
define_cost('SmoothL1Cost', 'smooth_l1') define_cost('SmoothL1Cost', 'smooth_l1')
...@@ -2317,6 +2336,17 @@ class LambdaCost(LayerBase): ...@@ -2317,6 +2336,17 @@ class LambdaCost(LayerBase):
self.config.max_sort_size = max_sort_size self.config.max_sort_size = max_sort_size
@config_layer('huber_regression')
class HuberRegressionLoss(LayerBase):
def __init__(self, name, inputs, delta=1., coeff=1., device=None):
super(HuberRegressionLoss, self).__init__(
name, 'huber_regression', 1, inputs=inputs, device=device)
config_assert(
len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
self.config.delta = delta
self.config.coeff = coeff
@config_layer('nce') @config_layer('nce')
class NCELayer(LayerBase): class NCELayer(LayerBase):
def __init__(self, def __init__(self,
...@@ -2391,9 +2421,11 @@ class GatherAgentLayer(LayerBase): ...@@ -2391,9 +2421,11 @@ class GatherAgentLayer(LayerBase):
@config_layer('scatter_agent') @config_layer('scatter_agent')
class ScatterAgentLayer(LayerBase): class ScatterAgentLayer(LayerBase):
def __init__(self, name, size, device=None): def __init__(self, name, size, width=None, height=None, device=None):
super(ScatterAgentLayer, self).__init__( super(ScatterAgentLayer, self).__init__(
name, 'scatter_agent', size, inputs=[], device=device) name, 'scatter_agent', size, inputs=[], device=device)
if height and width:
self.set_layer_height_width(height, width)
@config_layer('multiplex') @config_layer('multiplex')
...@@ -2677,6 +2709,49 @@ class SubSequenceLayer(LayerBase): ...@@ -2677,6 +2709,49 @@ class SubSequenceLayer(LayerBase):
self.create_bias_parameter(bias, size) self.create_bias_parameter(bias, size)
@config_layer('seq_slice')
class SeqSliceLayer(LayerBase):
def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
if isinstance(inputs, list):
assert len(inputs) == 1, ('the first input of sequence slice layer '
'is a single sequence input.')
else:
inputs = [inputs]
if starts is not None:
if isinstance(starts, list):
assert len(starts) == 1, (
'the start indices for sequence slice layer cannot '
'be a list having more than one element.')
starts = starts[0]
inputs.append(starts)
if ends is not None:
if isinstance(ends, list):
assert len(ends) == 1, (
'the end indices for sequence slice layer cannot '
'be a list having more than one element.')
ends = ends[0]
inputs.append(ends)
assert len(inputs) >= 2, (
'the sequence slice layer has at least two inputs.')
super(SeqSliceLayer, self).__init__(
name, 'seq_slice', 0, inputs=inputs, **xargs)
input_layer0 = self.get_input_layer(0)
size = input_layer0.size
self.set_layer_size(size)
if len(inputs) == 3:
assert (
self.get_input_layer(1).size == self.get_input_layer(2).size), (
'If start and end indices are both given to'
'sequence slice layer, they should have the same width.')
elif len(inputs) == 2:
self.config.select_first = (starts is not None)
@config_layer('sub_nested_seq') @config_layer('sub_nested_seq')
class SubNestedSequenceLayer(LayerBase): class SubNestedSequenceLayer(LayerBase):
def __init__(self, name, inputs, selected_indices, bias=False, **xargs): def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
......
...@@ -16,11 +16,13 @@ import functools ...@@ -16,11 +16,13 @@ import functools
import collections import collections
import inspect import inspect
import paddle.trainer.config_parser as cp
from paddle.trainer.config_parser import * from paddle.trainer.config_parser import *
from .activations import LinearActivation, SigmoidActivation, TanhActivation, \ from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
from .evaluators import * from .evaluators import *
from .poolings import MaxPooling, AvgPooling, BasePoolingType from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
CudnnAvgPooling, CudnnMaxPooling
from .attrs import * from .attrs import *
from .default_decorators import * from .default_decorators import *
...@@ -108,7 +110,8 @@ __all__ = [ ...@@ -108,7 +110,8 @@ __all__ = [
'sum_cost', 'sum_cost',
'rank_cost', 'rank_cost',
'lambda_cost', 'lambda_cost',
'huber_cost', 'huber_regression_cost',
'huber_classification_cost',
'block_expand_layer', 'block_expand_layer',
'maxout_layer', 'maxout_layer',
'out_prod_layer', 'out_prod_layer',
...@@ -132,7 +135,9 @@ __all__ = [ ...@@ -132,7 +135,9 @@ __all__ = [
'sub_nested_seq_layer', 'sub_nested_seq_layer',
'clip_layer', 'clip_layer',
'slice_projection', 'slice_projection',
'seq_slice_layer',
'kmax_sequence_score_layer', 'kmax_sequence_score_layer',
'scale_shift_layer',
] ]
...@@ -216,7 +221,8 @@ class LayerType(object): ...@@ -216,7 +221,8 @@ class LayerType(object):
RANK_COST = 'rank-cost' RANK_COST = 'rank-cost'
LAMBDA_COST = 'lambda_cost' LAMBDA_COST = 'lambda_cost'
HUBER = 'huber' HUBER_REGRESSION = 'huber_regression'
HUBER_CLASSIFICATION = 'huber_classification'
CROSS_ENTROPY = 'multi-class-cross-entropy' CROSS_ENTROPY = 'multi-class-cross-entropy'
CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm' CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy' SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
...@@ -228,8 +234,10 @@ class LayerType(object): ...@@ -228,8 +234,10 @@ class LayerType(object):
CROP_LAYER = 'crop' CROP_LAYER = 'crop'
SUB_NESTED_SEQ = 'sub_nested_seq' SUB_NESTED_SEQ = 'sub_nested_seq'
CLIP_LAYER = 'clip' CLIP_LAYER = 'clip'
SEQ_SLICE = 'seq_slice'
KMAX_SEQ_SCORE = 'kmax_seq_score' KMAX_SEQ_SCORE = 'kmax_seq_score'
SCALE_SHIFT_LAYER = 'scale_shift'
@staticmethod @staticmethod
def is_layer_type(type_name): def is_layer_type(type_name):
...@@ -328,6 +336,14 @@ class LayerOutput(object): ...@@ -328,6 +336,14 @@ class LayerOutput(object):
self.outputs = outputs self.outputs = outputs
self.reverse = reverse self.reverse = reverse
@property
def width(self):
return cp.g_layer_map[self.full_name].width
@property
def height(self):
return cp.g_layer_map[self.full_name].height
def set_input(self, input): def set_input(self, input):
""" """
Set the input for a memory layer. Can only be used for memory layer Set the input for a memory layer. Can only be used for memory layer
...@@ -909,7 +925,13 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): ...@@ -909,7 +925,13 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
width=width, width=width,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(name, LayerType.DATA, size=size) num_filters = None
if height is not None and width is not None:
num_filters = size / (width * height)
assert num_filters * width * height == size, \
"size=%s width=%s height=%s" % (size, width, height)
return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
@wrap_name_default("embedding") @wrap_name_default("embedding")
...@@ -2322,6 +2344,7 @@ def img_conv_layer(input, ...@@ -2322,6 +2344,7 @@ def img_conv_layer(input,
groups=1, groups=1,
stride=1, stride=1,
padding=0, padding=0,
dilation=1,
bias_attr=None, bias_attr=None,
param_attr=None, param_attr=None,
shared_biases=True, shared_biases=True,
...@@ -2329,6 +2352,7 @@ def img_conv_layer(input, ...@@ -2329,6 +2352,7 @@ def img_conv_layer(input,
filter_size_y=None, filter_size_y=None,
stride_y=None, stride_y=None,
padding_y=None, padding_y=None,
dilation_y=None,
trans=False, trans=False,
layer_type=None): layer_type=None):
""" """
...@@ -2393,6 +2417,11 @@ def img_conv_layer(input, ...@@ -2393,6 +2417,11 @@ def img_conv_layer(input,
:type padding: int|tuple|list :type padding: int|tuple|list
:param padding_y: The y dimension of the padding. :param padding_y: The y dimension of the padding.
:type padding_y: int :type padding_y: int
:param dilation: The x dimension of the dilation. Or input a tuple for two
image dimension
:type dilation: int|tuple|list
:param dilation_y: The y dimension of the dilation.
:type dilation_y: int
:param bias_attr: Convolution bias attribute. None means default bias. :param bias_attr: Convolution bias attribute. None means default bias.
False means no bias. False means no bias.
:type bias_attr: ParameterAttribute|False :type bias_attr: ParameterAttribute|False
...@@ -2440,6 +2469,13 @@ def img_conv_layer(input, ...@@ -2440,6 +2469,13 @@ def img_conv_layer(input,
else: else:
padding_y = padding padding_y = padding
if dilation_y is None:
if isinstance(dilation, collections.Sequence):
assert len(dilation) == 2
dilation, dilation_y = dilation
else:
dilation_y = dilation
if param_attr.attr.get('initial_smart'): if param_attr.attr.get('initial_smart'):
# special initial for conv layers. # special initial for conv layers.
init_w = (2.0 / (filter_size**2 * num_channels))**0.5 init_w = (2.0 / (filter_size**2 * num_channels))**0.5
...@@ -2449,6 +2485,8 @@ def img_conv_layer(input, ...@@ -2449,6 +2485,8 @@ def img_conv_layer(input,
param_attr.attr["initial_smart"] = False param_attr.attr["initial_smart"] = False
if layer_type: if layer_type:
if dilation > 1 or dilation_y > 1:
assert layer_type in ["cudnn_conv", "cudnn_convt"]
if trans: if trans:
assert layer_type in ["exconvt", "cudnn_convt"] assert layer_type in ["exconvt", "cudnn_convt"]
else: else:
...@@ -2464,11 +2502,13 @@ def img_conv_layer(input, ...@@ -2464,11 +2502,13 @@ def img_conv_layer(input,
conv=Conv( conv=Conv(
filter_size=filter_size, filter_size=filter_size,
padding=padding, padding=padding,
dilation=dilation,
stride=stride, stride=stride,
channels=num_channels, channels=num_channels,
groups=groups, groups=groups,
filter_size_y=filter_size_y, filter_size_y=filter_size_y,
padding_y=padding_y, padding_y=padding_y,
dilation_y=dilation_y,
stride_y=stride_y), stride_y=stride_y),
**param_attr.attr), **param_attr.attr),
active_type=act.name, active_type=act.name,
...@@ -2574,11 +2614,14 @@ def img_pool_layer(input, ...@@ -2574,11 +2614,14 @@ def img_pool_layer(input,
elif isinstance(pool_type, AvgPooling): elif isinstance(pool_type, AvgPooling):
pool_type.name = 'avg' pool_type.name = 'avg'
assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
CudnnMaxPooling], \
"only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
type_name = pool_type.name + '-projection' \ type_name = pool_type.name + '-projection' \
if ( if (
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name else pool_type.name
pool_size_y = pool_size if pool_size_y is None else pool_size_y pool_size_y = pool_size if pool_size_y is None else pool_size_y
stride_y = stride if stride_y is None else stride_y stride_y = stride if stride_y is None else stride_y
padding_y = padding if padding_y is None else padding_y padding_y = padding if padding_y is None else padding_y
...@@ -4202,8 +4245,7 @@ def conv_operator(img, ...@@ -4202,8 +4245,7 @@ def conv_operator(img,
num_channels = img.num_filters num_channels = img.num_filters
assert isinstance(filter, LayerOutput) assert isinstance(filter, LayerOutput)
if filter.size is not None: assert filter.size is not None
filter.size = filter_size * filter_size_y * num_filters * num_channels
opCls = ConvTransOperator if trans else ConvOperator opCls = ConvTransOperator if trans else ConvOperator
...@@ -4914,7 +4956,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): ...@@ -4914,7 +4956,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
:return: LayerOutput object. :return: LayerOutput object.
:rtype: LayerOutput :rtype: LayerOutput
""" """
assert input.layer_type == LayerType.CONV_LAYER
assert isinstance(input.activation, LinearActivation) assert isinstance(input.activation, LinearActivation)
assert groups > 1 assert groups > 1
if num_channels is None: if num_channels is None:
...@@ -5605,16 +5646,77 @@ def sum_cost(input, name=None, layer_attr=None): ...@@ -5605,16 +5646,77 @@ def sum_cost(input, name=None, layer_attr=None):
@wrap_name_default() @wrap_name_default()
@layer_support() @layer_support()
def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): def huber_regression_cost(input,
label,
name=None,
delta=1.0,
coeff=1.0,
layer_attr=None):
""" """
A loss layer for huber loss. In statistics, the Huber loss is a loss function used in robust regression,
that is less sensitive to outliers in data than the squared error loss.
Given a prediction f(x), a label y and :math:`\delta`, the loss function
is defined as:
.. math:
loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta
loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise
The example usage is: The example usage is:
.. code-block:: python .. code-block:: python
cost = huber_cost(input=input_layer, cost = huber_regression_cost(input=input_layer, label=label_layer)
label=label_layer)
:param input: The first input layer.
:type input: LayerOutput.
:param label: The input label.
:type input: LayerOutput.
:param name: The name of this layers. It is not necessary.
:type name: None|basestring.
:param delta: The difference between the observed and predicted values.
:type delta: float.
:param coeff: The coefficient affects the gradient in the backward.
:type coeff: float.
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput.
"""
assert isinstance(input, LayerOutput)
Layer(
name=name,
type=LayerType.HUBER_REGRESSION,
inputs=[input.name, label.name],
delta=delta,
coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
@wrap_name_default()
@layer_support()
def huber_classification_cost(input,
label,
name=None,
coeff=1.0,
layer_attr=None):
"""
For classification purposes, a variant of the Huber loss called modified Huber
is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber
loss is defined as:
.. math:
loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1
loss = -4yf(x), \text{otherwise}
The example usage is:
.. code-block:: python
cost = huber_classification_cost(input=input_layer, label=label_layer)
:param input: The first input layer. :param input: The first input layer.
:type input: LayerOutput. :type input: LayerOutput.
...@@ -5634,11 +5736,12 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): ...@@ -5634,11 +5736,12 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
assert input.size == 1 assert input.size == 1
Layer( Layer(
name=name, name=name,
type=LayerType.HUBER, type=LayerType.HUBER_CLASSIFICATION,
inputs=[input.name, label.name], inputs=[input.name, label.name],
coeff=coeff, coeff=coeff,
**ExtraLayerAttribute.to_kwargs(layer_attr)) **ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(name, LayerType.HUBER, parents=[input, label], size=1) return LayerOutput(
name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
@wrap_name_default() @wrap_name_default()
...@@ -6174,6 +6277,72 @@ def clip_layer(input, min, max, name=None): ...@@ -6174,6 +6277,72 @@ def clip_layer(input, min, max, name=None):
name, LayerType.CLIP_LAYER, parents=[input], size=input.size) name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
@wrap_name_default()
def seq_slice_layer(input, starts, ends, name=None):
"""
seq_slice_layer will return one or several sub-sequences from the
input sequence layer given start and end indices.
- If only start indices are given, and end indices are set to None,
this layer slices the input sequence from the given start indices
to its end.
- If only end indices are given, and start indices are set to None,
this layer slices the input sequence from its beginning to the
given end indices.
- If start and end indices are both given, they should have the same
number of elements.
If start or end indices contains more than one elements, the input sequence
will be sliced for multiple times.
.. code-block:: python
seq_silce = seq_slice_layer(input=input_seq,
starts=start_pos, ends=end_pos)
:param name: name of this layer.
:type name: basestring
:param input: input for this layer, it should be a sequence.
:type input: LayerOutput
:param starts: start indices to slice the input sequence.
:type starts: LayerOutput|None
:param ends: end indices to slice the input sequence.
:type ends: LayerOutput|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput), (
'The first input of seq_slice layer must be a PaddlePaddle layer.')
if starts is not None:
assert isinstance(starts, LayerOutput), (
'The start indices for seq_slice layer '
'must be a PaddlePaddle layer.')
if ends is not None:
assert isinstance(ends, LayerOutput), (
'The end indices for seq_slice layer must be a PaddlePaddle layer.')
assert starts is not None or ends is not None, (
'start and end indices '
'cannot be set to None at the same time, at least one of '
'them should be given.')
if starts is not None and ends is not None:
assert starts.size == ends.size, (
'If start and end indices are both given to seq_slice_layer, '
'they should have the same width.')
Layer(
name=name,
type=LayerType.SEQ_SLICE,
inputs=input.name,
starts=starts.name if starts is not None else None,
ends=ends.name if ends is not None else None)
return LayerOutput(
name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
@wrap_name_default() @wrap_name_default()
@layer_support() @layer_support()
def kmax_sequence_score_layer(input, name=None, beam_size=1): def kmax_sequence_score_layer(input, name=None, beam_size=1):
...@@ -6210,3 +6379,43 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1): ...@@ -6210,3 +6379,43 @@ def kmax_sequence_score_layer(input, name=None, beam_size=1):
return LayerOutput( return LayerOutput(
name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size) name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
@wrap_name_default("scale_shift")
@wrap_param_attr_default()
@wrap_bias_attr_default()
def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
"""
A layer applies a linear transformation to each element in each row of
the input matrix. For each element, the layer first re-scale it and then
adds a bias to it.
This layer is very like the SlopeInterceptLayer, except the scale and
bias are trainable.
.. math::
y = w * x + b
.. code-block:: python
scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
:param name: The Layer Name.
:type name: basestring
:param input: The input layer.
:type input: LayerOutput.
:param param_attr: The parameter attribute of scaling.
:type param_attr: ParameterAttribute
:param bias_attr: The parameter attribute of shifting.
:type bias_attr: ParameterAttribute
:return: LayerOutput object.
:rtype: LayerOutput
"""
Layer(
name=name,
type=LayerType.SCALE_SHIFT_LAYER,
inputs=Input(input.name, **param_attr.attr),
bias=ParamAttr.to_bias(bias_attr))
return LayerOutput(
name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
...@@ -8,6 +8,7 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops ...@@ -8,6 +8,7 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
test_kmax_seq_socre_layer test_seq_select_layers) test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer
test_seq_slice_layer)
export whole_configs=(test_split_datasource) export whole_configs=(test_split_datasource)
...@@ -12,6 +12,7 @@ img_conv = img_conv_layer( ...@@ -12,6 +12,7 @@ img_conv = img_conv_layer(
num_filters=64, num_filters=64,
filter_size=(32, 32), filter_size=(32, 32),
padding=(1, 1), padding=(1, 1),
dilation=(1, 1),
stride=(1, 1), stride=(1, 1),
act=LinearActivation()) act=LinearActivation())
img_bn = batch_norm_layer(input=img_conv, act=ReluActivation()) img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
......
...@@ -167,6 +167,20 @@ layers { ...@@ -167,6 +167,20 @@ layers {
softmax_selfnorm_alpha: 0.1 softmax_selfnorm_alpha: 0.1
coeff: 1.0 coeff: 1.0
} }
layers {
name: "__huber_regression_cost_0__"
type: "huber_regression"
size: 1
active_type: ""
inputs {
input_layer_name: "input"
}
inputs {
input_layer_name: "labels"
}
coeff: 1.0
delta: 1.0
}
layers { layers {
name: "huber_probs" name: "huber_probs"
type: "data" type: "data"
...@@ -180,8 +194,8 @@ layers { ...@@ -180,8 +194,8 @@ layers {
active_type: "" active_type: ""
} }
layers { layers {
name: "__huber_cost_0__" name: "__huber_classification_cost_0__"
type: "huber" type: "huber_classification"
size: 1 size: 1
active_type: "" active_type: ""
inputs { inputs {
...@@ -300,7 +314,8 @@ output_layer_names: "__rank_cost_0__" ...@@ -300,7 +314,8 @@ output_layer_names: "__rank_cost_0__"
output_layer_names: "__lambda_cost_0__" output_layer_names: "__lambda_cost_0__"
output_layer_names: "__cross_entropy_0__" output_layer_names: "__cross_entropy_0__"
output_layer_names: "__cross_entropy_with_selfnorm_0__" output_layer_names: "__cross_entropy_with_selfnorm_0__"
output_layer_names: "__huber_cost_0__" output_layer_names: "__huber_regression_cost_0__"
output_layer_names: "__huber_classification_cost_0__"
output_layer_names: "__multi_binary_label_cross_entropy_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__"
output_layer_names: "__sum_cost_0__" output_layer_names: "__sum_cost_0__"
output_layer_names: "__nce_layer_0__" output_layer_names: "__nce_layer_0__"
...@@ -324,9 +339,10 @@ sub_models { ...@@ -324,9 +339,10 @@ sub_models {
layer_names: "__lambda_cost_0__" layer_names: "__lambda_cost_0__"
layer_names: "__cross_entropy_0__" layer_names: "__cross_entropy_0__"
layer_names: "__cross_entropy_with_selfnorm_0__" layer_names: "__cross_entropy_with_selfnorm_0__"
layer_names: "__huber_regression_cost_0__"
layer_names: "huber_probs" layer_names: "huber_probs"
layer_names: "huber_label" layer_names: "huber_label"
layer_names: "__huber_cost_0__" layer_names: "__huber_classification_cost_0__"
layer_names: "__multi_binary_label_cross_entropy_0__" layer_names: "__multi_binary_label_cross_entropy_0__"
layer_names: "__sum_cost_0__" layer_names: "__sum_cost_0__"
layer_names: "__nce_layer_0__" layer_names: "__nce_layer_0__"
...@@ -349,7 +365,8 @@ sub_models { ...@@ -349,7 +365,8 @@ sub_models {
output_layer_names: "__lambda_cost_0__" output_layer_names: "__lambda_cost_0__"
output_layer_names: "__cross_entropy_0__" output_layer_names: "__cross_entropy_0__"
output_layer_names: "__cross_entropy_with_selfnorm_0__" output_layer_names: "__cross_entropy_with_selfnorm_0__"
output_layer_names: "__huber_cost_0__" output_layer_names: "__huber_regression_cost_0__"
output_layer_names: "__huber_classification_cost_0__"
output_layer_names: "__multi_binary_label_cross_entropy_0__" output_layer_names: "__multi_binary_label_cross_entropy_0__"
output_layer_names: "__sum_cost_0__" output_layer_names: "__sum_cost_0__"
output_layer_names: "__nce_layer_0__" output_layer_names: "__nce_layer_0__"
......
type: "nn" type: "nn"
layers { layers {
name: "input" name: "input_seq"
type: "data"
size: 300
active_type: ""
}
layers {
name: "data"
type: "data" type: "data"
size: 128 size: 128
active_type: "" active_type: ""
...@@ -17,7 +11,7 @@ layers { ...@@ -17,7 +11,7 @@ layers {
size: 1 size: 1
active_type: "exponential" active_type: "exponential"
inputs { inputs {
input_layer_name: "data" input_layer_name: "input_seq"
input_parameter_name: "___fc_layer_0__.w0" input_parameter_name: "___fc_layer_0__.w0"
} }
bias_parameter_name: "___fc_layer_0__.wbias" bias_parameter_name: "___fc_layer_0__.wbias"
...@@ -51,15 +45,14 @@ parameters { ...@@ -51,15 +45,14 @@ parameters {
initial_strategy: 0 initial_strategy: 0
initial_smart: false initial_smart: false
} }
input_layer_names: "data" input_layer_names: "input_seq"
output_layer_names: "__kmax_sequence_score_layer_0__" output_layer_names: "__kmax_sequence_score_layer_0__"
sub_models { sub_models {
name: "root" name: "root"
layer_names: "input" layer_names: "input_seq"
layer_names: "data"
layer_names: "__fc_layer_0__" layer_names: "__fc_layer_0__"
layer_names: "__kmax_sequence_score_layer_0__" layer_names: "__kmax_sequence_score_layer_0__"
input_layer_names: "data" input_layer_names: "input_seq"
output_layer_names: "__kmax_sequence_score_layer_0__" output_layer_names: "__kmax_sequence_score_layer_0__"
is_recurrent_layer_group: false is_recurrent_layer_group: false
} }
......
type: "nn"
layers {
name: "data"
type: "data"
size: 100
active_type: ""
}
layers {
name: "__scale_shift_0__"
type: "scale_shift"
size: 100
active_type: ""
inputs {
input_layer_name: "data"
input_parameter_name: "___scale_shift_0__.w0"
}
}
layers {
name: "__scale_shift_1__"
type: "scale_shift"
size: 100
active_type: ""
inputs {
input_layer_name: "data"
input_parameter_name: "___scale_shift_1__.w0"
}
bias_parameter_name: "___scale_shift_1__.wbias"
}
parameters {
name: "___scale_shift_0__.w0"
size: 1
initial_mean: 0.0
initial_std: 1.0
dims: 1
dims: 1
initial_strategy: 0
initial_smart: true
}
parameters {
name: "___scale_shift_1__.w0"
size: 1
initial_mean: 0.0
initial_std: 1.0
dims: 1
dims: 1
initial_strategy: 0
initial_smart: true
}
parameters {
name: "___scale_shift_1__.wbias"
size: 1
initial_mean: 0.0
initial_std: 0.0
dims: 1
dims: 1
initial_strategy: 0
initial_smart: false
}
input_layer_names: "data"
output_layer_names: "__scale_shift_0__"
output_layer_names: "__scale_shift_1__"
sub_models {
name: "root"
layer_names: "data"
layer_names: "__scale_shift_0__"
layer_names: "__scale_shift_1__"
input_layer_names: "data"
output_layer_names: "__scale_shift_0__"
output_layer_names: "__scale_shift_1__"
is_recurrent_layer_group: false
}
type: "nn"
layers {
name: "word"
type: "data"
size: 128
active_type: ""
}
layers {
name: "starts"
type: "data"
size: 5
active_type: ""
}
layers {
name: "ends"
type: "data"
size: 5
active_type: ""
}
layers {
name: "__seq_slice_layer_0__"
type: "seq_slice"
size: 128
active_type: ""
inputs {
input_layer_name: "word"
}
inputs {
input_layer_name: "starts"
}
inputs {
input_layer_name: "ends"
}
}
layers {
name: "__seq_slice_layer_1__"
type: "seq_slice"
size: 128
active_type: ""
inputs {
input_layer_name: "word"
}
inputs {
input_layer_name: "starts"
}
select_first: true
}
layers {
name: "__seq_slice_layer_2__"
type: "seq_slice"
size: 128
active_type: ""
inputs {
input_layer_name: "word"
}
inputs {
input_layer_name: "ends"
}
select_first: false
}
input_layer_names: "word"
output_layer_names: "__seq_slice_layer_0__"
output_layer_names: "__seq_slice_layer_1__"
output_layer_names: "__seq_slice_layer_2__"
sub_models {
name: "root"
layer_names: "word"
layer_names: "starts"
layer_names: "ends"
layer_names: "__seq_slice_layer_0__"
layer_names: "__seq_slice_layer_1__"
layer_names: "__seq_slice_layer_2__"
input_layer_names: "word"
output_layer_names: "__seq_slice_layer_0__"
output_layer_names: "__seq_slice_layer_1__"
output_layer_names: "__seq_slice_layer_2__"
is_recurrent_layer_group: false
}
...@@ -33,7 +33,9 @@ outputs( ...@@ -33,7 +33,9 @@ outputs(
input=probs, label=xe_label), input=probs, label=xe_label),
cross_entropy_with_selfnorm( cross_entropy_with_selfnorm(
input=probs, label=xe_label), input=probs, label=xe_label),
huber_cost( huber_regression_cost(
input=seq_in, label=labels),
huber_classification_cost(
input=data_layer( input=data_layer(
name='huber_probs', size=1), name='huber_probs', size=1),
label=data_layer( label=data_layer(
......
...@@ -2,9 +2,7 @@ ...@@ -2,9 +2,7 @@
#coding=utf-8 #coding=utf-8
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
data = data_layer(name='input', size=300) data = data_layer(name="input_seq", size=128)
data = data_layer(name="data", size=128)
scores = fc_layer(input=data, size=1, act=ExpActivation()) scores = fc_layer(input=data, size=1, act=ExpActivation())
kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5) kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
......
from paddle.trainer_config_helpers import *
data = data_layer(name='data', size=100)
scale = scale_shift_layer(input=data, bias_attr=False)
scale_shift = scale_shift_layer(input=data)
outputs(scale, scale_shift)
#!/usr/bin/env python
#coding=utf-8
from paddle.trainer_config_helpers import *
input_seq = data_layer("word", size=128)
starts = data_layer("starts", size=5)
ends = data_layer("ends", size=5)
seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
outputs(seq_slice1, seq_slice2, seq_slice3)
...@@ -13,6 +13,8 @@ py_test(test_add_two_op SRCS test_add_two_op.py) ...@@ -13,6 +13,8 @@ py_test(test_add_two_op SRCS test_add_two_op.py)
py_test(test_sigmoid_op SRCS test_sigmoid_op.py) py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
py_test(test_softmax_op SRCS test_softmax_op.py) py_test(test_softmax_op SRCS test_softmax_op.py)
py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py) py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
py_test(test_gather_op SRCS test_gather_op.py)
py_test(test_scatter_op SRCS test_scatter_op.py)
py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py) py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)
py_test(gradient_checker SRCS gradient_checker.py) py_test(gradient_checker SRCS gradient_checker.py)
...@@ -22,8 +24,11 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py) ...@@ -22,8 +24,11 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py) py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
py_test(test_operator SRCS test_operator.py) py_test(test_operator SRCS test_operator.py)
# py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py) py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
py_test(test_uniform_random_op SRCS test_uniform_random_op.py) py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
py_test(test_recurrent_op SRCS test_recurrent_op.py) py_test(test_recurrent_op SRCS test_recurrent_op.py)
py_test(test_sgd_op SRCS test_sgd_op.py) py_test(test_sgd_op SRCS test_sgd_op.py)
py_test(test_gradient_checker SRCS test_gradient_checker.py) py_test(test_gradient_checker SRCS test_gradient_checker.py)
py_test(test_lookup_table SRCS test_lookup_table.py)
py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
py_test(mnist SRCS mnist.py)
...@@ -23,12 +23,17 @@ def grad_var_name(var_name): ...@@ -23,12 +23,17 @@ def grad_var_name(var_name):
return var_name + "@GRAD" return var_name + "@GRAD"
def empty_var_name():
return "@EMPTY@"
def get_numeric_gradient(op, def get_numeric_gradient(op,
input_values, input_values,
output_name, output_name,
input_to_check, input_to_check,
delta=0.005, delta=0.005,
local_scope=None): local_scope=None,
in_place=False):
""" """
Get Numeric Gradient for an operator's input. Get Numeric Gradient for an operator's input.
...@@ -77,6 +82,11 @@ def get_numeric_gradient(op, ...@@ -77,6 +82,11 @@ def get_numeric_gradient(op,
def product(dim): def product(dim):
return reduce(lambda a, b: a * b, dim, 1) return reduce(lambda a, b: a * b, dim, 1)
def restore_inputs():
for var_name in input_values:
tensor_ = local_scope.find_var(var_name).get_tensor()
tensor_.set(numpy.copy(input_values[var_name]), core.CPUPlace())
# get the input tensor that we want to get it's numeric gradient. # get the input tensor that we want to get it's numeric gradient.
tensor_to_check = local_scope.find_var(input_to_check).get_tensor() tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
tensor_size = product(tensor_to_check.get_dims()) tensor_size = product(tensor_to_check.get_dims())
...@@ -86,6 +96,8 @@ def get_numeric_gradient(op, ...@@ -86,6 +96,8 @@ def get_numeric_gradient(op,
# we only compute gradient of one element each time. # we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element. # we use a for loop to compute the gradient of every element.
for i in xrange(tensor_size): for i in xrange(tensor_size):
if in_place:
restore_inputs()
# get one input element throw it's index i. # get one input element throw it's index i.
origin = tensor_to_check.get_float_element(i) origin = tensor_to_check.get_float_element(i)
...@@ -95,6 +107,8 @@ def get_numeric_gradient(op, ...@@ -95,6 +107,8 @@ def get_numeric_gradient(op,
y_pos = get_output() y_pos = get_output()
# plus delta to this element, run op and get the sum of the result tensor. # plus delta to this element, run op and get the sum of the result tensor.
if in_place:
restore_inputs()
x_neg = origin - delta x_neg = origin - delta
tensor_to_check.set_float_element(i, x_neg) tensor_to_check.set_float_element(i, x_neg)
y_neg = get_output() y_neg = get_output()
...@@ -160,8 +174,13 @@ class GradientChecker(unittest.TestCase): ...@@ -160,8 +174,13 @@ class GradientChecker(unittest.TestCase):
grad_tensor.set(data, place) grad_tensor.set(data, place)
# run backward op # run backward op
for name in backward_op.outputs(): backward_outs = backward_op.outputs()
backward_names = [
item for key in backward_outs for item in backward_outs[key]
]
for name in backward_names:
scope.new_var(name) scope.new_var(name)
backward_op.infer_shape(scope) backward_op.infer_shape(scope)
backward_op.run(scope, ctx) backward_op.run(scope, ctx)
...@@ -171,7 +190,7 @@ class GradientChecker(unittest.TestCase): ...@@ -171,7 +190,7 @@ class GradientChecker(unittest.TestCase):
] ]
return outs return outs
def compare_grad(self, forward_op, input_value): def compare_grad(self, forward_op, input_value, no_grad_set=None):
""" Compare the input gradients between CPU and GPU for the given forward """ Compare the input gradients between CPU and GPU for the given forward
operator. operator.
...@@ -179,15 +198,20 @@ class GradientChecker(unittest.TestCase): ...@@ -179,15 +198,20 @@ class GradientChecker(unittest.TestCase):
:type forward_op: Operator :type forward_op: Operator
:param input_value: input values. :param input_value: input values.
:type input_value: dict{string:numpy.array} :type input_value: dict{string:numpy.array}
:param no_grad_set: the set of variables names without gradients.
:type no_grad_set: a set of string
:raises: AssertionError, there is different gradient value. :raises: AssertionError, there is different gradient value.
""" """
backward_op = core.Operator.backward(forward_op, set()) if no_grad_set is None:
no_grad_set = set()
backward_op = core.Operator.backward(forward_op, no_grad_set)
# return if not compile with GPU or not implementing GPU kernel # return if not compile with GPU or not implementing GPU kernel
if not (core.is_compile_gpu() and backward_op.support_gpu()): if not (core.is_compile_gpu() and backward_op.support_gpu()):
return return
outputs = backward_op.outputs() outputs = backward_op.outputs()
out_names = [item for k in outputs for item in outputs[k]] out_names = [item for k in outputs for item in outputs[k]]
out_names = filter(lambda x: x != empty_var_name(), out_names)
cpu_grads = self.__get_gradient(forward_op, backward_op, input_value, cpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
out_names, core.CPUPlace()) out_names, core.CPUPlace())
gpu_grads = self.__get_gradient(forward_op, backward_op, input_value, gpu_grads = self.__get_gradient(forward_op, backward_op, input_value,
...@@ -237,13 +261,14 @@ class GradientChecker(unittest.TestCase): ...@@ -237,13 +261,14 @@ class GradientChecker(unittest.TestCase):
output_name, output_name,
no_grad_set=None, no_grad_set=None,
only_cpu=False, only_cpu=False,
in_place=False,
max_relative_error=0.005): max_relative_error=0.005):
""" """
:param forward_op: used to create backward_op :param forward_op: used to create backward_op
:param input_vars: numpy value of input variable. The following :param input_vars: numpy value of input variable. The following
computation will use these variables. computation will use these variables.
:param inputs_to_check: inputs var names that should check gradient. :param inputs_to_check: inputs var names that should check gradient.
:param output_name: output name that used to :param output_name: the output variable name of forward network.
:param max_relative_error: The relative tolerance parameter. :param max_relative_error: The relative tolerance parameter.
:param no_grad_set: used when create backward ops :param no_grad_set: used when create backward ops
:param only_cpu: only compute and check gradient on cpu kernel. :param only_cpu: only compute and check gradient on cpu kernel.
...@@ -269,7 +294,8 @@ class GradientChecker(unittest.TestCase): ...@@ -269,7 +294,8 @@ class GradientChecker(unittest.TestCase):
# get numerical gradients # get numerical gradients
numeric_grads = [ numeric_grads = [
get_numeric_gradient(forward_op, input_vars, output_name, name) get_numeric_gradient(
forward_op, input_vars, output_name, name, in_place=in_place)
for name in inputs_to_check for name in inputs_to_check
] ]
......
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator
import numpy
import paddle.v2 as paddle
BATCH_SIZE = 100
scope = core.Scope()
place = core.CPUPlace()
# if you want to test GPU training, you can use gpu place
# place = core.GPUPlace(0)
dev_ctx = core.DeviceContext.create(place)
init_net = core.Net.create()
forward_net = core.Net.create()
backward_net = None
optimize_net = core.Net.create()
def atomic_id():
id = 0
while True:
yield id
id += 1
uniq_id = atomic_id().next
def data_layer(name, dims):
var = scope.new_var(name)
tensor = var.get_tensor()
tensor.set_dims(dims) # 1 is batch size holder.
return name
def feed_data(name, data):
assert isinstance(data, numpy.ndarray)
tensor = scope.find_var(name).get_tensor()
tensor.set_dims(data.shape)
if data.dtype == numpy.dtype('int32'):
tensor.alloc_int(place)
elif data.dtype == numpy.dtype('float32'):
tensor.alloc_float(place)
else:
raise ValueError("data type not supported")
tensor.set(data, place)
def grad_var_name(var_name):
return var_name + "@GRAD"
def sgd_optimizer(net, param_name, learning_rate=0.005):
grad_name = grad_var_name(param_name)
optimize_op = Operator(
"sgd",
param=param_name,
grad=grad_name,
param_out=param_name,
learning_rate=learning_rate)
net.append_op(optimize_op)
# should use operator and add these to the init_network
def init_param(net, param_name, dims):
scope.new_var(param_name)
op = Operator(
"uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
op.infer_shape(scope)
net.append_op(op)
# fc_layer
def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
"""
Add a fc layer to net
:param input: input variable name.
:type input: str
:param size: fully connected layer size.
:param act: activation name
:param param: parameter attribute, used for initialize parameters.
:param bias: bias attribute. False will not have a bias.
:param name: the name of fc layer. If not set, model will generate a
readable name
:return: output variable name.
"""
if name is None:
name = 'fc_%d' % uniq_id()
if not isinstance(name, str):
raise ValueError("name should be string")
input_dims = scope.find_var(input).get_tensor().get_dims()
w_name = param or name + ".w"
init_param(net=init_net, param_name=w_name, dims=[input_dims[1], size])
sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)
pre_activation = name + ".mul.out"
scope.new_var(pre_activation)
mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
net.append_op(mul_op)
# create bias variable if needed
if bias:
bias_name = name + ".b"
init_param(net=init_net, param_name=bias_name, dims=[size])
sgd_optimizer(
net=optimize_net, param_name=bias_name, learning_rate=0.001)
bias_out = name + ".rowwise_add.out"
scope.new_var(bias_out)
rowwise_append_op = Operator(
"rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
net.append_op(rowwise_append_op)
pre_activation = bias_out
activation_op = Operator(act, X=pre_activation, Y=name)
net.append_op(activation_op)
scope.new_var(name)
net.infer_shape(scope)
return name
def cross_entropy_layer(net, input, label):
cost_name = 'cross_entropy_%d' % uniq_id()
cross_entropy_op = Operator(
"onehot_cross_entropy", X=input, label=label, Y=cost_name)
net.append_op(cross_entropy_op)
scope.new_var(cost_name)
net.infer_shape(scope)
return cost_name
def create_backward_net(forward_net):
net = core.Operator.backward(forward_net, set())
for input in net.inputs()["all"]:
var = scope.new_var(input)
var.get_tensor()
for output in net.outputs()["all"]:
var = scope.new_var(output)
var.get_tensor()
return net
def debug_print_op(op):
print("===============" + op.type() + "==============")
print("***inputs:***")
for input in op.inputs()["all"]:
print input, scope.find_var(input).get_tensor().get_dims()
print("\n***outputs:***")
for output in op.outputs()["all"]:
print output, scope.find_var(output).get_tensor().get_dims()
print("")
print("")
def set_cost(cost):
cost_shape = numpy.array(scope.find_var(cost).get_tensor()).shape
cost_grad = \
scope.find_var(grad_var_name(cost)).get_tensor()
cost_grad.set_dims(cost_shape)
cost_grad.alloc_float(place)
cost_grad.set(numpy.ones(cost_shape).astype("float32"), place)
def get_cost_mean(cost):
cost_data = numpy.array(scope.find_var(cost).get_tensor())
return cost_data.sum() / len(cost_data)
def error_rate(predict, label):
predict_var = numpy.array(scope.find_var(predict).get_tensor()).argmax(
axis=1)
label = numpy.array(scope.find_var(label).get_tensor())
error_num = numpy.sum(predict_var != label)
return error_num / float(len(label))
images = data_layer(name='pixel', dims=[BATCH_SIZE, 784])
labels = data_layer(name='label', dims=[BATCH_SIZE])
fc1 = fc_layer(net=forward_net, input=images, size=100, act="sigmoid")
fc2 = fc_layer(net=forward_net, input=fc1, size=100, act="sigmoid")
predict = fc_layer(net=forward_net, input=fc2, size=10, act="softmax")
cost = cross_entropy_layer(net=forward_net, input=predict, label=labels)
init_net.complete_add_op(True)
forward_net.complete_add_op(True)
backward_net = create_backward_net(forward_net)
optimize_net.complete_add_op(True)
print(init_net)
print(forward_net)
print(backward_net)
print(optimize_net)
debug_print_op(forward_net)
debug_print_op(backward_net)
debug_print_op(optimize_net)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
def test(cost_name):
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
cost = []
error = []
for data in test_reader():
image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
feed_data(images, image_data)
feed_data(labels, label_data)
forward_net.infer_shape(scope)
forward_net.run(scope, dev_ctx)
cost.append(get_cost_mean(cost_name))
error.append(error_rate(predict, "label"))
print("cost=" + str(sum(cost) / float(len(cost))) + " error_rate=" + str(
sum(error) / float(len(error))))
PASS_NUM = 1
init_net.run(scope, dev_ctx)
for pass_id in range(PASS_NUM):
batch_id = 0
for data in train_reader():
image_data = numpy.array(map(lambda x: x[0], data)).astype("float32")
label_data = numpy.array(map(lambda x: x[1], data)).astype("int32")
feed_data(images, image_data)
feed_data(labels, label_data)
forward_net.infer_shape(scope)
forward_net.run(scope, dev_ctx)
set_cost(cost)
backward_net.infer_shape(scope)
backward_net.run(scope, dev_ctx)
optimize_net.run(scope, dev_ctx)
if batch_id % 100 == 0:
print("pass[" + str(pass_id) + "] batch_id[" + str(batch_id) + "]")
test(cost)
batch_id = batch_id + 1
...@@ -64,7 +64,8 @@ class OpTestMeta(type): ...@@ -64,7 +64,8 @@ class OpTestMeta(type):
actual = numpy.array(scope.find_var(out_name).get_tensor()) actual = numpy.array(scope.find_var(out_name).get_tensor())
expect = self.outputs[out_name] expect = self.outputs[out_name]
self.assertTrue( self.assertTrue(
numpy.allclose(actual, expect), numpy.allclose(
actual, expect, atol=1e-05),
"output name: " + out_name + "has diff") "output name: " + out_name + "has diff")
obj.test_all = test_all obj.test_all = test_all
......
...@@ -8,9 +8,8 @@ class TestCrossEntropy(unittest.TestCase): ...@@ -8,9 +8,8 @@ class TestCrossEntropy(unittest.TestCase):
__metaclass__ = OpTestMeta __metaclass__ = OpTestMeta
def setUp(self): def setUp(self):
# TODO this unit test is not passed
self.type = "onehot_cross_entropy" self.type = "onehot_cross_entropy"
batch_size = 100 batch_size = 30
class_num = 10 class_num = 10
X = numpy.random.random((batch_size, class_num)).astype("float32") X = numpy.random.random((batch_size, class_num)).astype("float32")
label = 5 * numpy.ones(batch_size).astype("int32") label = 5 * numpy.ones(batch_size).astype("int32")
...@@ -22,9 +21,9 @@ class TestCrossEntropy(unittest.TestCase): ...@@ -22,9 +21,9 @@ class TestCrossEntropy(unittest.TestCase):
class CrossEntropyGradOpTest(GradientChecker): class CrossEntropyGradOpTest(GradientChecker):
def test_softmax_grad(self): def test_check_grad(self):
op = create_op("onehot_cross_entropy") op = create_op("onehot_cross_entropy")
batch_size = 100 batch_size = 30
class_num = 10 class_num = 10
inputs = { inputs = {
"X": numpy.random.uniform( "X": numpy.random.uniform(
......
import unittest
from op_test_util import OpTestMeta
from gradient_checker import GradientChecker, create_op
import numpy
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator
class TestGatherOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "gather"
xnp = numpy.random.random((10, 20)).astype("float32")
self.inputs = {
'X': xnp,
'Index': numpy.array([1, 3, 5]).astype("int32")
}
self.outputs = {'Out': self.inputs['X'][self.inputs['Index']]}
class TestGatherGradOp(GradientChecker):
def test_gather_grad(self):
op = create_op("gather")
xnp = numpy.random.random((10, 20)).astype("float32")
inputs = {'X': xnp, 'Index': numpy.array([1, 3, 5]).astype("int32")}
self.check_grad(op, inputs, set("X"), "Out")
if __name__ == "__main__":
unittest.main()
import unittest
import numpy as np
from op_test_util import OpTestMeta
from gradient_checker import GradientChecker, create_op
class TestSigmoidOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = 'lookup_table'
table = np.random.random((17, 31)).astype('float32')
ids = np.random.randint(0, 17, 4).astype('int32')
self.inputs = {'W': table, 'Ids': ids}
self.outputs = {'Out': table[ids]}
class TestSigmoidGradOp(GradientChecker):
def test_grad(self):
op = create_op('lookup_table')
table = np.random.random((17, 31)).astype('float32')
ids = np.random.randint(0, 17, 4).astype('int32')
inputs = {'W': table, 'Ids': ids}
# comapre gradients
self.compare_grad(op, inputs, set(['Ids']))
# check gradients
self.check_grad(op, inputs, set('W'), 'Out')
if __name__ == '__main__':
unittest.main()
import unittest
import numpy as np
from gradient_checker import GradientChecker, create_op
from op_test_util import OpTestMeta
class MinusOpTest(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "minus"
self.inputs = {
'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((32, 84)).astype("float32")
}
self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])}
class MinusGradTest(GradientChecker):
def test_left(self):
op = create_op("minus")
inputs = {
"X": np.random.random((10, 10)).astype("float32"),
"Y": np.random.random((10, 10)).astype("float32")
}
self.check_grad(op, inputs, ["X", 'Y'], "Out")
if __name__ == '__main__':
unittest.main()
...@@ -6,8 +6,8 @@ import unittest ...@@ -6,8 +6,8 @@ import unittest
def fc(X, W, Y): def fc(X, W, Y):
ret_v = core.Net.create() ret_v = core.Net.create()
ret_v.add_op(Operator("mul", X="X", Y="W", Out="pre_activation")) ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
ret_v.add_op(Operator("sigmoid", X="pre_activation", Y=Y)) ret_v.append_op(Operator("sigmoid", X="pre_activation", Y=Y))
ret_v.complete_add_op(True) ret_v.complete_add_op(True)
return ret_v return ret_v
...@@ -16,12 +16,12 @@ class TestNet(unittest.TestCase): ...@@ -16,12 +16,12 @@ class TestNet(unittest.TestCase):
def test_net_all(self): def test_net_all(self):
net = core.Net.create() net = core.Net.create()
op1 = Operator("add_two", X="X", Y="Y", Out="Out") op1 = Operator("add_two", X="X", Y="Y", Out="Out")
net.add_op(op1) net.append_op(op1)
net2 = core.Net.create() net2 = core.Net.create()
net2.add_op(fc(X="X", W="w", Y="fc.out")) net2.append_op(fc(X="X", W="w", Y="fc.out"))
net2.complete_add_op(True) net2.complete_add_op(True)
net.add_op(net2) net.append_op(net2)
net.complete_add_op(True) net.complete_add_op(True)
expected = ''' expected = '''
......
...@@ -150,7 +150,7 @@ class TestRecurrentOp(unittest.TestCase): ...@@ -150,7 +150,7 @@ class TestRecurrentOp(unittest.TestCase):
sig_op = Operator("sigmoid", X="sum", Y="h@alias") sig_op = Operator("sigmoid", X="sum", Y="h@alias")
for op in [x_fc_op, h_fc_op, sum_op, sig_op]: for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
stepnet.add_op(op) stepnet.append_op(op)
stepnet.complete_add_op(True) stepnet.complete_add_op(True)
self.rnnop.set_stepnet(stepnet) self.rnnop.set_stepnet(stepnet)
......
...@@ -20,7 +20,7 @@ class RowwiseAddGradOpTest(GradientChecker): ...@@ -20,7 +20,7 @@ class RowwiseAddGradOpTest(GradientChecker):
def test_rowwise_add(self): def test_rowwise_add(self):
op = create_op("rowwise_add") op = create_op("rowwise_add")
inputs = { inputs = {
"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32"), "X": np.random.uniform(0.1, 1, [5, 10]).astype("float32"),
"b": np.random.uniform(0.1, 1, [10]).astype("float32") "b": np.random.uniform(0.1, 1, [10]).astype("float32")
} }
self.check_grad(op, inputs, set(["X", "b"]), "Out") self.check_grad(op, inputs, set(["X", "b"]), "Out")
......
import unittest
from op_test_util import OpTestMeta
from gradient_checker import GradientChecker, create_op
import numpy as np
from paddle.v2.framework.op import Operator
class IdentityTest(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "identity"
self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
self.outputs = {'Out': self.inputs['X']}
class IdentityGradOpTest(GradientChecker):
def test_normal(self):
op = create_op("identity")
inputs = {"X": np.random.random((10, 10)).astype("float32")}
self.check_grad(op, inputs, set("X"), "Out")
class ScaleTest(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "scale"
self.inputs = {'X': np.random.random((32, 784)).astype("float32")}
self.attrs = {'scale': -2.3}
self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}
class ScaleGradTest(GradientChecker):
def test_normal(self):
op = Operator("scale", X="X", Out="Out", scale=3.2)
self.check_grad(op,
{"X": np.random.random((10, 10)).astype("float32")},
set("X"), "Out")
if __name__ == '__main__':
unittest.main()
import unittest
from op_test_util import OpTestMeta
from gradient_checker import GradientChecker, create_op
import numpy
import paddle.v2.framework.core as core
from paddle.v2.framework.op import Operator
class TestScatterOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "scatter"
ref_np = numpy.ones((3, 3)).astype("float32")
index_np = numpy.array([1, 2]).astype("int32")
updates_np = numpy.random.random((2, 3)).astype("float32")
output_np = numpy.copy(ref_np)
output_np[index_np] += updates_np
self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
self.outputs = {'Out': output_np}
class TestScatterGradOp(GradientChecker):
def test_scatter_grad(self):
op = create_op("scatter")
# test data setup
ref_np = numpy.ones((3, 10)).astype("float32")
index_np = numpy.array([1, 2]).astype("int32")
updates_np = numpy.random.random((2, 10)).astype("float32")
output_np = numpy.copy(ref_np)
output_np[index_np] += updates_np
inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
self.check_grad(
op, inputs, set(["Updates", "Ref"]), "Out", in_place=True)
if __name__ == "__main__":
unittest.main()
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import numpy as np import numpy as np
from paddle.proto.ParameterConfig_pb2 import ParameterConfig from paddle.proto.ParameterConfig_pb2 import ParameterConfig
from collections import OrderedDict
import paddle.trainer.config_parser as cp import paddle.trainer.config_parser as cp
import struct import struct
import tarfile import tarfile
...@@ -42,9 +43,25 @@ def create(layers): ...@@ -42,9 +43,25 @@ def create(layers):
class Parameters(object): class Parameters(object):
""" """
Parameters is a dictionary contains Paddle's parameter. The key of `Parameters` manages all the learnable parameters in a neural network.
Parameters is the name of parameter. The value of Parameters is a plain It stores parameters' information in an OrderedDict. The key is
:code:`numpy.ndarry` . the name of a parameter, and value is a parameter's configuration(in
protobuf format), such as initialization mean and std, its size, whether it
is a static parameter, and so on.
:param __param_conf__: store the configurations of learnable parameters in
the network in an OrderedDict. Parameter is added one by one into the
dict by following their created order in the network: parameters of
the previous layers in a network are careted first. You can visit the
parameters from bottom to top by iterating over this dict.
:type __param_conf__: OrderedDict
:param __gradient_machines__: all of the parameters in a neural network are
appended to a PaddlePaddle gradient machine, which is used internally to
copy parameter values between C++ and Python end.
:type __gradient_machines__: list
:param __tmp_params__: a dict to store dummy parameters if no
__gradient_machines__ is appended to `Parameters`.
:type __tmp_params__: dict
Basically usage is Basically usage is
...@@ -62,7 +79,7 @@ class Parameters(object): ...@@ -62,7 +79,7 @@ class Parameters(object):
""" """
def __init__(self): def __init__(self):
self.__param_conf__ = dict() self.__param_conf__ = OrderedDict()
self.__gradient_machines__ = [] self.__gradient_machines__ = []
self.__tmp_params__ = dict() self.__tmp_params__ = dict()
...@@ -231,6 +248,9 @@ class Parameters(object): ...@@ -231,6 +248,9 @@ class Parameters(object):
:rtype: np.ndarray :rtype: np.ndarray
""" """
import py_paddle.swig_paddle as api import py_paddle.swig_paddle as api
if self.__param_conf__[key].is_static:
return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
return self.__getter_inner(key, api.PARAMETER_GRADIENT) return self.__getter_inner(key, api.PARAMETER_GRADIENT)
def set(self, parameter_name, value): def set(self, parameter_name, value):
...@@ -250,7 +270,7 @@ class Parameters(object): ...@@ -250,7 +270,7 @@ class Parameters(object):
append gradient machine to parameters. This method is used internally in append gradient machine to parameters. This method is used internally in
Trainer.train. Trainer.train.
:param gradient_machine: Paddle C++ GradientMachine object. :param gradient_machine: PaddlePaddle C++ GradientMachine object.
:type gradient_machine: api.GradientMachine :type gradient_machine: api.GradientMachine
:return: :return:
""" """
......
...@@ -141,12 +141,13 @@ class CostLayerTest(unittest.TestCase): ...@@ -141,12 +141,13 @@ class CostLayerTest(unittest.TestCase):
cost8 = layer.rank_cost(left=score, right=score, label=score) cost8 = layer.rank_cost(left=score, right=score, label=score)
cost9 = layer.lambda_cost(input=inference, score=score) cost9 = layer.lambda_cost(input=inference, score=score)
cost10 = layer.sum_cost(input=inference) cost10 = layer.sum_cost(input=inference)
cost11 = layer.huber_cost(input=score, label=label) cost11 = layer.huber_regression_cost(input=score, label=label)
cost12 = layer.huber_classification_cost(input=score, label=label)
print layer.parse_network([cost1, cost2]) print layer.parse_network([cost1, cost2])
print layer.parse_network([cost3, cost4]) print layer.parse_network([cost3, cost4])
print layer.parse_network([cost5, cost6]) print layer.parse_network([cost5, cost6])
print layer.parse_network([cost7, cost8, cost9, cost10, cost11]) print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
crf = layer.crf(input=inference, label=label) crf = layer.crf(input=inference, label=label)
crf_decoding = layer.crf_decoding(input=inference, size=3) crf_decoding = layer.crf_decoding(input=inference, size=3)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册