Commit 285cf611 authored by qiaolongfei

Merge branch 'develop' of https://github.com/PaddlePaddle/book into little-fix-in-mnist

@@ -4,3 +4,5 @@ pandoc.template
.DS_Store
.idea
py_env*
*.ipynb
build
@@ -34,7 +34,7 @@
- id: convert-markdown-into-html
name: convert-markdown-into-html
description: Convert README.md into index.html and README.en.md into index.en.html
entry: python .pre-commit-hooks/convert_markdown_into_html.py
language: system
files: .+README(\.en)?\.md$
#!/bin/bash
cur_path="$(cd "$(dirname "$0")" && pwd -P)"
cd $cur_path/../
#convert md to ipynb
.tools/convert-markdown-into-ipynb-and-test.sh
paddle_tag=0.10.0rc2
book_tag=latest
#generate docker file
if [ ${USE_UBUNTU_REPO_MIRROR} ]; then
update_mirror_cmd="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
else
update_mirror_cmd="\\"
fi
#build docker image
echo "paddle_tag:"$paddle_tag
echo "book_tag:"$book_tag
cat > Dockerfile <<EOF
FROM paddlepaddle/paddle:${paddle_tag}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
COPY . /book
RUN pip install -U nltk \
&& python /book/.tools/cache_dataset.py
RUN ${update_mirror_cmd}
apt-get update && \
apt-get install -y locales && \
apt-get -y install gcc && \
apt-get -y clean && \
localedef -f UTF-8 -i en_US en_US.UTF-8 && \
pip install -U matplotlib jupyter numpy requests scipy
EXPOSE 8888
CMD ["sh", "-c", "jupyter notebook --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.disable_check_xsrf=True /book/"]
EOF
docker build --no-cache -t paddlepaddle/book:${paddle_tag} -t paddlepaddle/book:${book_tag} .
#!/usr/bin/env python
import paddle.v2.dataset as dataset
import nltk
#cifar
dataset.common.download(dataset.cifar.CIFAR100_URL, 'cifar',
dataset.cifar.CIFAR100_MD5)
dataset.common.download(dataset.cifar.CIFAR10_URL, 'cifar',
dataset.cifar.CIFAR10_MD5)
# Cache conll05
dataset.common.download(dataset.conll05.WORDDICT_URL, 'conll05st', \
dataset.conll05.WORDDICT_MD5)
dataset.common.download(dataset.conll05.VERBDICT_URL, 'conll05st', \
dataset.conll05.VERBDICT_MD5)
dataset.common.download(dataset.conll05.TRGDICT_URL, 'conll05st', \
dataset.conll05.TRGDICT_MD5)
dataset.common.download(dataset.conll05.EMB_URL, 'conll05st',
dataset.conll05.EMB_MD5)
dataset.common.download(dataset.conll05.DATA_URL, 'conll05st',
dataset.conll05.DATA_MD5)
# Cache imdb
dataset.common.download(dataset.imdb.URL, "imdb", dataset.imdb.MD5)
# Cache imikolov
dataset.common.download(dataset.imikolov.URL, "imikolov", dataset.imikolov.MD5)
# Cache movielens
dataset.common.download('http://files.grouplens.org/datasets/movielens/ml-1m.zip',\
'movielens','c4d9eecfca2ab87c1945afe126590906')
# Cache nltk
nltk.download('movie_reviews', download_dir=dataset.common.DATA_HOME)
# Cache uci housing
dataset.common.download(dataset.uci_housing.URL, "uci_housing", \
dataset.uci_housing.MD5)
# Cache wmt14
dataset.common.download(dataset.wmt14.URL_TRAIN, "wmt14",\
dataset.wmt14.MD5_TRAIN)
#mnist
dataset.common.download(dataset.mnist.TRAIN_IMAGE_URL, 'mnist',
dataset.mnist.TRAIN_IMAGE_MD5)
dataset.common.download(dataset.mnist.TRAIN_LABEL_URL, 'mnist',
dataset.mnist.TRAIN_LABEL_MD5)
dataset.common.download(dataset.mnist.TEST_IMAGE_URL, 'mnist',
dataset.mnist.TEST_IMAGE_MD5)
dataset.common.download(dataset.mnist.TEST_LABEL_URL, 'mnist',
dataset.mnist.TEST_LABEL_MD5)
@@ -5,14 +5,14 @@ if [ $? -ne 0 ]; then
exit 1
fi
export GOPATH=~/go; go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
cur_path="$(cd "$(dirname "$0")" && pwd -P)"
cd $cur_path/../
#convert md to ipynb
for file in */{README,README\.en}.md ; do
~/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
if [ $? -ne 0 ]; then
echo >&2 "markdown-to-ipynb $file error"
exit 1
......
@@ -19,7 +19,7 @@ before_install:
- pip install -U virtualenv pre-commit pip
- GOPATH=/tmp/go go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
script:
- PATH=/tmp/go/bin:$PATH .travis/precommit.sh
notifications:
email:
on_success: change
......
@@ -189,7 +189,7 @@
" size=1,\n",
" act=paddle.activation.Linear())\n",
"y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))\n",
"cost = paddle.layer.mse_cost(input=y_predict, label=y)\n"
],
"outputs": [
{
......
@@ -132,7 +132,7 @@ y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
```
### Create Parameters
......
@@ -183,7 +183,7 @@
" size=1,\n",
" act=paddle.activation.Linear())\n",
"y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))\n",
"cost = paddle.layer.mse_cost(input=y_predict, label=y)\n"
],
"outputs": [
{
......
@@ -126,7 +126,7 @@ y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
```
### Create Parameters
......
@@ -174,7 +174,7 @@ y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
```
### Create Parameters
......
@@ -168,7 +168,7 @@ y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
```
### Create Parameters
......
@@ -10,7 +10,7 @@ def main():
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
......
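The `mse_cost` layer above is the new name for the old `regression_cost`. For orientation, a minimal sketch of the optimizer and trainer wiring that typically follows this snippet is shown below; the optimizer settings are illustrative assumptions, not part of this commit:

```python
# Hedged sketch (assumes `import paddle.v2 as paddle` and the `cost`/`parameters`
# objects defined above); the hyperparameter values are placeholders.
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)
```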
@@ -148,9 +148,9 @@ Figure 10. ResNet model for ImageNet
## Dataset
Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc. Those used for fine-grained image classification are [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among these, the ImageNet dataset is the largest. Most research results are reported on ImageNet as mentioned in the Model Overview section. Since 2010, the ImageNet dataset has gone through some changes. The commonly used ImageNet-2012 dataset contains 1000 categories. There are 1,281,167 training images, ranging from 732 to 1300 images per category, and 50,000 validation images with 50 images per category on average.
Since ImageNet is too large to be downloaded and trained efficiently, we use [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) in this tutorial. The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. Figure 11 shows all the classes in CIFAR-10 as well as 10 images randomly sampled from each category.
<p align="center">
<img src="image/cifar.png" width="350"><br/>
@@ -185,7 +185,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
1. Define input data and its dimension
The input to the network is defined as `paddle.layer.data`, or image pixels in the context of image classification. The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10.
```python
datadim = 3 * 32 * 32
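# (Illustrative continuation, not part of this commit.) With datadim = 3072 and
# 10 categories as described above, the image input layer could be declared
# roughly like this, using the paddle.v2 API shown elsewhere in this diff:
classdim = 10
image = paddle.layer.data(
    name="image", type=paddle.data_type.dense_vector(datadim))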
@@ -199,7 +199,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
```python
net = vgg_bn_drop(image)
```
The input to VGG main module is from the data layer. `vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail:
```python
def vgg_bn_drop(input):
@@ -232,17 +232,15 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
return fc2
```
2.1. First, define a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
2.2. Five groups of convolutions. The first two groups perform two convolutions, while the last three groups perform three convolutions. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for this layer.
2.3. The last two layers are fully-connected layers of dimension 512.
3. Define Classifier
The above VGG network extracts high-level features and maps them to a vector of the same size as the categories. Softmax function or classifier is then used for calculating the probability of the image belonging to each category.
```python
out = paddle.layer.fc(input=net,
@@ -252,7 +250,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
4. Define Loss Function and Outputs
In the context of supervised learning, labels of training images are defined in `paddle.layer.data` as well. During training, the cross-entropy loss function is used and the loss is the output of the network. During testing, the outputs are the probabilities calculated in the classifier.
```python
lbl = paddle.layer.data(
......
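To connect the fragments quoted above, here is a hedged sketch of how the softmax classifier (step 3) and the cross-entropy loss (step 4) could be wired together with the paddle.v2 API used in this chapter; the layer names and sizes are illustrative assumptions:

```python
# Illustrative only; assumes `net` produced by vgg_bn_drop(image) as above.
out = paddle.layer.fc(input=net,
                      size=10,
                      act=paddle.activation.Softmax())
lbl = paddle.layer.data(
    name="label", type=paddle.data_type.integer_value(10))
cost = paddle.layer.classification_cost(input=out, label=lbl)
```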
@@ -135,7 +135,7 @@ ResNet (Residual Network) \[[15](#参考文献)\], winner of the 2015 ImageNet image classification challenge
## Data Preparation
Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc.; commonly used fine-grained image classification datasets include [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among these, ImageNet is relatively large; as discussed in the [Model Overview](#模型概览) section, a large body of research results is based on ImageNet. The ImageNet data has changed slightly since 2010; the commonly used ImageNet-2012 dataset contains 1000 categories: the training set has 1,281,167 images, ranging from 732 to 1300 images per category, and the validation set has 50,000 images, 50 per category on average.
Since the ImageNet dataset is large and slow to download and train on, we use the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset in this tutorial. CIFAR10 contains 60,000 32x32 color images in 10 classes, 6,000 images per class, with 50,000 images for training and 10,000 for testing. Figure 11 shows all the classes along with 10 images randomly sampled from each category.
@@ -220,7 +220,7 @@ paddle.init(use_gpu=False, trainer_count=1)
return fc2
```
2.1. First, define a group of convolutional networks, i.e., conv_block. The convolution kernel is 3x3 and the pooling window is 2x2 with stride 2. `groups` determines the number of consecutive convolution operations in each VGG block, and `dropouts` specifies the dropout probability of each of them. The `img_conv_group` used here is a module predefined in `paddle.networks`, consisting of several groups of Conv->BN->ReLu->Dropout and one Pooling.
2.2. Five groups of convolutions, i.e., five conv_blocks. The first and second groups perform two consecutive convolutions, while the third, fourth, and fifth groups perform three. The dropout probability after the last convolution in each group is 0, i.e., no dropout is applied there.
......
@@ -190,9 +190,9 @@ Figure 10. ResNet model for ImageNet
## Dataset
Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc. Those used for fine-grained image classification are [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among these, the ImageNet dataset is the largest. Most research results are reported on ImageNet as mentioned in the Model Overview section. Since 2010, the ImageNet dataset has gone through some changes. The commonly used ImageNet-2012 dataset contains 1000 categories. There are 1,281,167 training images, ranging from 732 to 1300 images per category, and 50,000 validation images with 50 images per category on average.
Since ImageNet is too large to be downloaded and trained efficiently, we use [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) in this tutorial. The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. Figure 11 shows all the classes in CIFAR-10 as well as 10 images randomly sampled from each category.
<p align="center">
<img src="image/cifar.png" width="350"><br/>
@@ -227,7 +227,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
1. Define input data and its dimension
The input to the network is defined as `paddle.layer.data`, or image pixels in the context of image classification. The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10.
```python
datadim = 3 * 32 * 32
@@ -241,7 +241,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
```python
net = vgg_bn_drop(image)
```
The input to VGG main module is from the data layer. `vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail:
```python
def vgg_bn_drop(input):
@@ -274,17 +274,15 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
return fc2
```
2.1. First, define a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
2.2. Five groups of convolutions. The first two groups perform two convolutions, while the last three groups perform three convolutions. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for this layer.
2.3. The last two layers are fully-connected layers of dimension 512.
3. Define Classifier
The above VGG network extracts high-level features and maps them to a vector of the same size as the categories. Softmax function or classifier is then used for calculating the probability of the image belonging to each category.
```python
out = paddle.layer.fc(input=net,
@@ -294,7 +292,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
4. Define Loss Function and Outputs
In the context of supervised learning, labels of training images are defined in `paddle.layer.data` as well. During training, the cross-entropy loss function is used and the loss is the output of the network. During testing, the outputs are the probabilities calculated in the classifier.
```python
lbl = paddle.layer.data(
......
@@ -177,7 +177,7 @@ ResNet (Residual Network) \[[15](#参考文献)\], winner of the 2015 ImageNet image classification challenge
## Data Preparation
Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc.; commonly used fine-grained image classification datasets include [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among these, ImageNet is relatively large; as discussed in the [Model Overview](#模型概览) section, a large body of research results is based on ImageNet. The ImageNet data has changed slightly since 2010; the commonly used ImageNet-2012 dataset contains 1000 categories: the training set has 1,281,167 images, ranging from 732 to 1300 images per category, and the validation set has 50,000 images, 50 per category on average.
Since the ImageNet dataset is large and slow to download and train on, we use the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset in this tutorial. CIFAR10 contains 60,000 32x32 color images in 10 classes, 6,000 images per class, with 50,000 images for training and 10,000 for testing. Figure 11 shows all the classes along with 10 images randomly sampled from each category.
@@ -262,7 +262,7 @@ paddle.init(use_gpu=False, trainer_count=1)
return fc2
```
2.1. First, define a group of convolutional networks, i.e., conv_block. The convolution kernel is 3x3 and the pooling window is 2x2 with stride 2. `groups` determines the number of consecutive convolution operations in each VGG block, and `dropouts` specifies the dropout probability of each of them. The `img_conv_group` used here is a module predefined in `paddle.networks`, consisting of several groups of Conv->BN->ReLu->Dropout and one Pooling.
2.2. Five groups of convolutions, i.e., five conv_blocks. The first and second groups perform two consecutive convolutions, while the third, fourth, and fifth groups perform three. The dropout probability after the last convolution in each group is 0, i.e., no dropout is applied there.
......
@@ -334,7 +334,7 @@ def event_handler(event):
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
......
@@ -376,7 +376,7 @@ def event_handler(event):
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
......
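Both hunks above replace the old `reader_dict` argument of `trainer.test` with `feeding`. A minimal, self-contained sketch of such an event handler is shown below; the `feeding` mapping, layer names, and logging interval are placeholders, not taken from this commit:

```python
# Hedged sketch; assumes `import sys`, `import paddle.v2 as paddle`, and a
# `trainer` plus `test_reader` constructed as in the tutorials above.
feeding = {'image': 0, 'label': 1}  # placeholder mapping: layer name -> reader column

def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "\nPass %d, Batch %d, Cost %f" % (
                event.pass_id, event.batch_id, event.cost)
        else:
            sys.stdout.write('.')
            sys.stdout.flush()
    if isinstance(event, paddle.event.EndPass):
        result = trainer.test(reader=test_reader, feeding=feeding)
        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```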
# Semantic Role Labeling
The source code of this chapter is located at [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/label_semantic_roles).
For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
## Background
Natural language analysis techniques consist of lexical, syntactic, and semantic analysis. **Semantic Role Labeling (SRL)** is an instance of **Shallow Semantic Analysis**.
In a sentence, a **predicate** states a property or a characterization of a *subject*, such as what it does and what it is like. The predicate represents the core of an event, whereas the words accompanying the predicate are **arguments**. A **semantic role** refers to the abstract role an argument of a predicate takes on in the event, including *agent*, *patient*, *theme*, *experiencer*, *beneficiary*, *instrument*, *location*, *goal*, and *source*.
In the following example of a Chinese sentence, "to encounter" is the predicate (*pred*); "Ming" is the *agent*; "Hong" is the *patient*; "yesterday" and "evening" are the *time*; finally, "the park" is the *location*.
$$\mbox{[小明 Ming]}_{\mbox{Agent}}\mbox{[昨天 yesterday]}_{\mbox{Time}}\mbox{[晚上 evening]}_\mbox{Time}\mbox{在[公园 a park]}_{\mbox{Location}}\mbox{[遇到 to encounter]}_{\mbox{Predicate}}\mbox{了[小红 Hong]}_{\mbox{Patient}}\mbox{。}$$
Instead of analyzing the semantic information, **Semantic Role Labeling** (**SRL**) identifies the relation between the predicate and the other constituents surrounding it. The predicate-argument structures are labeled as specific semantic roles, which serves as an important intermediate step in a wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deep QA*. Research usually assumes a predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
Conventional SRL systems mostly build on top of syntactic analysis, usually consisting of five steps:
1. Construct a syntax tree, as shown in Fig. 1
2. Identify the candidate arguments of the given predicate on the tree.
3. Prune the most unlikely candidate arguments.
4. Identify the real arguments, often by a binary classifier.
5. Multi-classify on results from step 4 to label the semantic roles. Steps 2 and 3 usually introduce hand-designed features based on syntactic analysis (step 1).
<div align="center">
<img src="image/dependency_parsing_en.png" width = "80%" align=center /><br>
Fig 1. Syntax tree
</div>
However, a complete syntactic analysis requires identifying the relation among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis*, a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunks). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduce SRL to a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using the **BIO representation**. For syntactic chunks forming role A, the first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out receive the tag O.
The BIO representation of the above example is shown in Fig. 2.
<div align="center">
<img src="image/bio_example_en.png" width = "90%" align=center /><br>
Fig 2. BIO representation
</div>
This example illustrates the simplicity of sequence tagging, since
1. It relies only on shallow syntactic analysis, which lowers the precision requirement on the syntactic analysis;
2. Pruning the candidate arguments is no longer necessary;
3. Arguments are identified and tagged at the same time. Simplifying the workflow reduces the risk of accumulating errors; oftentimes, methods that unify multiple steps boost performance.
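To make the tagging scheme concrete, here is a purely illustrative example, invented for this note rather than taken from the chapter's dataset, of how one predicate-argument structure maps to BIO tags:

```python
# Illustrative BIO tags for one predicate ("saw"); A0/A1/AM-TMP are
# PropBank-style role names of the kind used later in this chapter.
words = ['Mr.', 'Smith', 'saw', 'the', 'dog', 'yesterday']
tags  = ['B-A0', 'I-A0', 'B-V', 'B-A1', 'I-A1', 'B-AM-TMP']
```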
In this tutorial, our SRL system is built as an end-to-end system via a neural network. The system takes only text sequences as input, without using any syntactic parsing results or complex hand-designed features. The public dataset [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) is used for the following task: given a sentence with predicates marked, identify the corresponding arguments and their semantic roles through sequence tagging.
## Model
**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs, LSTMs aim to model long-term dependency in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
### Stacked Recurrent Neural Network
*Deep Neural Networks* can extract hierarchical representations. The higher layers can form relatively abstract/complex representations, based on primitive features discovered through the lower layers. Unfolding LSTMs through time results in a deep feed-forward neural network. This is because any computational path between the input at time $k < t$ to the output at time $t$ crosses several nonlinear layers. On the other hand, due to parameter sharing over time, LSTMs are also *shallow*; that is, the computation carried out at each time-step is just a linear transformation. Deep LSTM networks are typically constructed by stacking multiple LSTM layers on top of each other and taking the output from the lower LSTM layer at time $t$ as the input of the upper LSTM layer at time $t$. Deep, hierarchical neural networks can be efficient at representing some functions and modeling varying-length dependencies\[[2](#Reference)\].
However, in a deep LSTM network, any gradient propagated back in depth needs to traverse a large number of nonlinear steps. As a result, while LSTMs of up to 4 layers can be trained properly, deeper ones of 4-8 layers perform much worse. Conventional LSTMs prevent back-propagated errors from vanishing or exploding by introducing shortcut connections to skip the intermediate nonlinear layers. Therefore, deep LSTMs can consider shortcut connections in depth as well.
A single LSTM cell has three operations:
1. input-to-hidden: map input $x$ to the input of the forget gates, input gates, memory cells and output gates by linear transformation (i.e., matrix mapping);
2. hidden-to-hidden: calculate forget gates, input gates, output gates, and update the memory cell; this is the main part of LSTMs;
3. hidden-to-output: this part typically involves an activation operation on hidden states.
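For reference, these three operations correspond to the standard (textbook) LSTM cell equations, which are not part of the original text; here $\sigma$ is the sigmoid function and $\odot$ denotes element-wise multiplication:

$$
\begin{aligned}
i_t &= \sigma(W_{xi}x_t + W_{hi}h_{t-1} + b_i)\\
f_t &= \sigma(W_{xf}x_t + W_{hf}h_{t-1} + b_f)\\
o_t &= \sigma(W_{xo}x_t + W_{ho}h_{t-1} + b_o)\\
c_t &= f_t \odot c_{t-1} + i_t \odot \tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c)\\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$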
Based on the stacked LSTMs, we add shortcut connections: take the input-to-hidden from the previous layer as a new input and learn another linear transformation.
Fig.3 illustrates the final stacked recurrent neural networks.
<p align="center">
<img src="./image/stacked_lstm_en.png" width = "40%" align=center><br>
@@ -65,9 +77,9 @@ Fig 3. Stacked Recurrent Neural Networks
### Bidirectional Recurrent Neural Network
While LSTMs can summarize the history -- all the previous input seen up until now -- they cannot see the future. Because most NLP (natural language processing) tasks provide the entirety of sentences, sequential learning can benefit from having the future encoded as well as the history.
To address this, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in the reverse direction with regard to its immediate lower LSTM layer, i.e., deep LSTM layers take turns to train on input sequences from left-to-right and right-to-left. Therefore, LSTM layers at time-step $t$ can see both histories and the future, starting from the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
<p align="center">
@@ -75,16 +87,16 @@ To address the above drawbacks, we can design bidirectional recurrent neural net
Fig 4. Bidirectional LSTMs
</p>
Note that this bidirectional RNN is different from the one proposed by Bengio et al. for machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNN in the chapter on [machine translation](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.en.md).
### Conditional Random Field (CRF)
Typically, a neural network's lower layers learn representations while its very top layer learns the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes as input the representations provided by the last LSTM layer.
The CRF is an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. In essence, CRFs learn the conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ are sequences of input and $Y = (y_1, y_2, ... , y_n)$ are label sequences; to decode, simply search through $Y$ for a sequence that maximizes the conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$.
Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graph model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig.5.
<p align="center">
<img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
@@ -96,43 +108,43 @@ By the fundamental theorem of random fields \[[5](#Reference)\], the joint distr
$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$
where $Z(X)$ is the normalization constant, ${t_j}$ represents the feature functions defined on edges, called the *transition feature*, which denotes the transition probabilities from $y_{i-1}$ to $y_i$ given input sequence $X$. ${s_k}$ represents the feature function defined on nodes, called the state feature, denoting the probability of $y_i$ given input sequence $X$. In addition, $\lambda_j$ and $\mu_k$ are weights corresponding to $t_j$ and $s_k$. Alternatively, $t$ and $s$ can be written in the same form that depends on $y_{i - 1}$, $y_i$, $X$, and $i$. Taking its summation over all nodes $i$, we have: $f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$, which defines the *feature function* $f$. Thus, $P(Y|X)$ can be written as:
$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$
where $\omega$ are the weights to the feature function that the CRF learns. While training, given input sequences and label sequences $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, by maximum likelihood estimation (**MLE**), we construct the following objective function:
$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$
This objective function can be solved via back-propagation in an end-to-end manner. While decoding, given input sequences $X$, search for sequence $\bar{Y}$ to maximize the conditional probability $\bar{P}(Y|X)$ via decoding methods (such as *Viterbi*, or the [Beam Search Algorithm](https://github.com/PaddlePaddle/book/blob/develop/07.machine_translation/README.en.md#Beam%20Search%20Algorithm)).
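As a rough sketch of how this is typically expressed with the paddle.v2 API used in this book (the `feature` and `target` layers and `label_dict_len` below are placeholders): a `crf` layer provides the training cost, and a `crf_decoding` layer sharing the same weights performs Viterbi decoding at prediction time.

```python
# Hedged sketch; `feature`, `target`, and `label_dict_len` are assumed to be
# defined by the surrounding network code.
crf_cost = paddle.layer.crf(size=label_dict_len,
                            input=feature,
                            label=target,
                            param_attr=paddle.attr.Param(name='crfw'))

crf_dec = paddle.layer.crf_decoding(size=label_dict_len,
                                    input=feature,
                                    param_attr=paddle.attr.Param(name='crfw'))
```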
### Deep Bidirectional LSTM (DB-LSTM) SRL model
Given predicates and a sentence, SRL tasks aim to identify arguments of the given predicate and their semantic roles. If a sequence has $n$ predicates, we will process this sequence $n$ times. Here is the breakdown of a straight-forward model:
1. Construct inputs;
- input 1: predicate, input 2: sentence
- expand input 1 into a sequence of the same length as input 2 (the sentence), using one-hot representation;
2. Convert the one-hot sequences from step 1 to vector sequences via a word embedding's lookup table;
3. Learn the representation of input sequences by taking vector sequences from step 2 as inputs;
4. Take the representation from step 3 as input, label sequence as supervisory signal, and realize sequence tagging tasks.
Here, we propose some improvements by introducing two simple but effective features:
- predicate context (**ctx-p**): A single predicate word may not describe all the predicate information, especially when the same words appear multiple times in a sentence. With the expanded context, the ambiguity can be largely eliminated. Thus, we extract $n$ words before and after the predicate to construct a window chunk.
- region mark ($m_r$): The binary marker on a word, $m_r$, takes the value of $1$ when the word is in the predicate context region, and $0$ if not.
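A small, invented illustration of these two features (the sentence, window size, and variable names are placeholders, not from the original text):

```python
# With the predicate 'set' and a context window of n = 1 word on each side:
words = ['A', 'record', 'date', 'has', "n't", 'been', 'set', '.']
ctx_p = ['been', 'set', '.']          # predicate context (ctx-p)
mark  = [0, 0, 0, 0, 0, 1, 1, 1]      # region mark m_r: 1 inside the ctx-p region
```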
After these modifications, the model is as follows, as illustrated in Figure 6 (a minimal sketch of the input construction follows this list):
1. Construct inputs
- Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after the predicate. Input 4: region mark sequence, where an entry is 1 if the word is located in the predicate context region, 0 otherwise.
- expand inputs 2~3 into sequences of the same length as input 1
2. Convert inputs 1~4 to vector sequences via word embedding lookup tables; while inputs 1 and 3 share the same lookup table, inputs 2 and 4 have separate lookup tables.
3. Take the four vector sequences from step 2 as inputs to bidirectional LSTMs; train the LSTMs to update representations.
4. Take the representation from step 3 as input to the CRF, with the label sequence as supervisory signal, and complete the sequence tagging task.
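A minimal sketch of steps 1-2 above in the paddle.v2 API (dictionary sizes, layer names, and embedding dimensions are placeholders; the chapter's real code derives them from the CoNLL-05 data):

```python
# Hedged sketch of the four inputs and their embeddings;
# assumes `import paddle.v2 as paddle` and the placeholder sizes below.
word = paddle.layer.data(
    name='word_data', type=paddle.data_type.integer_value_sequence(word_dict_len))
predicate = paddle.layer.data(
    name='verb_data', type=paddle.data_type.integer_value_sequence(pred_dict_len))
ctx_0 = paddle.layer.data(
    name='ctx_0_data', type=paddle.data_type.integer_value_sequence(word_dict_len))
mark = paddle.layer.data(
    name='mark_data', type=paddle.data_type.integer_value_sequence(mark_dict_len))

# Inputs 1 and 3 can share one word-embedding table (e.g. via a shared parameter
# name), while the predicate and the region mark use their own, smaller tables.
word_emb = paddle.layer.embedding(input=word, size=word_dim)
ctx_0_emb = paddle.layer.embedding(input=ctx_0, size=word_dim)
predicate_emb = paddle.layer.embedding(input=predicate, size=word_dim)
mark_emb = paddle.layer.embedding(input=mark, size=mark_dim)
```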
<div align="center"> <div align="center">
...@@ -142,9 +154,9 @@ Fig 6. DB-LSTM for SRL tasks ...@@ -142,9 +154,9 @@ Fig 6. DB-LSTM for SRL tasks
## Data Preparation ## Data Preparation
In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. It is important to note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, if you want to train a usable neural network SRL system, consider paying for the full corpus. In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. Note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, for a usable neural network SRL system, please consider paying for the full corpus.
The original data includes a variety of information such as POS tagging, naming entity recognition, parsing tree, and so on. In this tutorial, we only use the data under the words folder (text sequence) and the props folder (label results) inside test.wsj parent folder. The data directory used in this tutorial is as follows: The original data includes a variety of information such as POS tagging, naming entity recognition, syntax tree, etc. In this tutorial, we only use the data under `test.wsj/words/` (text sequence) and `test.wsj/props/` (label results). The data directory used in this tutorial is as follows:
```text ```text
conll05st-release/ conll05st-release/
...@@ -153,9 +165,9 @@ conll05st-release/ ...@@ -153,9 +165,9 @@ conll05st-release/
└── words # 输入文本序列 └── words # 输入文本序列
``` ```
The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](# references)\]. The label of the PropBank is different from the label that we used in the example at the beginning of the article, but the principle is the same. For the description of the label, please refer to the paper \[[9](#references)\]. The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](# references)\]. The labeling of the PropBank is different from the labeling methods mentioned before, but shares with it the same underlying principle. For descriptions of the labeling, please refer to the paper \[[9](#references)\].
The raw data needs to be preprocessed before used by PaddlePaddle. The preprocessing consists of the following steps: The raw data needs to be preprocessed into formats that PaddlePaddle can handle. The preprocessing consists of the following steps:
1. Merge the text sequence and the tag sequence into the same record; 1. Merge the text sequence and the tag sequence into the same record;
2. If a sentence contains $n$ predicates, the sentence will be processed $n$ times into $n$ separate training samples, each sample with a different predicate; 2. If a sentence contains $n$ predicates, the sentence will be processed $n$ times into $n$ separate training samples, each sample with a different predicate;
...@@ -170,7 +182,7 @@ The raw data needs to be preprocessed before used by PaddlePaddle. The preproces ...@@ -170,7 +182,7 @@ The raw data needs to be preprocessed before used by PaddlePaddle. The preproces
# conll05.test gets preprocessed training instances. # conll05.test gets preprocessed training instances.
``` ```
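As a quick sanity check, one can pull a single instance from the reader and count its fields. This is a small sketch, assuming `paddle.v2.dataset.conll05` is importable as elsewhere in this tutorial; the exact column order is the one given by the `feeding` dictionary further below.

```python
import paddle.v2.dataset.conll05 as conll05

# conll05.test() returns a reader; calling the reader yields a generator over samples.
sample = next(conll05.test()())
print(len(sample))  # 9 fields: word seq, predicate context (5 columns), predicate, mark seq, labels
```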
After preprocessing, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, and label sequence. The following table is an example of a training sample.

| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence|
|---|---|---|---|---|
@@ -192,9 +204,9 @@ In addition to the data, we provide following resources:
| predicate_dict | predicate dictionary, total 3162 predicates |
| emb | a pre-trained word vector lookup table, 32-dimensional |

We trained a language model on English Wikipedia to obtain a word vector lookup table used to initialize the SRL model. The word vector lookup table is not updated while training the SRL model. For more about the language model and the word vector lookup table, please refer to the [word vector](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) tutorial. The training corpus contains 995,000,000 tokens, and the dictionary size is 4,900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not among these 4,900,000 words; we treat them all as unknown words, represented by `<unk>`.

Here we fetch the dictionaries and print their sizes:

```python
import math
@@ -214,12 +226,12 @@ print label_dict_len
print pred_len
```
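Out-of-vocabulary handling can be reproduced by hand with these dictionaries. A small sketch, assuming `word_dict` (loaded in the elided part of the block above) maps words to integer ids and contains an `<unk>` entry:

```python
UNK_IDX = word_dict.get('<unk>', 0)  # assumption: '<unk>' is present in word_dict
ids = [word_dict.get(w, UNK_IDX) for w in ['The', 'frobnicate', 'sale']]
```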
## Model Configuration

- Define input data dimensions and model hyperparameters.

```python
mark_dict_len = 2    # value range of the region mark; the mark is either 0 or 1, so the range is 2
word_dim = 32        # word vector dimension
mark_dim = 5         # dimension of the region mark embedding
hidden_dim = 512     # the dimension of the LSTM hidden layer vector is 128 (512/4)
@@ -249,9 +261,9 @@ mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
```

Note that `hidden_dim = 512` actually means an LSTM hidden vector of 128 dimensions (512/4). Please refer to PaddlePaddle's official documentation for details: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory).
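In other words, the mixed layers below produce a `hidden_dim`-wide projection that packs the pre-activations of the three gates and the cell input, so the effective LSTM cell size is a quarter of it:

```python
lstm_cell_dim = hidden_dim // 4  # 512 // 4 == 128
```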
- Transform the word sequence, the predicate, the predicate context, and the region mark sequence into embedding vector sequences.

```python
@@ -280,7 +292,7 @@ emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
```
- 8 LSTM units are trained in alternating left-to-right / right-to-left order, controlled by the `reverse` argument.

```python
hidden_0 = paddle.layer.mixed(
@@ -330,7 +342,7 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- We concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top of it to get the final vector representation.

```python
feature_out = paddle.layer.mixed(
@@ -344,7 +356,7 @@ for i in range(1, depth):
], )
```
- At the end of the network, we use the CRF as the cost function; the parameter of the CRF cost is named `crfw`.

```python
crf_cost = paddle.layer.crf(
@@ -357,7 +369,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- The CRF decoding layer is used for evaluation and inference. It shares weights with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.

```python
crf_dec = paddle.layer.crf_decoding(
@@ -384,7 +396,7 @@ We can print out parameter name. It will be generated if not specified.
print parameters.keys()
```
Now we load the pre-trained word lookup table, built from word embeddings trained on English Wikipedia.

```python
def load_parameter(file_name, h, w):
@@ -396,7 +408,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
```
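The body of `load_parameter` is elided in this excerpt. A minimal sketch, assuming the embedding file consists of a small fixed-size binary header followed by `h * w` float32 values, might look like:

```python
import numpy as np

def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # assumption: skip a 16-byte binary header
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
```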
### Create Trainer

We create the trainer given the model topology, parameters, and optimization method. We use the most basic SGD method: a momentum optimizer with 0 momentum. Meanwhile, we set the learning rate and regularization.

```python
optimizer = paddle.optimizer.Momentum(
@@ -413,7 +425,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
```

### Trainer

As mentioned in the data preparation section, we use the CoNLL 2005 test corpus as the training data set. `conll05.test()` outputs one training instance at a time; the instances are shuffled and batched into mini-batches, which serve as input.

```python
reader = paddle.batch(
@@ -421,7 +433,7 @@ reader = paddle.batch(
conll05.test(), buf_size=8192), batch_size=20)
```
`feeding` specifies the correspondence between data instance columns and data layers. For example, according to the following `feeding`, the 0th column of a data instance produced by `conll05.test()` is fed to the data layer named `word_data`.

```python
feeding = {
@@ -437,7 +449,7 @@ feeding = {
}
```

`event_handler` can be used as a callback for training events; it is passed as an argument to the `train` method. The following `event_handler` prints the cost during training.

```python
def event_handler(event):
@@ -459,7 +471,7 @@ trainer.train(
```
## Conclusion

Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to illustrate how to use PaddlePaddle for sequence tagging tasks. The models proposed are from our published paper \[[10](#Reference)\]. We only use test data for illustration, since the training data of the CoNLL 2005 dataset is not completely public. The goal is an end-to-end neural network model that depends less on natural language processing tools yet is comparable to, or even better than, traditional models in terms of performance. Please check out our paper for more information and discussion.

## Reference

1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
...
@@ -93,7 +93,7 @@ $$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$

$\omega$ are the weights of the feature functions and are the parameters learned by the CRF model. During training, for the given input sequences and the corresponding label sequence set $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, we solve the following optimization objective via regularized maximum likelihood estimation:

$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$

This objective can be optimized jointly with the whole neural network via back-propagation. During decoding, for a given input sequence $X$, a decoding algorithm (typically the Viterbi algorithm or beam search) finds the output sequence $\bar{Y}$ that maximizes the conditional probability $\bar{P}(Y|X)$.
...
@@ -42,63 +42,75 @@

<div id="markdown" style='display:none'>

# Semantic Role Labeling

The source code of this chapter is available at [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/label_semantic_roles).

For instructions on getting started with PaddlePaddle, see the [PaddlePaddle installation guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
## Background

Natural language analysis techniques consist of lexical, syntactic, and semantic analysis. **Semantic Role Labeling (SRL)** is an instance of **shallow semantic analysis**.

In a sentence, a **predicate** states a property or a characterization of a *subject*, such as what it does or what it is like. The predicate represents the core of an event, whereas the words accompanying the predicate are **arguments**. A **semantic role** refers to the abstract role an argument of a predicate takes on in the event, including *agent*, *patient*, *theme*, *experiencer*, *beneficiary*, *instrument*, *location*, *goal*, and *source*.

In the following example of a Chinese sentence, "to encounter" is the predicate (*pred*); "Ming" is the *agent*; "Hong" is the *patient*; "yesterday" and "evening" are the *time*; finally, "the park" is the *location*.

$$\mbox{[小明 Ming]}_{\mbox{Agent}}\mbox{[昨天 yesterday]}_{\mbox{Time}}\mbox{[晚上 evening]}_\mbox{Time}\mbox{在[公园 a park]}_{\mbox{Location}}\mbox{[遇到 to encounter]}_{\mbox{Predicate}}\mbox{了[小红 Hong]}_{\mbox{Patient}}\mbox{。}$$

Instead of analyzing the semantic information in depth, **Semantic Role Labeling** (**SRL**) identifies the relations between the predicate and the other constituents surrounding it, and labels the predicate-argument structures with specific semantic roles. This is an important intermediate step in a wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deepQA*. Research usually assumes the predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.

Conventional SRL systems mostly build on top of syntactic analysis and usually consist of five steps:

1. Construct a syntax tree, as shown in Fig. 1.
2. Identify the candidate arguments of the given predicate on the tree.
3. Prune the most unlikely candidate arguments.
4. Identify the real arguments, often with a binary classifier.
5. Multi-classify the results from step 4 to label the semantic roles. Steps 2 and 3 usually introduce hand-designed features based on the syntactic analysis from step 1.
<div align="center"> <div align="center">
<img src="image/dependency_parsing_en.png" width = "80%" align=center /><br> <img src="image/dependency_parsing_en.png" width = "80%" align=center /><br>
Fig 1. Syntactic parse tree Fig 1. Syntax tree
</div> </div>
However, complete syntactic analysis requires identifying the relation among all constitutes and the performance of SRL is sensitive to the precision of syntactic analysis, which makes SRL a very challenging task. To reduce the complexity and obtain some syntactic structure information, we often use shallow syntactic analysis. Shallow Syntactic Analysis is also called partial parsing or chunking. Unlike complete syntactic analysis which requires the construction of the complete parsing tree, Shallow Syntactic Analysis only need to identify some independent components with relatively simple structure, such as verb phrases (chunk). To avoid difficulties in constructing a syntactic tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking based SRL methods, which convert SRL as a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using BIO representation. For syntactic chunks forming a chunk of type A, the first chunk receives the B-A tag (Begin), the remaining ones receive the tag I-A (Inside), and all chunks outside receive the tag O-A. However, a complete syntactic analysis requires identifying the relation among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis* a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunk). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduces SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using **BIO representation**. For syntactic chunks forming role A, its first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out receive the tag O.
The BIO representation of above example is shown in Fig.1. The BIO representation of above example is shown in Fig.1.
<div align="center"> <div align="center">
<img src="image/bio_example_en.png" width = "90%" align=center /><br> <img src="image/bio_example_en.png" width = "90%" align=center /><br>
Fig 2. BIO represention Fig 2. BIO representation
</div> </div>
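As a concrete sketch of the scheme (with made-up role labels, not taken from the dataset), a chunk of type A1 spanning three words followed by a verb chunk would be tagged as two parallel lists:

```python
words  = ["A",    "record", "date", "has", "not", "been", "set"]
labels = ["B-A1", "I-A1",   "I-A1", "O",   "O",   "O",    "B-V"]
```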
This example illustrates the simplicity of sequence tagging, since:

1. it only relies on shallow syntactic analysis, which lowers the precision requirement on the syntactic analysis;
2. pruning the candidate arguments is no longer necessary;
3. arguments are identified and tagged at the same time. Simplifying the workflow reduces the risk of accumulating errors; oftentimes, methods that unify multiple steps boost performance.

In this tutorial, our SRL system is built as an end-to-end system via a neural network. The system takes only text sequences as input, without using any syntactic parsing results or complex hand-designed features. The public dataset of the [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) is used for the following task: given a sentence with the predicates marked, identify the corresponding arguments and their semantic roles through sequence tagging.

## Model

**Recurrent Neural Networks** (RNNs) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model dependencies between the elements of a sequence. As a variant of RNNs, LSTMs aim to model long-term dependencies in long sequences. We introduced this in the chapter [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
### Stacked Recurrent Neural Network

*Deep neural networks* can extract hierarchical representations: the higher layers form relatively abstract/complex representations on top of the primitive features discovered by the lower layers. Unfolding an LSTM through time results in a deep feed-forward neural network, because any computational path between the input at time $k < t$ and the output at time $t$ crosses several nonlinear layers. On the other hand, due to parameter sharing over time, an LSTM is also *shallow*: the computation carried out at each time-step is just a linear transformation. Deep LSTM networks are typically constructed by stacking multiple LSTM layers on top of each other and taking the output of the lower LSTM layer at time $t$ as the input of the upper LSTM layer at time $t$. Deep, hierarchical neural networks can be efficient at representing some functions and at modeling varying-length dependencies \[[2](#Reference)\].

However, in a deep LSTM network, any gradient propagated back in depth needs to traverse a large number of nonlinear steps. As a result, while a 4-layer LSTM can be trained properly, performance degrades as the depth grows towards 8 layers. Conventional LSTMs prevent back-propagated errors from vanishing or exploding by introducing shortcut connections that skip the intermediate nonlinear layers. Deep LSTMs can likewise use shortcut connections in depth.

A single LSTM cell has three operations:

1. input-to-hidden: map the input $x$ to the inputs of the forget gate, input gate, memory cell, and output gate by a linear transformation (i.e., matrix mapping);
2. hidden-to-hidden: calculate the forget gate, input gate, and output gate, and update the memory cell; this is the main part of an LSTM;
3. hidden-to-output: this part typically involves an activation operation on the hidden state; the standard gate equations are sketched below.
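For reference, a standard LSTM cell (the textbook formulation, without peephole connections, rather than PaddlePaddle's exact implementation) computes at each time step:

$$i_t = \sigma(W_{xi}x_t + W_{hi}h_{t-1} + b_i), \quad f_t = \sigma(W_{xf}x_t + W_{hf}h_{t-1} + b_f), \quad o_t = \sigma(W_{xo}x_t + W_{ho}h_{t-1} + b_o)$$

$$c_t = f_t \odot c_{t-1} + i_t \odot \tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c), \quad h_t = o_t \odot \tanh(c_t)$$

The $W_{x\cdot}x_t$ terms make up the input-to-hidden mapping, the gate computations and the cell update constitute hidden-to-hidden, and $h_t = o_t \odot \tanh(c_t)$ is the hidden-to-output step.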
Based on the stacked LSTMs, we add shortcut connections: take the input-to-hidden from the previous layer as a new input and learn another linear transformation.

Fig. 3 illustrates the final stacked recurrent neural network.

<p align="center">
<img src="./image/stacked_lstm_en.png" width = "40%" align=center><br>
@@ -107,9 +119,9 @@ Fig 3. Stacked Recurrent Neural Networks

### Bidirectional Recurrent Neural Network

While LSTMs can summarize the history (all the previous input seen up to now), they cannot see the future. Because most NLP (natural language processing) tasks provide entire sentences, sequential learning can benefit from having the future encoded as well as the history.

To address this, we can design a bidirectional recurrent neural network with a minor modification: a higher LSTM layer processes the sequence in the reversed direction with respect to its immediately lower LSTM layer, i.e., the deep LSTM layers take turns processing the input sequence from left to right and from right to left. Therefore, starting from the second layer, an LSTM layer at time-step $t$ can see both the history and the future. Fig. 4 illustrates the bidirectional recurrent neural network.

<p align="center">
@@ -117,16 +129,16 @@ To address the above drawbacks, we can design bidirectional recurrent neural net
Fig 4. Bidirectional LSTMs
</p>

Note that this bidirectional RNN is different from the one proposed by Bengio et al. for machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNN in the chapter on [machine translation](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.en.md).
### Conditional Random Field (CRF)

Typically, the lower layers of a neural network learn representations, while the top layer learns the final task; this principle guides the design here. In SRL tasks, a **Conditional Random Field** (CRF) is built on top of the network to perform the final prediction of the tag sequence. It takes as input the representations provided by the last LSTM layer.

A CRF is an undirected probabilistic graph model with nodes denoting random variables and edges denoting dependencies between these variables. In essence, CRFs learn the conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ is the input sequence and $Y = (y_1, y_2, ... , y_n)$ is the label sequence; decoding searches for the sequence $Y$ that maximizes the conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$.

Sequence tagging tasks only need to consider the input and the output as linear sequences, with no additional dependency assumptions on the graph model. Thus, the graph model of a sequence tagging task is a simple chain, which results in a **linear-chain conditional random field**, shown in Fig. 5.

<p align="center">
<img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
@@ -138,43 +150,43 @@ By the fundamental theorem of random fields \[[5](#Reference)\], the joint distr

$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$

where $Z(X)$ is the normalization constant. $t_j$ is a feature function defined on edges, called a *transition feature*; it depends on $y_{i-1}$ and $y_i$ and represents the transition probability from $y_{i-1}$ to $y_i$ given the input sequence $X$. $s_k$ is a feature function defined on nodes, called a *state feature*; it depends on $y_i$ and represents the probability of $y_i$ given the input sequence $X$. $\lambda_j$ and $\mu_k$ are the weights corresponding to $t_j$ and $s_k$. In fact, $t$ and $s$ can be written in the same form that depends on $y_{i - 1}$, $y_i$, $X$, and $i$; taking the summation over all nodes $i$ gives $f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$, which defines the *feature function* $f$. Thus, $P(Y|X)$ can be written as:

$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$

where $\omega$ are the weights of the feature functions, learned by the CRF. During training, given input sequences and label sequences $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, we construct the following objective function by regularized maximum likelihood estimation (MLE):

$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$

This objective function can be optimized via back-propagation in an end-to-end manner, jointly with the rest of the network. During decoding, given an input sequence $X$, we search for the sequence $\bar{Y}$ that maximizes the conditional probability $\bar{P}(Y|X)$ via a decoding method such as the Viterbi algorithm or [beam search](https://github.com/PaddlePaddle/book/blob/develop/07.machine_translation/README.en.md#Beam%20Search%20Algorithm).
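As an illustration of the decoding step, a minimal Viterbi decoder for a linear-chain model can be written with numpy. This is a generic sketch over assumed per-position scores and a transition matrix, not the decoder implemented inside `paddle.layer.crf_decoding`:

```python
import numpy as np

def viterbi_decode(emissions, transitions):
    """emissions: (seq_len, num_tags) per-position scores;
    transitions: (num_tags, num_tags) score of moving from tag i to tag j."""
    seq_len, num_tags = emissions.shape
    score = emissions[0].copy()            # best score of each tag at position 0
    backpointers = []
    for t in range(1, seq_len):
        # candidate[i, j]: best path ending in tag i at t-1, then taking tag j at t
        candidate = score[:, None] + transitions + emissions[t][None, :]
        backpointers.append(candidate.argmax(axis=0))
        score = candidate.max(axis=0)
    best_tag = int(score.argmax())
    best_path = [best_tag]
    for bp in reversed(backpointers):      # walk the backpointers to recover the path
        best_tag = int(bp[best_tag])
        best_path.append(best_tag)
    return list(reversed(best_path))

# e.g. viterbi_decode(np.random.rand(7, 5), np.random.rand(5, 5)) returns 7 tag ids
```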
### Deep Bidirectional LSTM (DB-LSTM) SRL model

Given the predicates and a sentence, SRL aims to identify the arguments of the given predicate and their semantic roles. If a sequence has $n$ predicates, we process the sequence $n$ times. Here is the breakdown of a straightforward model:

1. Construct inputs;
 - input 1: predicate, input 2: sentence
 - expand input 1 into a sequence of the same length as input 2, using one-hot representation;
2. Convert the one-hot sequences from step 1 into vector sequences via a word embedding lookup table;
3. Learn the representation of the input sequences by taking the vector sequences from step 2 as inputs;
4. Take the representation from step 3 as input and the label sequence as the supervisory signal, and perform sequence tagging.

Here, we propose some improvements by introducing two simple but effective features:

- predicate context (**ctx-p**): A single predicate word may not carry all the predicate information, especially when the same word appears more than once in a sentence. With the expanded context, the ambiguity can be largely eliminated. Thus, we extract $n$ words before and after the predicate to construct a window chunk.
- region mark ($m_r$): a binary marker on each word; $m_r$ takes the value $1$ when the word is in the predicate context region, and $0$ otherwise.

After these modifications, the model is as follows (see Fig. 6):

1. Construct inputs
 - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, i.e., the $n$ words before and after the predicate. Input 4: region mark sequence, where an entry is 1 if the word is located in the predicate context region, and 0 otherwise.
 - expand inputs 2~3 into sequences with the same length as input 1
2. Convert inputs 1~4 into vector sequences via word embedding lookup tables; inputs 1 and 3 share the same lookup table, while inputs 2 and 4 have separate lookup tables.
3. Take the four vector sequences from step 2 as inputs to bidirectional LSTMs; train the LSTMs to update the representations.
4. Take the representation from step 3 as input to a CRF, with the label sequence as the supervisory signal, and complete the sequence tagging task.
<div align="center"> <div align="center">
...@@ -184,9 +196,9 @@ Fig 6. DB-LSTM for SRL tasks ...@@ -184,9 +196,9 @@ Fig 6. DB-LSTM for SRL tasks
## Data Preparation ## Data Preparation
In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. It is important to note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, if you want to train a usable neural network SRL system, consider paying for the full corpus. In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. Note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, for a usable neural network SRL system, please consider paying for the full corpus.
The original data includes a variety of information such as POS tagging, naming entity recognition, parsing tree, and so on. In this tutorial, we only use the data under the words folder (text sequence) and the props folder (label results) inside test.wsj parent folder. The data directory used in this tutorial is as follows: The original data includes a variety of information such as POS tagging, naming entity recognition, syntax tree, etc. In this tutorial, we only use the data under `test.wsj/words/` (text sequence) and `test.wsj/props/` (label results). The data directory used in this tutorial is as follows:
```text ```text
conll05st-release/ conll05st-release/
...@@ -195,9 +207,9 @@ conll05st-release/ ...@@ -195,9 +207,9 @@ conll05st-release/
└── words # 输入文本序列 └── words # 输入文本序列
``` ```
The annotation information is derived from the results of Penn TreeBank \[[7](#references)\] and PropBank \[[8](#references)\]. The labeling of PropBank differs from the labeling methods mentioned before, but shares the same underlying principle. For a description of the labeling, please refer to the paper \[[9](#references)\].

The raw data needs to be preprocessed into a format that PaddlePaddle can handle. The preprocessing consists of the following steps:

1. Merge the text sequence and the tag sequence into the same record;
2. If a sentence contains $n$ predicates, the sentence is processed $n$ times into $n$ separate training samples, each with a different predicate;
@@ -212,7 +224,7 @@ The raw data needs to be preprocessed before used by PaddlePaddle. The preproces
```python
# conll05.test gets preprocessed training instances.
```
After preprocessing, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, and label sequence. The following table is an example of a training sample.

| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence|
|---|---|---|---|---|
@@ -234,9 +246,9 @@ In addition to the data, we provide following resources:
| predicate_dict | predicate dictionary, total 3162 predicates |
| emb | a pre-trained word vector lookup table, 32-dimensional |

We trained a language model on English Wikipedia to obtain a word vector lookup table used to initialize the SRL model. The word vector lookup table is not updated while training the SRL model. For more about the language model and the word vector lookup table, please refer to the [word vector](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) tutorial. The training corpus contains 995,000,000 tokens, and the dictionary size is 4,900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not among these 4,900,000 words; we treat them all as unknown words, represented by `<unk>`.

Here we fetch the dictionaries and print their sizes:
```python
import math
@@ -256,12 +268,12 @@ print label_dict_len
print pred_len
```
## Model Configuration

- Define input data dimensions and model hyperparameters.

```python
mark_dict_len = 2    # value range of the region mark; the mark is either 0 or 1, so the range is 2
word_dim = 32        # word vector dimension
mark_dim = 5         # dimension of the region mark embedding
hidden_dim = 512     # the dimension of the LSTM hidden layer vector is 128 (512/4)
@@ -291,9 +303,9 @@ mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
```

Note that `hidden_dim = 512` actually means an LSTM hidden vector of 128 dimensions (512/4). Please refer to PaddlePaddle's official documentation for details: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory).
- Transform the word sequence, the predicate, the predicate context, and the region mark sequence into embedding vector sequences.

```python
@@ -322,7 +334,7 @@ emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
```
- 8 LSTM units are trained in alternating left-to-right / right-to-left order, controlled by the `reverse` argument.

```python
hidden_0 = paddle.layer.mixed(
@@ -372,7 +384,7 @@ for i in range(1, depth):
input_tmp = [mix_hidden, lstm]
```
- We concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then, we put a fully connected layer on top of it to get the final vector representation.

```python
feature_out = paddle.layer.mixed(
@@ -386,7 +398,7 @@ for i in range(1, depth):
], )
```
- At the end of the network, we use the CRF as the cost function; the parameter of the CRF cost is named `crfw`.

```python
crf_cost = paddle.layer.crf(
@@ -399,7 +411,7 @@ crf_cost = paddle.layer.crf(
learning_rate=mix_hidden_lr))
```
- The CRF decoding layer is used for evaluation and inference. It shares weights with the CRF layer; sharing parameters among multiple layers is specified by using the same parameter name in those layers.

```python
crf_dec = paddle.layer.crf_decoding(
@@ -426,7 +438,7 @@ We can print out parameter name. It will be generated if not specified.
print parameters.keys()
```
Now we load the pre-trained word lookup table, built from word embeddings trained on English Wikipedia.

```python
def load_parameter(file_name, h, w):
@@ -438,7 +450,7 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
```
### Create Trainer

We create the trainer given the model topology, parameters, and optimization method. We use the most basic SGD method: a momentum optimizer with 0 momentum. Meanwhile, we set the learning rate and regularization.

```python
optimizer = paddle.optimizer.Momentum(
@@ -455,7 +467,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
```

### Trainer

As mentioned in the data preparation section, we use the CoNLL 2005 test corpus as the training data set. `conll05.test()` outputs one training instance at a time; the instances are shuffled and batched into mini-batches, which serve as input.
```python
reader = paddle.batch(
@@ -463,7 +475,7 @@ reader = paddle.batch(
conll05.test(), buf_size=8192), batch_size=20)
```
`feeding` is used to specify relationship between data instance and layer layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` correspond to data layer named `word_data`. `feeding` is used to specify the correspondence between data instance and data layer. For example, according to following `feeding`, the 0th column of data instance produced by`conll05.test()` is matched to the data layer named `word_data`.
```python ```python
feeding = { feeding = {
...@@ -479,7 +491,7 @@ feeding = { ...@@ -479,7 +491,7 @@ feeding = {
} }
``` ```
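The body of the `feeding` dictionary is elided by the diff. Purely as an illustration (the layer names besides `word_data` and the exact column indices are assumptions here; the full chapter defines them), such a map could look like:
```python
feeding = {
    'word_data': 0,    # 0th column of each instance feeds the `word_data` layer
    'ctx_n2_data': 1,
    'ctx_n1_data': 2,
    'ctx_0_data': 3,
    'ctx_p1_data': 4,
    'ctx_p2_data': 5,
    'verb_data': 6,
    'mark_data': 7,
    'target': 8
}
```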
`event_handler` can be used as a callback for training events; it is passed as an argument to the `train` method. The following `event_handler` prints the cost during training.
```python ```python
def event_handler(event): def event_handler(event):
...@@ -501,7 +513,7 @@ trainer.train( ...@@ -501,7 +513,7 @@ trainer.train(
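# The handler body and the call to `train` are truncated by the diff. A minimal sketch
# (assumed for illustration, not the chapter's exact code) that prints the cost every
# 100 batches and then starts training:
import sys

def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f" % (
                event.pass_id, event.batch_id, event.cost)
        else:
            sys.stdout.write('.')
            sys.stdout.flush()

trainer.train(
    reader=reader,
    event_handler=event_handler,
    num_passes=1,  # illustrative value
    feeding=feeding)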
## Conclusion
Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to illustrate how to use PaddlePaddle for sequence tagging tasks. The proposed models are from our published paper \[[10](#Reference)\]. We only use the test data for illustration, since the training data of the CoNLL 2005 dataset is not completely public. Our goal is an end-to-end neural network model that depends less on natural language processing tools yet performs comparably to, or even better than, traditional models. Please check out our paper for more information and discussion.
## Reference ## Reference
1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483. 1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
......
...@@ -135,7 +135,7 @@ $$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ ...@@ -135,7 +135,7 @@ $$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$
$\omega$ are the weights of the feature functions and are the parameters the CRF model learns. During training, given input sequences and their corresponding label sequences $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, we solve the following optimization objective by regularized maximum likelihood estimation:
$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$
This objective can be optimized together with the whole neural network by back-propagation. At decoding time, for a given input sequence $X$, a decoding algorithm (typically the Viterbi algorithm or beam search) finds the output sequence $\bar{Y}$ that maximizes the conditional probability $\bar{P}(Y|X)$.
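Written as a formula, the decoding step is (this merely restates the sentence above, using the $\argmax$ operator declared in the objective):
$$\bar{Y} = \argmax_{Y} \bar{P}(Y|X)$$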
......
...@@ -399,7 +399,7 @@ for param in parameters.keys(): ...@@ -399,7 +399,7 @@ for param in parameters.keys():
```python ```python
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost, trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters, parameters=parameters,
update_equation=optimizer) update_equation=optimizer)
...@@ -423,7 +423,7 @@ for param in parameters.keys(): ...@@ -423,7 +423,7 @@ for param in parameters.keys():
trainer.train( trainer.train(
reader=wmt14_reader, reader=wmt14_reader,
event_handler=event_handler, event_handler=event_handler,
num_passes=10000, num_passes=2,
feeding=feeding) feeding=feeding)
``` ```
......
...@@ -361,7 +361,7 @@ for param in parameters.keys(): ...@@ -361,7 +361,7 @@ for param in parameters.keys():
```python ```python
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost, trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters, parameters=parameters,
update_equation=optimizer) update_equation=optimizer)
...@@ -388,7 +388,7 @@ for param in parameters.keys(): ...@@ -388,7 +388,7 @@ for param in parameters.keys():
trainer.train( trainer.train(
reader=wmt14_reader, reader=wmt14_reader,
event_handler=event_handler, event_handler=event_handler,
num_passes=10000, num_passes=2,
feeding=feeding) feeding=feeding)
``` ```
......
...@@ -107,7 +107,7 @@ def main(): ...@@ -107,7 +107,7 @@ def main():
# define optimize method and trainer # define optimize method and trainer
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer) cost=cost, parameters=parameters, update_equation=optimizer)
...@@ -137,7 +137,7 @@ def main(): ...@@ -137,7 +137,7 @@ def main():
trainer.train( trainer.train(
reader=wmt14_reader, reader=wmt14_reader,
event_handler=event_handler, event_handler=event_handler,
num_passes=10000, num_passes=2,
feeding=feeding) feeding=feeding)
......
...@@ -441,7 +441,7 @@ for param in parameters.keys(): ...@@ -441,7 +441,7 @@ for param in parameters.keys():
```python ```python
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost, trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters, parameters=parameters,
update_equation=optimizer) update_equation=optimizer)
...@@ -465,7 +465,7 @@ for param in parameters.keys(): ...@@ -465,7 +465,7 @@ for param in parameters.keys():
trainer.train( trainer.train(
reader=wmt14_reader, reader=wmt14_reader,
event_handler=event_handler, event_handler=event_handler,
num_passes=10000, num_passes=2,
feeding=feeding) feeding=feeding)
``` ```
......
...@@ -403,7 +403,7 @@ for param in parameters.keys(): ...@@ -403,7 +403,7 @@ for param in parameters.keys():
```python ```python
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost, trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters, parameters=parameters,
update_equation=optimizer) update_equation=optimizer)
...@@ -430,7 +430,7 @@ for param in parameters.keys(): ...@@ -430,7 +430,7 @@ for param in parameters.keys():
trainer.train( trainer.train(
reader=wmt14_reader, reader=wmt14_reader,
event_handler=event_handler, event_handler=event_handler,
num_passes=10000, num_passes=2,
feeding=feeding) feeding=feeding)
``` ```
......
...@@ -87,6 +87,7 @@ The raw `MoiveLens` contains movie ratings, relevant features from both movies a ...@@ -87,6 +87,7 @@ The raw `MoiveLens` contains movie ratings, relevant features from both movies a
For instance, one movie's feature could be:
```python ```python
import paddle.v2 as paddle
movie_info = paddle.dataset.movielens.movie_info() movie_info = paddle.dataset.movielens.movie_info()
print movie_info.values()[0] print movie_info.values()[0]
``` ```
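Similarly, one can peek at a user's features through the same dataset module (an illustrative addition; `user_info()` is the user-side counterpart of `movie_info()` in `paddle.dataset.movielens`):
```python
user_info = paddle.dataset.movielens.user_info()
print user_info.values()[0]
```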
...@@ -254,7 +255,7 @@ Finally, we can use cosine similarity to calculate the similarity between user c ...@@ -254,7 +255,7 @@ Finally, we can use cosine similarity to calculate the similarity between user c
```python ```python
inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5) inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
cost = paddle.layer.regression_cost( cost = paddle.layer.mse_cost(
input=inference, input=inference,
label=paddle.layer.data( label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1))) name='score', type=paddle.data_type.dense_vector(1)))
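# Note: `paddle.layer.mse_cost` computes the mean squared error between the predicted
# rating and the ground-truth score; earlier revisions of this chapter used the same
# layer under the name `regression_cost`.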
...@@ -282,7 +283,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, ...@@ -282,7 +283,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,
```text ```text
[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]
``` ```
### Training ### Training
......
...@@ -268,7 +268,7 @@ inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_feature ...@@ -268,7 +268,7 @@ inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_feature
```python ```python
cost = paddle.layer.regression_cost( cost = paddle.layer.mse_cost(
input=inference, input=inference,
label=paddle.layer.data( label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1))) name='score', type=paddle.data_type.dense_vector(1)))
...@@ -287,7 +287,7 @@ parameters = paddle.parameters.create(cost) ...@@ -287,7 +287,7 @@ parameters = paddle.parameters.create(cost)
``` ```
[INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__mse_cost_0__]
`parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。 `parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。
...@@ -311,7 +311,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, ...@@ -311,7 +311,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,
``` ```
[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]
### 训练 ### 训练
......
...@@ -129,6 +129,7 @@ The raw `MoiveLens` contains movie ratings, relevant features from both movies a ...@@ -129,6 +129,7 @@ The raw `MoiveLens` contains movie ratings, relevant features from both movies a
For instance, one movie's feature could be:
```python ```python
import paddle.v2 as paddle
movie_info = paddle.dataset.movielens.movie_info() movie_info = paddle.dataset.movielens.movie_info()
print movie_info.values()[0] print movie_info.values()[0]
``` ```
...@@ -296,7 +297,7 @@ Finally, we can use cosine similarity to calculate the similarity between user c ...@@ -296,7 +297,7 @@ Finally, we can use cosine similarity to calculate the similarity between user c
```python ```python
inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5) inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
cost = paddle.layer.regression_cost( cost = paddle.layer.mse_cost(
input=inference, input=inference,
label=paddle.layer.data( label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1))) name='score', type=paddle.data_type.dense_vector(1)))
...@@ -324,7 +325,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, ...@@ -324,7 +325,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,
```text ```text
[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]
``` ```
### Training ### Training
......
...@@ -310,7 +310,7 @@ inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_feature ...@@ -310,7 +310,7 @@ inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_feature
```python ```python
cost = paddle.layer.regression_cost( cost = paddle.layer.mse_cost(
input=inference, input=inference,
label=paddle.layer.data( label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1))) name='score', type=paddle.data_type.dense_vector(1)))
...@@ -329,7 +329,7 @@ parameters = paddle.parameters.create(cost) ...@@ -329,7 +329,7 @@ parameters = paddle.parameters.create(cost)
``` ```
[INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__mse_cost_0__]
`parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。 `parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。
...@@ -353,7 +353,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, ...@@ -353,7 +353,7 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,
``` ```
[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]
[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__] [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]
### 训练 ### 训练
......
...@@ -61,7 +61,7 @@ def main(): ...@@ -61,7 +61,7 @@ def main():
inference = paddle.layer.cos_sim( inference = paddle.layer.cos_sim(
a=usr_combined_features, b=mov_combined_features, size=1, scale=5) a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
cost = paddle.layer.regression_cost( cost = paddle.layer.mse_cost(
input=inference, input=inference,
label=paddle.layer.data( label=paddle.layer.data(
name='score', type=paddle.data_type.dense_vector(1))) name='score', type=paddle.data_type.dense_vector(1)))
......
# Deep Learning with PaddlePaddle
1. [Fit a Line](http://book.paddlepaddle.org/01.fit_a_line/index.en.html)
1. [Recognize Digits](http://book.paddlepaddle.org/02.recognize_digits/index.en.html)
1. [Image Classification](http://book.paddlepaddle.org/03.image_classification/index.en.html)
1. [Word to Vector](http://book.paddlepaddle.org/04.word2vec/index.en.html)
1. [Understand Sentiment](http://book.paddlepaddle.org/05.understand_sentiment/index.en.html)
1. [Label Semantic Roles](http://book.paddlepaddle.org/06.label_semantic_roles/index.en.html)
1. [Machine Translation](http://book.paddlepaddle.org/07.machine_translation/index.en.html)
1. [Recommender System](http://book.paddlepaddle.org/08.recommender_system/index.en.html)
## Running the Book
This book you are reading is interactive -- each chapter can run as a Jupyter Notebook.
We packed this book, Jupyter, PaddlePaddle, and all dependencies into a Docker image, so you don't need to install anything except Docker. If you are using Windows, please follow [this installation guide](https://www.docker.com/docker-windows). If you are using a Mac, please follow [this one](https://www.docker.com/docker-mac). For the various Linux distros, please refer to https://www.docker.com. If you are using Windows or Mac, you might want to give Docker [more memory and CPUs/cores](http://stackoverflow.com/a/39720010/724872).
Just type
```bash
docker run -d -p 8888:8888 paddlepaddle/book
```
This command will download the pre-built Docker image from DockerHub.com and run it in a container. Please direct your Web browser to http://localhost:8888 to read the book.
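If port 8888 is already in use on your host, you can map the notebook to a different host port with standard Docker port mapping, e.g. `docker run -d -p 8866:8888 paddlepaddle/book` (the host port here is only an example), and then browse to http://localhost:8866 instead.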
If DockerHub.com is slow to access from where you are, you might try our mirror server docker.paddlepaddle.org:
```bash
docker run -d -p 8888:8888 docker.paddlepaddle.org/book
```
## Contribute
Your contribution is welcome! Please feel free to file Pull Requests to add your chapter as a directory under `/pending`. Once it becomes stable, the community will help move it to `/`.
To write, run, and debug your chapters, you will need Python 2.x and Go >1.5. You can build the Docker image using [this script](https://github.com/PaddlePaddle/book/blob/develop/.tools/convert-markdown-into-ipynb-and-test.sh).
This tutorial is contributed by <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a>, and licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>. This tutorial is contributed by <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a>, and licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.
# 深度学习入门
[![Build Status](https://travis-ci.org/PaddlePaddle/book.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/book)
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://book.paddlepaddle.org/index.en.html)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://book.paddlepaddle.org)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

1. [新手入门](http://book.paddlepaddle.org/01.fit_a_line)
1. [识别数字](http://book.paddlepaddle.org/02.recognize_digits)
1. [图像分类](http://book.paddlepaddle.org/03.image_classification)
1. [词向量](http://book.paddlepaddle.org/04.word2vec)
1. [情感分析](http://book.paddlepaddle.org/05.understand_sentiment)
1. [语义角色标注](http://book.paddlepaddle.org/06.label_semantic_roles)
1. [机器翻译](http://book.paddlepaddle.org/07.machine_translation)
1. [个性化推荐](http://book.paddlepaddle.org/08.recommender_system)
## 运行这本书
您现在在看的这本书是一本“交互式”电子书 —— 每一章都可以运行在一个
Jupyter Notebook 里。
我们把 Jupyter、PaddlePaddle、以及各种被依赖的软件都打包进一个 Docker
image 了。所以您不需要自己来安装各种软件,只需要安装 Docker 即可。如果
您使用 Windows,可以参考
[这里](https://www.docker.com/docker-windows)。如果您使用 Mac,可以
参考[这里](https://www.docker.com/docker-mac)。 对于各种 Linux 发行版,
请参考https://www.docker.com 。如果您使用 Windows 或者 Mac,可以通过如
下方法给 Docker [更多内存和CPU资源](http://stackoverflow.com/a/39720010/724872)。
只需要在命令行窗口里运行:
```bash
docker run -d -p 8888:8888 paddlepaddle/book
```
这个命令会从 DockerHub.com 下载本书的 Docker image 并且运行之。请在浏
览器里访问 http://localhost:8888 即可阅读和在线编辑本书。
如果您访问 DockerHub.com 很慢,可以试试我们的另一个镜像
docker.paddlepaddle.org:
```bash
docker run -d -p 8888:8888 docker.paddlepaddle.org/book
```
## 贡献内容
您要是能贡献新的章节那就太好了!请发 Pull Requests 把您写的章节加入到
`/pending` 下面的一个子目录里。当这一章稳定下来,我们一起把您的目录挪
到根目录。
为了写作、运行、调试,您需要安装 Python 2.x, Go >1.5. 你可以用这
[脚本程序](https://github.com/PaddlePaddle/book/blob/develop/.tools/convert-markdown-into-ipynb-and-test.sh)
生成 Docker image。
**Note:** We also provide an [English Readme](https://github.com/PaddlePaddle/book/blob/develop/README.en.md) for the PaddlePaddle book.
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议</a>进行许可。 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议</a>进行许可。
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 图像分类\n",
"\n",
"本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。\n",
"\n",
"## 背景介绍\n",
"\n",
"图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。\n",
"\n",
"图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。\n",
"\n",
"\n",
"一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。\n",
"\n",
"而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于\"输入-输出\"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。\n",
"\n",
"本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。\n",
"\n",
"## 效果展示\n",
"\n",
"图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/dog_cat.png \" width=\"350\" \u003e\u003cbr/\u003e\n",
"图1. 通用图像分类展示\n",
"\u003c/p\u003e\n",
"\n",
"\n",
"图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。\n",
"\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/flowers.png\" width=\"400\" \u003e\u003cbr/\u003e\n",
"图2. 细粒度图像分类展示\n",
"\u003c/p\u003e\n",
"\n",
"\n",
"一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/variations.png\" width=\"550\" \u003e\u003cbr/\u003e\n",
"图3. 扰动图片展示[22]\n",
"\u003c/p\u003e\n",
"\n",
"## 模型概览\n",
"\n",
"图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。\n",
"\n",
"在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。\n",
" 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \\[[1](#参考文献)\\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \\[[2](#参考文献)\\]、LBP(Local Bianray Pattern, 局部二值模式) \\[[3](#参考文献)\\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。\n",
" 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \\[[4](#参考文献)\\]、稀疏编码 \\[[5](#参考文献)\\]、局部线性约束编码 \\[[6](#参考文献)\\]、Fisher向量编码 \\[[7](#参考文献)\\] 等。\n",
" 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。\n",
" 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。\n",
"\n",
"这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \\[[18](#参考文献)\\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \\[[8](#参考文献)\\]。\n",
"\n",
"Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \\[[9](#参考文献)\\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/ilsvrc.png\" width=\"500\" \u003e\u003cbr/\u003e\n",
"图4. ILSVRC图像分类Top-5错误率\n",
"\u003c/p\u003e\n",
"\n",
"### CNN\n",
"\n",
"传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/lenet.png\"\u003e\u003cbr/\u003e\n",
"图5. CNN网络示例[20]\n",
"\u003c/p\u003e\n",
"\n",
"- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。\n",
"- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。\n",
"- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。\n",
"- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。\n",
"- Dropout \\[[10](#参考文献)\\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。\n",
"\n",
"另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \\[[14](#参考文献)\\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。\n",
"\n",
"接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。\n",
"\n",
"### VGG\n",
"\n",
"牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \\[[11](#参考文献)\\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\\[[19](#参考文献)\\]就是借鉴VGG模型的结构。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/vgg16.png\" width=\"750\" \u003e\u003cbr/\u003e\n",
"图6. 基于ImageNet的VGG16模型\n",
"\u003c/p\u003e\n",
"\n",
"### GoogleNet\n",
"\n",
"GoogleNet \\[[12](#参考文献)\\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \\[[13](#参考文献)\\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。\n",
"\n",
"NIN模型主要有两个特点:1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。\n",
"\n",
"Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/inception.png\" width=\"800\" \u003e\u003cbr/\u003e\n",
"图7. Inception模块\n",
"\u003c/p\u003e\n",
"\n",
"GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。\n",
"\n",
"GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/googlenet.jpeg\" \u003e\u003cbr/\u003e\n",
"图8. GoogleNet[12]\n",
"\u003c/p\u003e\n",
"\n",
"\n",
"上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \\[[14](#参考文献)\\] 引入BN层;GoogleNet-v3 \\[[16](#参考文献)\\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \\[[17](#参考文献)\\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,介于篇幅,这里不再详细介绍v2到v4的结构。\n",
"\n",
"\n",
"### ResNet\n",
"\n",
"ResNet(Residual Network) \\[[15](#参考文献)\\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。\n",
"\n",
"残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256-\u003e64),下面的1x1卷积用来升维(图示例即64-\u003e256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64-\u003e64)。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/resnet_block.jpg\" width=\"400\"\u003e\u003cbr/\u003e\n",
"图9. 残差模块\n",
"\u003c/p\u003e\n",
"\n",
"图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/resnet.png\"\u003e\u003cbr/\u003e\n",
"图10. 基于ImageNet的ResNet模型\n",
"\u003c/p\u003e\n",
"\n",
"\n",
"## 数据准备\n",
"\n",
"通用图像分类公开的标准数据集常用的有[CIFAR](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。\n",
"\n",
"由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html\u003e)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/cifar.png\" width=\"350\"\u003e\u003cbr/\u003e\n",
"图11. CIFAR10数据集[21]\n",
"\u003c/p\u003e\n",
"\n",
"Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。\n",
"\n",
"通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。\n",
"\n",
"### 模型结构\n",
"\n",
"#### Paddle 初始化\n",
"\n",
"通过 `paddle.init`,初始化Paddle是否使用GPU,trainer的数目等等。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"import sys\n",
"import paddle.v2 as paddle\n",
"from vgg import vgg_bn_drop\n",
"from resnet import resnet_cifar10\n",
"\n",
"# PaddlePaddle init\n",
"paddle.init(use_gpu=False, trainer_count=1)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"本教程中我们提供了VGG和ResNet两个模型的配置。\n",
"\n",
"#### VGG\n",
"\n",
"首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。\n",
"\n",
"1. 定义数据输入及其维度\n",
"\n",
" 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
" datadim = 3 * 32 * 32\n",
" classdim = 10\n",
"\n",
" image = paddle.layer.data(\n",
" name=\"image\", type=paddle.data_type.dense_vector(datadim))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"2. 定义VGG网络核心模块\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
" net = vgg_bn_drop(image)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
" def vgg_bn_drop(input):\n",
" def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):\n",
" return paddle.networks.img_conv_group(\n",
" input=ipt,\n",
" num_channels=num_channels,\n",
" pool_size=2,\n",
" pool_stride=2,\n",
" conv_num_filter=[num_filter] * groups,\n",
" conv_filter_size=3,\n",
" conv_act=paddle.activation.Relu(),\n",
" conv_with_batchnorm=True,\n",
" conv_batchnorm_drop_rate=dropouts,\n",
" pool_type=paddle.pooling.Max())\n",
"\n",
" conv1 = conv_block(input, 64, 2, [0.3, 0], 3)\n",
" conv2 = conv_block(conv1, 128, 2, [0.4, 0])\n",
" conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])\n",
" conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])\n",
" conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])\n",
"\n",
" drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)\n",
" fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())\n",
" bn = paddle.layer.batch_norm(\n",
" input=fc1,\n",
" act=paddle.activation.Relu(),\n",
" layer_attr=paddle.attr.Extra(drop_rate=0.5))\n",
" fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())\n",
" return fc2\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv-\u003eBN-\u003eReLu-\u003eDropout` 和 一组 `Pooling` 组成,\n",
"\n",
" 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。\n",
"\n",
" 2.3. 最后接两层512维的全连接。\n",
"\n",
"3. 定义分类器\n",
"\n",
" 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
" out = paddle.layer.fc(input=net,\n",
" size=classdim,\n",
" act=paddle.activation.Softmax())\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"4. 定义损失函数和网络输出\n",
"\n",
" 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
" lbl = paddle.layer.data(\n",
" name=\"label\", type=paddle.data_type.integer_value(classdim))\n",
" cost = paddle.layer.classification_cost(input=out, label=lbl)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### ResNet\n",
"\n",
"ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"net = resnet_cifar10(image, depth=56)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。\n",
"\n",
" - `conv_bn_layer` : 带BN的卷积层。\n",
" - `shortcut` : 残差模块的\"直连\"路径,\"直连\"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。\n",
" - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条\"直连\"路径组成。\n",
" - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条\"直连\"路径组成。\n",
" - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"def conv_bn_layer(input,\n",
" ch_out,\n",
" filter_size,\n",
" stride,\n",
" padding,\n",
" active_type=paddle.activation.Relu(),\n",
" ch_in=None):\n",
" tmp = paddle.layer.img_conv(\n",
" input=input,\n",
" filter_size=filter_size,\n",
" num_channels=ch_in,\n",
" num_filters=ch_out,\n",
" stride=stride,\n",
" padding=padding,\n",
" act=paddle.activation.Linear(),\n",
" bias_attr=False)\n",
" return paddle.layer.batch_norm(input=tmp, act=active_type)\n",
"\n",
"def shortcut(ipt, n_in, n_out, stride):\n",
" if n_in != n_out:\n",
" return conv_bn_layer(ipt, n_out, 1, stride, 0,\n",
" paddle.activation.Linear())\n",
" else:\n",
" return ipt\n",
"\n",
"def basicblock(ipt, ch_out, stride):\n",
" ch_in = ch_out * 2\n",
" tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)\n",
" tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())\n",
" short = shortcut(ipt, ch_in, ch_out, stride)\n",
" return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())\n",
"\n",
"def layer_warp(block_func, ipt, features, count, stride):\n",
" tmp = block_func(ipt, features, stride)\n",
" for i in range(1, count):\n",
" tmp = block_func(tmp, features, 1)\n",
" return tmp\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"`resnet_cifar10` 的连接结构主要有以下几个过程。\n",
"\n",
"1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。\n",
"2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。\n",
"3. 最后对网络做均值池化并返回该层。\n",
"\n",
"注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"def resnet_cifar10(ipt, depth=32):\n",
" # depth should be one of 20, 32, 44, 56, 110, 1202\n",
" assert (depth - 2) % 6 == 0\n",
" n = (depth - 2) / 6\n",
" nStages = {16, 64, 128}\n",
" conv1 = conv_bn_layer(\n",
" ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)\n",
" res1 = layer_warp(basicblock, conv1, 16, n, 1)\n",
" res2 = layer_warp(basicblock, res1, 32, n, 2)\n",
" res3 = layer_warp(basicblock, res2, 64, n, 2)\n",
" pool = paddle.layer.img_pool(\n",
" input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())\n",
" return pool\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## 训练模型\n",
"\n",
"### 定义参数\n",
"\n",
"首先依据模型配置的`cost`定义模型参数。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Create parameters\n",
"parameters = paddle.parameters.create(cost)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"print parameters.keys()\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### 构造训练(Trainer)\n",
"\n",
"根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的Momentum方法,同时设定了学习率、正则等。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Create optimizer\n",
"momentum_optimizer = paddle.optimizer.Momentum(\n",
" momentum=0.9,\n",
" regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),\n",
" learning_rate=0.1 / 128.0,\n",
" learning_rate_decay_a=0.1,\n",
" learning_rate_decay_b=50000 * 100,\n",
" learning_rate_schedule='discexp',\n",
" batch_size=128)\n",
"\n",
"# Create trainer\n",
"trainer = paddle.trainer.SGD(cost=cost,\n",
" parameters=parameters,\n",
" update_equation=momentum_optimizer)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为 `settings` 里设置的 `learning_rate`。\n",
"\n",
"$$ lr = lr_{0} * a^ {\\lfloor \\frac{n}{ b}\\rfloor} $$\n",
"\n",
"\n",
"### 训练\n",
"\n",
"cifar.train10()每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"reader=paddle.batch(\n",
" paddle.reader.shuffle(\n",
" paddle.dataset.cifar.train10(), buf_size=50000),\n",
" batch_size=128)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"通过`feeding`来指定每一个数据和`paddle.layer.data`的对应关系。例如: `cifar.train10()`产生数据的第0列对应image层的特征。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"feeding={'image': 0,\n",
" 'label': 1}\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# End batch and end pass event handler\n",
"def event_handler(event):\n",
" if isinstance(event, paddle.event.EndIteration):\n",
" if event.batch_id % 100 == 0:\n",
" print \"\\nPass %d, Batch %d, Cost %f, %s\" % (\n",
" event.pass_id, event.batch_id, event.cost, event.metrics)\n",
" else:\n",
" sys.stdout.write('.')\n",
" sys.stdout.flush()\n",
" if isinstance(event, paddle.event.EndPass):\n",
" result = trainer.test(\n",
" reader=paddle.batch(\n",
" paddle.dataset.cifar.test10(), batch_size=128),\n",
" feeding=feeding)\n",
" print \"\\nTest with Pass %d, %s\" % (event.pass_id, result.metrics)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"通过`trainer.train`函数训练:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"trainer.train(\n",
" reader=reader,\n",
" num_passes=200,\n",
" event_handler=event_handler,\n",
" feeding=feeding)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"一轮训练log示例如下所示,经过1个pass, 训练集上平均error为0.6875 ,测试集上平均error为0.8852 。\n",
"\n",
"```text\n",
"Pass 0, Batch 0, Cost 2.473182, {'classification_error_evaluator': 0.9140625}\n",
"...................................................................................................\n",
"Pass 0, Batch 100, Cost 1.913076, {'classification_error_evaluator': 0.78125}\n",
"...................................................................................................\n",
"Pass 0, Batch 200, Cost 1.783041, {'classification_error_evaluator': 0.7421875}\n",
"...................................................................................................\n",
"Pass 0, Batch 300, Cost 1.668833, {'classification_error_evaluator': 0.6875}\n",
"..........................................................................................\n",
"Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}\n",
"```\n",
"\n",
"图12是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/plot.png\" width=\"400\" \u003e\u003cbr/\u003e\n",
"图12. CIFAR10数据集上VGG模型的分类错误率\n",
"\u003c/p\u003e\n",
"\n",
"\n",
"## 总结\n",
"\n",
"传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。\n",
"\n",
"\n",
"## 参考文献\n",
"\n",
"[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.\n",
"\n",
"[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.\n",
"\n",
"[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.\n",
"\n",
"[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.\n",
"\n",
"[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.\n",
"\n",
"[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.\n",
"\n",
"[7] Perronnin, F., Sánchez, J., \u0026 Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).\n",
"\n",
"[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.\n",
"\n",
"[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.\n",
"\n",
"[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.\n",
"\n",
"[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。\n",
"\n",
"[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)\n",
"\n",
"[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.\n",
"\n",
"[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.\n",
"\n",
"[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016.\n",
"\n",
"[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).\n",
"\n",
"[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).\n",
"\n",
"[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015.\n",
"\n",
"[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.\n",
"\n",
"[20] http://deeplearning.net/tutorial/lenet.html\n",
"\n",
"[21] https://www.cs.toronto.edu/~kriz/cifar.html\n",
"\n",
"[22] http://cs231n.github.io/classification/\n",
"\n",
"\u003cbr/\u003e\n",
"\u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e\u003cimg alt=\"知识共享许可协议\" style=\"border-width:0\" src=\"https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png\" /\u003e\u003c/a\u003e\u003cbr /\u003e\u003cspan xmlns:dct=\"http://purl.org/dc/terms/\" href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" rel=\"dct:type\"\u003e本教程\u003c/span\u003e 由 \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e 创作,采用 \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议\u003c/a\u003e进行许可。\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
图像分类
=======
本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)
## 背景介绍
图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们传递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。
图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。
一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。
而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。
本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。
## 效果展示
图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。
<p align="center">
<img src="image/dog_cat.png " width="350" ><br/>
图1. 通用图像分类展示
</p>
图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。
<p align="center">
<img src="image/flowers.png" width="400" ><br/>
图2. 细粒度图像分类展示
</p>
一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。
<p align="center">
<img src="image/variations.png" width="550" ><br/>
图3. 扰动图片展示[22]
</p>
## 模型概览
图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。
在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。
1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。
2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。
3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。
4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。
这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
<p align="center">
<img src="image/ilsvrc.png" width="500" ><br/>
图4. ILSVRC图像分类Top-5错误率
</p>
### CNN
传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。
<p align="center">
<img src="image/lenet.png"><br/>
图5. CNN网络示例[20]
</p>
- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。
- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。
- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。
- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。
- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。
另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。例如在2015年Sergey Ioffe和Christian Szegedy提出的Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。
接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。
### VGG
牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
<p align="center">
<img src="image/vgg16.png" width="750" ><br/>
图6. 基于ImageNet的VGG16模型
</p>
### GoogleNet
GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。
NIN模型主要有两个特点:1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。
Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。
<p align="center">
<img src="image/inception.png" width="800" ><br/>
图7. Inception模块
</p>
GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。
GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。
<p align="center">
<img src="image/googlenet.jpeg" ><br/>
图8. GoogleNet[12]
</p>
上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入BN层;GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \[[17](#参考文献)\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,限于篇幅,这里不再详细介绍v2到v4的结构。
### ResNet
ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。
残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
<p align="center">
<img src="image/resnet_block.jpg" width="400"><br/>
图9. 残差模块
</p>
图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。
<p align="center">
<img src="image/resnet.png"><br/>
Figure 10. ResNet models for ImageNet
</p>
## Data Preparation
### Data Introduction and Download
Commonly used public datasets for generic image classification include [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/) and [COCO](http://mscoco.org/); commonly used fine-grained classification datasets include [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/) and [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/). Among them ImageNet is the largest, and as mentioned in the [Model Overview](#模型概览) section, a large body of research is based on it. ImageNet has changed only slightly since 2010; the commonly used ImageNet-2012 dataset contains 1000 classes, with 1,281,167 training images (732 to 1300 per class) and 50,000 validation images (50 per class on average).
Since ImageNet is large and slow to download and train on, we use the [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset in this tutorial. CIFAR10 contains 60,000 32x32 color images in 10 classes, 6,000 images per class; 50,000 images are used for training and 10,000 for testing. Figure 11 shows all ten classes, with 10 randomly sampled images per class.
<p align="center">
<img src="image/cifar.png" width="350"><br/>
Figure 11. The CIFAR10 dataset [21]
</p>
The following command downloads the data and computes the mean image over the training set; before images are fed to the network, this mean is subtracted from them as preprocessing. (A short sanity check of the downloaded files is shown after the command.)
```bash
./data/get_data.sh
```
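After the script finishes, the CIFAR10 python batches and the mean file can be inspected directly. The snippet below is only a quick sanity check; the paths assume the default layout produced by `get_data.sh` (everything under `data/`), so adjust them if your layout differs.
```python
import cPickle
import numpy as np

# Paths assume get_data.sh was run and the repository root is the working directory.
with open('data/cifar-10-batches-py/data_batch_1', 'rb') as f:
    batch = cPickle.load(f)
mean = np.load('data/mean.meta')['mean']

print(batch['data'].shape)   # (10000, 3072): 10000 images, 3*32*32 pixels each
print(len(batch['labels']))  # 10000 labels in [0, 9]
print(mean.shape)            # (3072,): per-pixel mean of the training set
```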
### Providing Data to PaddlePaddle
We pass data to the system through a Python interface; the `dataprovider.py` below gives a complete example for CIFAR10.
- The `initializer` function initializes the data provider: it loads the image mean and defines the types of the two input fields, image and label.
- The `process` function streams samples to the system one by one. For image classification, data augmentation can be done in this function before the data is handed to PaddlePaddle; here, training images are randomly flipped horizontally and the mean image is subtracted before each sample is yielded.
```python
import numpy as np
import cPickle
from paddle.trainer.PyDataProvider2 import *


def initializer(settings, mean_path, is_train, **kwargs):
    settings.is_train = is_train
    settings.input_size = 3 * 32 * 32
    settings.mean = np.load(mean_path)['mean']
    settings.input_types = {
        'image': dense_vector(settings.input_size),
        'label': integer_value(10)
    }


@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list):
    with open(file_list, 'r') as fdata:
        for fname in fdata:
            fo = open(fname.strip(), 'rb')
            batch = cPickle.load(fo)
            fo.close()
            images = batch['data']
            labels = batch['labels']
            for im, lab in zip(images, labels):
                if settings.is_train and np.random.randint(2):
                    # random horizontal flip for training images
                    im = im.reshape(3, 32, 32)
                    im = im[:, :, ::-1]
                    im = im.flatten()
                im = im - settings.mean
                yield {
                    'image': im.astype('float32'),
                    'label': int(lab)
                }
```
## Model Configuration
### Data Definition
In the model configuration, `define_py_data_sources2` reads data from the data provider, and `args` passes the path of the mean file. If this configuration file is used for prediction, the data definition part is not needed.
```python
from paddle.trainer_config_helpers import *

is_predict = get_config_arg("is_predict", bool, False)
if not is_predict:
    define_py_data_sources2(
        train_list='data/train.list',
        test_list='data/test.list',
        module='dataprovider',
        obj='process',
        args={'mean_path': 'data/mean.meta'})
```
### Algorithm Configuration
In the model configuration, `settings` specifies the optimization algorithm used for training, including the batch size, the initial learning rate, momentum and L2 regularization.
```python
settings(
    batch_size=128,
    learning_rate=0.1 / 128.0,
    learning_rate_decay_a=0.1,
    learning_rate_decay_b=50000 * 100,
    learning_rate_schedule='discexp',
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * 128), )
```
The learning-rate schedule is controlled by `learning_rate_decay_a` (written $a$ below), `learning_rate_decay_b` (written $b$) and `learning_rate_schedule`. Here we use a discrete exponential schedule, computed by the formula below, where $n$ is the cumulative number of processed samples and $lr_{0}$ is the `learning_rate` set in `settings`.
$$ lr = lr_{0} * a^{\lfloor \frac{n}{b} \rfloor} $$
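As a quick check of this schedule, here is a plain-Python sketch using the values set in `settings` above:
```python
import math

def discexp(n, lr0=0.1 / 128.0, a=0.1, b=50000 * 100):
    # n: cumulative number of training samples processed so far
    return lr0 * a ** int(math.floor(n / float(b)))

print(discexp(0))            # 0.00078125: the initial learning rate
print(discexp(50000 * 100))  # 0.000078125: 10x lower after 100 passes over CIFAR10
```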
### Model Architecture
This tutorial provides configurations for two models: VGG and ResNet.
#### VGG
We first introduce the VGG model. Since CIFAR10 images are much smaller and fewer than ImageNet's, the model here is adapted to CIFAR10, and BN and Dropout are added to the convolutional part.
1. Define the data input and its dimensions
The network input is defined as a `data_layer`; for image classification it holds the image pixels. CIFAR10 images are RGB color images with 3 channels and 32x32 pixels, so the input size is 3072 (3x32x32), and the number of classes is 10, i.e. a 10-way classification.
```python
datadim = 3 * 32 * 32
classdim = 10
data = data_layer(name='image', size=datadim)
```
2. Define the core VGG module
```python
net = vgg_bn_drop(data)
```
The input of the VGG core module is the data layer. `vgg_bn_drop` defines a 16-layer VGG structure in which every convolution is followed by a BN layer and a Dropout layer. The detailed definition is:
```python
def vgg_bn_drop(input):
    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
        return img_conv_group(
            input=ipt,
            num_channels=num_channels,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act=ReluActivation(),
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
            pool_type=MaxPooling())

    conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = dropout_layer(input=conv5, dropout_rate=0.5)
    fc1 = fc_layer(input=drop, size=512, act=LinearActivation())
    bn = batch_norm_layer(
        input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
    fc2 = fc_layer(input=bn, size=512, act=LinearActivation())
    return fc2
```
2.1. First a group of convolutions, `conv_block`, is defined. The filter size is 3x3, the pooling window is 2x2 with stride 2, `groups` determines how many consecutive convolutions each VGG group contains, and `dropouts` specifies the Dropout probabilities. The `img_conv_group` used here is a predefined module in `paddle.trainer_config_helpers`, consisting of several `Conv->BN->ReLU->Dropout` blocks followed by one `Pooling`.
2.2. Five groups of convolutions, i.e. 5 `conv_block`s: the first and second groups each use two consecutive convolutions, and the third, fourth and fifth groups use three. The Dropout probability after the last convolution in each group is 0, i.e. no Dropout is applied there.
2.3. Finally, two 512-dimensional fully-connected layers are appended.
3. Define the classifier
The high-level features extracted by the VGG network above are mapped by a fully-connected layer to a vector with one entry per class, which is then normalized by Softmax into class probabilities; this acts as the classifier. (A NumPy sketch of softmax and cross-entropy is given after this list.)
```python
out = fc_layer(input=net, size=classdim, act=SoftmaxActivation())
```
4. Define the loss function and the network output
For supervised training, the class labels of the images are also needed and are likewise defined with a `data_layer`. Training uses the multi-class cross-entropy as the loss and as the network output; at prediction time the network output is the probability vector produced by the classifier.
```python
if not is_predict:
    lbl = data_layer(name="label", size=classdim)
    cost = classification_cost(input=out, label=lbl)
    outputs(cost)
else:
    outputs(out)
```
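For reference, the softmax normalization and the multi-class cross-entropy used above can be written out directly. This NumPy sketch only illustrates what `SoftmaxActivation` and `classification_cost` compute for a single sample; it is not PaddlePaddle API code.
```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())      # subtract the max for numerical stability
    return e / e.sum()

logits = np.array([1.0, 2.0, 0.5, -1.0, 0.0, 0.3, 1.5, -0.2, 0.8, 2.2])  # 10 classes
prob = softmax(logits)           # class probabilities, sum to 1
label = 3                        # ground-truth class index
loss = -np.log(prob[label])      # multi-class cross-entropy for this sample
print("sum of probabilities: %.3f, loss: %.3f" % (prob.sum(), loss))
```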
#### ResNet
Steps 1, 3 and 4 of the ResNet model are the same as for the VGG model, so they are not repeated here. We only describe step 2, the ResNet core module on the CIFAR10 dataset.
```python
net = resnet_cifar10(data, depth=56)
```
We first introduce some basic functions used in `resnet_cifar10`, and then describe how the network is connected.
- `conv_bn_layer`: a convolutional layer followed by BN.
- `shortcut`: the "identity" path of a residual module. There are actually two cases: when the numbers of input and output channels of the module differ, a 1x1 convolution is used to increase the dimensionality; when they are equal, the input is passed through directly.
- `basicblock`: a basic residual module, as shown on the left of Figure 9, consisting of a path with two 3x3 convolutions and a shortcut path.
- `bottleneck`: a bottleneck residual module, as shown on the right of Figure 9, consisting of a path with 1x1, 3x3 and 1x1 convolutions and a shortcut path.
- `layer_warp`: a group of stacked residual modules. The stride of the first module in each group may differ from the others, so that the feature maps can be reduced in the horizontal and vertical directions.
```python
def conv_bn_layer(input,
                  ch_out,
                  filter_size,
                  stride,
                  padding,
                  active_type=ReluActivation(),
                  ch_in=None):
    tmp = img_conv_layer(
        input=input,
        filter_size=filter_size,
        num_channels=ch_in,
        num_filters=ch_out,
        stride=stride,
        padding=padding,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(input=tmp, act=active_type)


def shortcut(ipt, n_in, n_out, stride):
    if n_in != n_out:
        return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation())
    else:
        return ipt


def basicblock(ipt, ch_out, stride):
    ch_in = ipt.num_filters
    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation())
    short = shortcut(ipt, ch_in, ch_out, stride)
    return addto_layer(input=[tmp, short], act=ReluActivation())


def bottleneck(ipt, ch_out, stride):
    ch_in = ipt.num_filters
    tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0)
    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1)
    tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation())
    short = shortcut(ipt, ch_in, ch_out * 4, stride)
    return addto_layer(input=[tmp, short], act=ReluActivation())


def layer_warp(block_func, ipt, features, count, stride):
    tmp = block_func(ipt, features, stride)
    for i in range(1, count):
        tmp = block_func(tmp, features, 1)
    return tmp
```
The connection structure of `resnet_cifar10` consists of the following steps.
1. The raw input is connected to a `conv_bn_layer`, i.e. a convolutional layer with BN.
2. Three groups of residual modules follow, i.e. the three `layer_warp`s configured below; each group is built from the basic residual module shown on the left of Figure 9.
3. Finally, average pooling is applied and that layer is returned.
Note: apart from the first convolutional layer and the last fully-connected layer, the total number of parameterized layers in the three `layer_warp`s must be divisible by 6; that is, the `depth` of `resnet_cifar10` must satisfy `(depth - 2) % 6 == 0` (a quick numerical check follows the code below).
```python
def resnet_cifar10(ipt, depth=56):
    # depth should be one of 20, 32, 44, 56, 110, 1202
    assert (depth - 2) % 6 == 0
    n = (depth - 2) / 6
    nStages = {16, 64, 128}
    conv1 = conv_bn_layer(ipt,
                          ch_in=3,
                          ch_out=16,
                          filter_size=3,
                          stride=1,
                          padding=1)
    res1 = layer_warp(basicblock, conv1, 16, n, 1)
    res2 = layer_warp(basicblock, res1, 32, n, 2)
    res3 = layer_warp(basicblock, res2, 64, n, 2)
    pool = img_pool_layer(input=res3,
                          pool_size=8,
                          stride=1,
                          pool_type=AvgPooling())
    return pool
```
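The depth constraint can be checked with a few lines of Python; each valid depth corresponds to `n = (depth - 2) / 6` basic blocks per `layer_warp` group (for illustration only):
```python
for depth in [20, 32, 44, 56, 110, 1202]:
    assert (depth - 2) % 6 == 0
    n = (depth - 2) // 6
    # e.g. depth 56 -> n = 9 basic blocks in each of the three groups
    print("depth %d -> n = %d" % (depth, n))
```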
## Model Training
Run the script `train.sh` to train the model; it specifies the configuration file, the device type, the number of threads, the total number of passes, the model save path, and so on.
``` bash
sh train.sh
```
The script `train.sh` is as follows:
```bash
#cfg=models/resnet.py
cfg=models/vgg.py
output=output
log=train.log
paddle train \
--config=$cfg \
--use_gpu=true \
--trainer_count=1 \
--log_period=100 \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
```
- `--config=$cfg`: the configuration file, `models/vgg.py` by default.
- `--use_gpu=true`: train on GPU; set to false to train on CPU.
- `--trainer_count=1`: the number of threads or GPUs.
- `--log_period=100`: the number of batches between log outputs.
- `--save_dir=$output`: the directory where models are saved.
A sample log of one pass of training is shown below: after the first pass, the average classification error is 0.79958 on the training set and 0.7858 on the test set.
```text
TrainerInternal.cpp:165] Batch=300 samples=38400 AvgCost=2.07708 CurrentCost=1.96158 Eval: classification_error_evaluator=0.81151 CurrentEval: classification_error_evaluator=0.789297
TrainerInternal.cpp:181] Pass=0 Batch=391 samples=50000 AvgCost=2.03348 Eval: classification_error_evaluator=0.79958
Tester.cpp:115] Test samples=10000 cost=1.99246 Eval: classification_error_evaluator=0.7858
```
Figure 12 shows the classification error curve during training. The model roughly converges after about 200 passes, and the final classification error on the test set is 8.54%.
<p align="center">
<img src="image/plot.png" width="400" ><br/>
Figure 12. Classification error of the VGG model on the CIFAR10 dataset
</p>
## Model Application
After training, the model of each pass is saved under `output/pass-%05d`; for example, the model of pass 300 is saved under `output/pass-00299`. The script `classify.py` can be used to predict the class of an image or to extract features from it; note that it uses `models/vgg.py` as the model configuration by default.
### Prediction
The class of an image can be predicted as follows. Prediction runs on the GPU by default; to use the CPU, append the `-c` flag.
```bash
python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c
```
The prediction result is:
```text
Label of image/dog.png is: 5
```
### Feature Extraction
Features can be extracted from an image as follows. Unlike prediction, the job type is set to extract, and the layer to extract from must be specified. By default `classify.py` extracts the features of the first convolutional layer and draws a visualization like Figure 13. The first convolutional layer of the VGG model has 64 channels, and Figure 13 shows one grayscale map per channel.
```bash
python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c
```
<p align="center">
<img src="image/fea_conv0.png" width="500"><br/>
Figure 13. Visualization of convolutional features
</p>
## Conclusion
Traditional image classification methods consist of multiple stages and a fairly complex pipeline, whereas an end-to-end CNN model does everything in a single step and greatly improves classification accuracy. In this chapter we first introduced three classic models, VGG, GoogleNet and ResNet; then, on the CIFAR10 dataset, we showed how to configure and train CNN models with PaddlePaddle, in particular the VGG and ResNet models; finally we showed how to use PaddlePaddle's API to predict the class of an image and to extract its features. The configuration and training procedure is the same for other datasets such as ImageNet, so readers can experiment with them on their own.
<a name="参考文献"></a>
## References
[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.
[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.
[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.
[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.
[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.
[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).
[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.
[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.
[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.
[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014.
[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)
[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.
[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.
[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016.
[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).
[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).
[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective](http://link.springer.com/article/10.1007/s11263-014-0733-5). International Journal of Computer Vision, 111(1), 98-136, 2015.
[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.
[20] http://deeplearning.net/tutorial/lenet.html
[21] https://www.cs.toronto.edu/~kriz/cifar.html
[22] http://cs231n.github.io/classification/
<br/>
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">This tutorial</span> is created by <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> and is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os, sys
import cPickle
import numpy as np
from PIL import Image
from optparse import OptionParser
import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config
import logging
logging.basicConfig(
format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
logging.getLogger().setLevel(logging.INFO)
def vis_square(data, fname):
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
"""Take an array of shape (n, height, width) or (n, height, width, 3)
and visualize each (height, width) thing in a grid of size approx. sqrt(n) by sqrt(n)"""
# normalize data for display
data = (data - data.min()) / (data.max() - data.min())
# force the number of filters to be square
n = int(np.ceil(np.sqrt(data.shape[0])))
padding = (
((0, n**2 - data.shape[0]), (0, 1),
(0, 1)) # add some space between filters
+ ((0, 0), ) *
(data.ndim - 3)) # don't pad the last dimension (if there is one)
data = np.pad(
data, padding, mode='constant',
constant_values=1) # pad with ones (white)
# tile the filters into an image
data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple(
range(4, data.ndim + 1)))
data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
plt.imshow(data, cmap='gray')
plt.savefig(fname)
plt.axis('off')
class ImageClassifier():
def __init__(self,
train_conf,
resize_dim,
crop_dim,
model_dir=None,
use_gpu=True,
mean_file=None,
oversample=False,
is_color=True):
self.train_conf = train_conf
self.model_dir = model_dir
if model_dir is None:
self.model_dir = os.path.dirname(train_conf)
self.resize_dim = resize_dim
self.crop_dims = [crop_dim, crop_dim]
self.oversample = oversample
self.is_color = is_color
self.transformer = image_util.ImageTransformer(is_color=is_color)
self.transformer.set_transpose((2, 0, 1))
self.transformer.set_channel_swap((2, 1, 0))
self.mean_file = mean_file
if self.mean_file is not None:
mean = np.load(self.mean_file)['mean']
mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
self.transformer.set_mean(mean) # mean pixel
else:
# if you use three mean value, set like:
# this three mean value is calculated from ImageNet.
self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
conf_args = "use_gpu=%d,is_predict=1" % (int(use_gpu))
conf = parse_config(train_conf, conf_args)
swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config)
assert isinstance(self.network, swig_paddle.GradientMachine)
self.network.loadParameters(self.model_dir)
dim = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [dense_vector(dim)]
self.converter = DataProviderConverter(slots)
def get_data(self, img_path):
"""
1. load image from img_path.
2. resize or oversampling.
3. transformer data: transpose, channel swap, sub mean.
return K x H x W ndarray.
img_path: image path.
"""
image = image_util.load_image(img_path, self.is_color)
# Another way to extract oversampled features is that
# cropping and averaging from large feature map which is
# calculated by large size of image.
# This way reduces the computation.
if self.oversample:
image = image_util.resize_image(image, self.resize_dim)
image = np.array(image)
input = np.zeros(
(1, image.shape[0], image.shape[1], 3), dtype=np.float32)
input[0] = image.astype(np.float32)
input = image_util.oversample(input, self.crop_dims)
else:
image = image.resize(self.crop_dims, Image.ANTIALIAS)
input = np.zeros(
(1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
input[0] = np.array(image).astype(np.float32)
data_in = []
for img in input:
img = self.transformer.transformer(img).flatten()
data_in.append([img.tolist()])
return data_in
def forward(self, input_data):
in_arg = self.converter(input_data)
return self.network.forwardTest(in_arg)
def forward(self, data, output_layer):
input = self.converter(data)
self.network.forwardTest(input)
output = self.network.getLayerOutputs(output_layer)
res = {}
if isinstance(output_layer, basestring):
output_layer = [output_layer]
for name in output_layer:
# For oversampling, average predictions across crops.
# If not, the shape of output[name]: (1, class_number),
# the mean is also applicable.
res[name] = output[name].mean(0)
return res
def option_parser():
usage = "%prog -c config -i data_list -w model_dir [options]"
parser = OptionParser(usage="usage: %s" % usage)
parser.add_option(
"--job",
action="store",
dest="job_type",
choices=[
'predict',
'extract',
],
default='predict',
help="The job type. \
predict: predicting,\
extract: extract features")
parser.add_option(
"--conf",
action="store",
dest="train_conf",
default='models/vgg.py',
help="network config")
parser.add_option(
"--data",
action="store",
dest="data_file",
default='image/dog.png',
help="image list")
parser.add_option(
"--model",
action="store",
dest="model_path",
default=None,
help="model path")
parser.add_option(
"-c", dest="cpu_gpu", action="store_false", help="Use cpu mode.")
parser.add_option(
"-g",
dest="cpu_gpu",
default=True,
action="store_true",
help="Use gpu mode.")
parser.add_option(
"--mean",
action="store",
dest="mean",
default='data/mean.meta',
help="The mean file.")
parser.add_option(
"--multi_crop",
action="store_true",
dest="multi_crop",
default=False,
help="Wether to use multiple crops on image.")
return parser.parse_args()
def main():
options, args = option_parser()
mean = 'data/mean.meta' if not options.mean else options.mean
conf = 'models/vgg.py' if not options.train_conf else options.train_conf
obj = ImageClassifier(
conf,
32,
32,
options.model_path,
use_gpu=options.cpu_gpu,
mean_file=mean,
oversample=options.multi_crop)
image_path = options.data_file
if options.job_type == 'predict':
output_layer = '__fc_layer_2__'
data = obj.get_data(image_path)
prob = obj.forward(data, output_layer)
lab = np.argsort(-prob[output_layer])
logging.info("Label of %s is: %d", image_path, lab[0])
elif options.job_type == "extract":
output_layer = '__conv_0__'
data = obj.get_data(options.data_file)
features = obj.forward(data, output_layer)
dshape = (64, 32, 32)
fea = features[output_layer].reshape(dshape)
vis_square(fea, 'fea_conv0.png')
if __name__ == '__main__':
main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import cPickle
DATA = "cifar-10-batches-py"
CHANNEL = 3
HEIGHT = 32
WIDTH = 32
def create_mean(dataset):
if not os.path.isfile("mean.meta"):
mean = np.zeros(CHANNEL * HEIGHT * WIDTH)
num = 0
for f in dataset:
batch = np.load(f)
mean += batch['data'].sum(0)
num += len(batch['data'])
mean /= num
print mean.size
data = {"mean": mean, "size": mean.size}
cPickle.dump(
data, open("mean.meta", 'w'), protocol=cPickle.HIGHEST_PROTOCOL)
def create_data():
train_set = [DATA + "/data_batch_%d" % (i + 1) for i in xrange(0, 5)]
test_set = [DATA + "/test_batch"]
# create mean values
create_mean(train_set)
# create dataset lists
if not os.path.isfile("train.txt"):
train = ["data/" + i for i in train_set]
open("train.txt", "w").write("\n".join(train))
open("train.list", "w").write("\n".join(["data/train.txt"]))
if not os.path.isfile("test.txt"):
test = ["data/" + i for i in test_set]
open("test.txt", "w").write("\n".join(test))
open("test.list", "w").write("\n".join(["data/test.txt"]))
if __name__ == '__main__':
create_data()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
tar zxf cifar-10-python.tar.gz
rm cifar-10-python.tar.gz
python cifar10.py
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import cPickle
from paddle.trainer.PyDataProvider2 import *
def initializer(settings, mean_path, is_train, **kwargs):
settings.is_train = is_train
settings.input_size = 3 * 32 * 32
settings.mean = np.load(mean_path)['mean']
settings.input_types = {
'image': dense_vector(settings.input_size),
'label': integer_value(10)
}
@provider(init_hook=initializer, pool_size=50000)
def process(settings, file_list):
with open(file_list, 'r') as fdata:
for fname in fdata:
fo = open(fname.strip(), 'rb')
batch = cPickle.load(fo)
fo.close()
images = batch['data']
labels = batch['labels']
for im, lab in zip(images, labels):
if settings.is_train and np.random.randint(2):
im = im.reshape(3, 32, 32)
im = im[:, :, ::-1]
im = im.flatten()
im = im - settings.mean
yield {'image': im.astype('float32'), 'label': int(lab)}
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
if not is_predict:
args = {'meta': 'data/mean.meta'}
define_py_data_sources2(
train_list='data/train.list',
test_list='data/test.list',
module='dataprovider',
obj='process',
args={'mean_path': 'data/mean.meta'})
settings(
batch_size=128,
learning_rate=0.1 / 128.0,
learning_rate_decay_a=0.1,
learning_rate_decay_b=50000 * 140,
learning_rate_schedule='discexp',
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0002 * 128))
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
active_type=ReluActivation(),
ch_in=None):
tmp = img_conv_layer(
input=input,
filter_size=filter_size,
num_channels=ch_in,
num_filters=ch_out,
stride=stride,
padding=padding,
act=LinearActivation(),
bias_attr=False)
return batch_norm_layer(input=tmp, act=active_type)
def shortcut(ipt, n_in, n_out, stride):
if n_in != n_out:
print("n_in != n_out")
return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation())
else:
return ipt
def basicblock(ipt, ch_out, stride):
ch_in = ipt.num_filters
tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation())
short = shortcut(ipt, ch_in, ch_out, stride)
return addto_layer(input=[tmp, short], act=ReluActivation())
def bottleneck(ipt, ch_out, stride):
ch_in = ipt.num_filter
tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1)
tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation())
short = shortcut(ipt, ch_in, ch_out * 4, stride)
return addto_layer(input=[tmp, short], act=ReluActivation())
def layer_warp(block_func, ipt, features, count, stride):
tmp = block_func(ipt, features, stride)
for i in range(1, count):
tmp = block_func(tmp, features, 1)
return tmp
def resnet_imagenet(ipt, depth=50):
cfg = {
18: ([2, 2, 2, 1], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
tmp = conv_bn_layer(
ipt, ch_in=3, ch_out=64, filter_size=7, stride=2, padding=3)
tmp = img_pool_layer(input=tmp, pool_size=3, stride=2)
tmp = layer_warp(block_func, tmp, 64, stages[0], 1)
tmp = layer_warp(block_func, tmp, 128, stages[1], 2)
tmp = layer_warp(block_func, tmp, 256, stages[2], 2)
tmp = layer_warp(block_func, tmp, 512, stages[3], 2)
tmp = img_pool_layer(
input=tmp, pool_size=7, stride=1, pool_type=AvgPooling())
tmp = fc_layer(input=tmp, size=1000, act=SoftmaxActivation())
return tmp
def resnet_cifar10(ipt, depth=32):
#depth should be one of 20, 32, 44, 56, 110, 1202
assert (depth - 2) % 6 == 0
n = (depth - 2) / 6
nStages = {16, 64, 128}
conv1 = conv_bn_layer(
ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = img_pool_layer(
input=res3, pool_size=8, stride=1, pool_type=AvgPooling())
return pool
datadim = 3 * 32 * 32
classdim = 10
data = data_layer(name='image', size=datadim)
net = resnet_cifar10(data, depth=32)
out = fc_layer(input=net, size=10, act=SoftmaxActivation())
if not is_predict:
lbl = data_layer(name="label", size=classdim)
outputs(classification_cost(input=out, label=lbl))
else:
outputs(out)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
is_predict = get_config_arg("is_predict", bool, False)
if not is_predict:
define_py_data_sources2(
train_list='data/train.list',
test_list='data/test.list',
module='dataprovider',
obj='process',
args={'mean_path': 'data/mean.meta'})
settings(
batch_size=128,
learning_rate=0.1 / 128.0,
learning_rate_decay_a=0.1,
learning_rate_decay_b=50000 * 100,
learning_rate_schedule='discexp',
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * 128), )
def vgg_bn_drop(input):
def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
return img_conv_group(
input=ipt,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=ReluActivation(),
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type=MaxPooling())
conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = dropout_layer(input=conv5, dropout_rate=0.5)
fc1 = fc_layer(input=drop, size=512, act=LinearActivation())
bn = batch_norm_layer(
input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5))
fc2 = fc_layer(input=bn, size=512, act=LinearActivation())
return fc2
datadim = 3 * 32 * 32
classdim = 10
data = data_layer(name='image', size=datadim)
net = vgg_bn_drop(data)
out = fc_layer(input=net, size=classdim, act=SoftmaxActivation())
if not is_predict:
lbl = data_layer(name="label", size=classdim)
cost = classification_cost(input=out, label=lbl)
outputs(cost)
else:
outputs(out)
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
#cfg=models/resnet.py
cfg=models/vgg.py
output=output
log=train.log
paddle train \
--config=$cfg \
--use_gpu=true \
--trainer_count=1 \
--log_period=100 \
--num_passes=300 \
--save_dir=$output \
2>&1 | tee $log
...@@ -103,35 +103,35 @@ ...@@ -103,35 +103,35 @@
<div class="card-block pl-0 pr-0 pt-0 pb-0"> <div class="card-block pl-0 pr-0 pt-0 pb-0">
<div class="list-group "> <div class="list-group ">
<a href="./fit_a_line/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./01.fit_a_line/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Linear Regression Linear Regression
</a> </a>
<a href="./recognize_digits/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./02.recognize_digits/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Recognize Digits Recognize Digits
</a> </a>
<a href="./image_classification/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./03.image_classification/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Image Classification Image Classification
</a> </a>
<a href="./word2vec/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./04.word2vec/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Word2Vec Word2Vec
</a> </a>
<a href="./understand_sentiment/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./05.understand_sentiment/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Sentiment Analysis Sentiment Analysis
</a> </a>
<a href="./label_semantic_roles/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./06.label_semantic_roles/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Semantic Role Labeling Semantic Role Labeling
</a> </a>
<a href="./machine_translation/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./07.machine_translation/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Machine Translation Machine Translation
</a> </a>
<a href="./recommender_system/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./08.recommender_system/index.en.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
Personalized Recommendation Personalized Recommendation
</a> </a>
...@@ -142,7 +142,7 @@ ...@@ -142,7 +142,7 @@
</div> </div>
</div> </div>
<div class="col"> <div class="col">
<iframe src="./fit_a_line/index.en.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe"> <iframe src="./01.fit_a_line/index.en.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe">
</iframe> </iframe>
</div> </div>
</div> </div>
......
...@@ -3,35 +3,35 @@ ...@@ -3,35 +3,35 @@
"chapters": [ "chapters": [
{ {
"name": "Linear Regression", "name": "Linear Regression",
"link": "./fit_a_line/index.en.html" "link": "./01.fit_a_line/index.en.html"
}, },
{ {
"name": "Recognize Digits", "name": "Recognize Digits",
"link": "./recognize_digits/index.en.html" "link": "./02.recognize_digits/index.en.html"
}, },
{ {
"name": "Image Classification", "name": "Image Classification",
"link": "./image_classification/index.en.html" "link": "./03.image_classification/index.en.html"
}, },
{ {
"name": "Word2Vec", "name": "Word2Vec",
"link": "./word2vec/index.en.html" "link": "./04.word2vec/index.en.html"
}, },
{ {
"name": "Sentiment Analysis", "name": "Sentiment Analysis",
"link": "./understand_sentiment/index.en.html" "link": "./05.understand_sentiment/index.en.html"
}, },
{ {
"name": "Semantic Role Labeling", "name": "Semantic Role Labeling",
"link": "./label_semantic_roles/index.en.html" "link": "./06.label_semantic_roles/index.en.html"
}, },
{ {
"name": "Machine Translation", "name": "Machine Translation",
"link": "./machine_translation/index.en.html" "link": "./07.machine_translation/index.en.html"
}, },
{ {
"name": "Personalized Recommendation", "name": "Personalized Recommendation",
"link": "./recommender_system/index.en.html" "link": "./08.recommender_system/index.en.html"
} }
] ]
} }
...@@ -107,35 +107,35 @@ ...@@ -107,35 +107,35 @@
<div class="card-block pl-0 pr-0 pt-0 pb-0"> <div class="card-block pl-0 pr-0 pt-0 pb-0">
<div class="list-group "> <div class="list-group ">
<a href="./fit_a_line/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./01.fit_a_line/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
新手入门 新手入门
</a> </a>
<a href="./recognize_digits/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./02.recognize_digits/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
识别数字 识别数字
</a> </a>
<a href="./image_classification/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./03.image_classification/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
图像分类 图像分类
</a> </a>
<a href="./word2vec/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./04.word2vec/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
词向量 词向量
</a> </a>
<a href="./understand_sentiment/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./05.understand_sentiment/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
情感分析 情感分析
</a> </a>
<a href="./label_semantic_roles/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./06.label_semantic_roles/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
语义角色标注 语义角色标注
</a> </a>
<a href="./machine_translation/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./07.machine_translation/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
机器翻译 机器翻译
</a> </a>
<a href="./recommender_system/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;"> <a href="./08.recommender_system/index.html" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
个性化推荐 个性化推荐
</a> </a>
...@@ -146,7 +146,7 @@ ...@@ -146,7 +146,7 @@
</div> </div>
</div> </div>
<div class="col"> <div class="col">
<iframe src="./fit_a_line/index.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe"> <iframe src="./01.fit_a_line/index.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe">
</iframe> </iframe>
</div> </div>
</div> </div>
......
...@@ -3,35 +3,35 @@ ...@@ -3,35 +3,35 @@
"chapters": [ "chapters": [
{ {
"name": "新手入门", "name": "新手入门",
"link": "./fit_a_line/index.html" "link": "./01.fit_a_line/index.html"
}, },
{ {
"name": "识别数字", "name": "识别数字",
"link": "./recognize_digits/index.html" "link": "./02.recognize_digits/index.html"
}, },
{ {
"name": "图像分类", "name": "图像分类",
"link": "./image_classification/index.html" "link": "./03.image_classification/index.html"
}, },
{ {
"name": "词向量", "name": "词向量",
"link": "./word2vec/index.html" "link": "./04.word2vec/index.html"
}, },
{ {
"name": "情感分析", "name": "情感分析",
"link": "./understand_sentiment/index.html" "link": "./05.understand_sentiment/index.html"
}, },
{ {
"name": "语义角色标注", "name": "语义角色标注",
"link": "./label_semantic_roles/index.html" "link": "./06.label_semantic_roles/index.html"
}, },
{ {
"name": "机器翻译", "name": "机器翻译",
"link": "./machine_translation/index.html" "link": "./07.machine_translation/index.html"
}, },
{ {
"name": "个性化推荐", "name": "个性化推荐",
"link": "./recommender_system/index.html" "link": "./08.recommender_system/index.html"
} }
] ]
} }
...@@ -118,7 +118,7 @@ ...@@ -118,7 +118,7 @@
</div> </div>
</div> </div>
<div class="col"> <div class="col">
<iframe src="./fit_a_line/index{% if is_en %}.en{% endif %}.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe"> <iframe src="./01.fit_a_line/index{% if is_en %}.en{% endif %}.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe">
</iframe> </iframe>
</div> </div>
</div> </div>
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Personalized Recommendation\n",
"\n",
"The source code of this tutorial is in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system).\n",
"\n",
"For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).\n",
"\n",
"\n",
"## Background\n",
"\n",
"With the fast growth of e-commerce, online videos, and online reading business, users have to rely on recommender systems to avoid manually browsing tremendous volume of choices. Recommender systems understand users' interest by mining user behavior and other properties of users and products.\n",
"\n",
"Some well know approaches include:\n",
"\n",
"- User behavior-based approach. A well-known method is collaborative filtering. The underlying assumption is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.\n",
"\n",
"- Content-based recommendation[[1](#reference)]. This approach infers feature vectors that represent products from their descriptions. It also infers feature vectors that represent users' interests. Then it measures the relevance of users and products by some distances between these feature vectors.\n",
"\n",
"- Hybrid approach[[2](#reference)]: This approach uses the content-based information to help address the cold start problem[[6](#reference)] in behavior-based approach.\n",
"\n",
"Among these options, collaborative filtering might be the most studied one. Some of its variants include user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based.\n",
"\n",
"This tutorial explains a deep learning based approach and how to implement it using PaddlePaddle. We will train a model using a dataset that includes user information, movie information, and ratings. Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.\n",
"\n",
"\n",
"## Model Overview\n",
"\n",
"To know more about deep learning based recommendation, let us start from going over the Youtube recommender system[[7](#参考文献)] before introducing our hybrid model.\n",
"\n",
"\n",
"### YouTube's Deep Learning Recommendation Model\n",
"\n",
"YouTube is a video-sharing Web site with one of the largest user base in the world. Its recommender system serves more than a billion users. This system is composed of two major parts: candidate generation and ranking. The former selects few hundreds of candidates from millions of videos, and the latter ranks and outputs the top 10.\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/YouTube_Overview.en.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
"Figure 1. YouTube recommender system overview.\n",
"\u003c/p\u003e\n",
"\n",
"#### Candidate Generation Network\n",
"\n",
"Youtube models candidate generation as a multiclass classification problem with a huge number of classes equal to the number of videos. The architecture of the model is as follows:\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/Deep_candidate_generation_model_architecture.en.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
"Figure. Deep candidate geeration model.\n",
"\u003c/p\u003e\n",
"\n",
"The first stage of this model maps watching history and search queries into fixed-length representative features. Then, an MLP (multi-layer perceptron, as described in the [Recognize Digits](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md) tutorial) takes the concatenation of all representative vectors. The output of the MLP represents the user' *intrinsic interests*. At training time, it is used together with a softmax output layer for minimizing the classification error. At serving time, it is used to compute the relevance of the user with all movies.\n",
"\n",
"For a user $U$, the predicted watching probability of video $i$ is\n",
"\n",
"$$P(\\omega=i|u)=\\frac{e^{v_{i}u}}{\\sum_{j \\in V}e^{v_{j}u}}$$\n",
"\n",
"where $u$ is the representative vector of user $U$, $V$ is the corpus of all videos, $v_i$ is the representative vector of the $i$-th video. $u$ and $v_i$ are vectors of the same length, so we can compute their dot product using a fully connected layer.\n",
"\n",
"This model could have a performance issue as the softmax output covers millions of classification labels. To optimize performance, at the training time, the authors down-sample negative samples, so the actual number of classes is reduced to thousands. At serving time, the authors ignore the normalization of the softmax outputs, because the results are just for ranking.\n",
"\n",
"\n",
"#### Ranking Network\n",
"\n",
"The architecture of the ranking network is similar to that of the candidate generation network. Similar to ranking models widely used in online advertising, it uses rich features like video ID, last watching time, etc. The output layer of the ranking network is a weighted logistic regression, which rates all candidate videos.\n",
"\n",
"\n",
"### Hybrid Model\n",
"\n",
"In the section, let us introduce our movie recommendation system.\n",
"\n",
"In our network, the input includes features of users and movies. The user feature includes four properties: user ID, gender, occupation, and age. Movie features include their IDs, genres, and titles.\n",
"\n",
"We use fully-connected layers to map user features into representative feature vectors and concatenate them. The process of movie features is similar, except that for movie titles -- we feed titles into a text convolution network as described in the [sentiment analysis tutorial](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md))to get a fixed-length representative feature vector.\n",
"\n",
"Given the feature vectors of users and movies, we compute the relevance using cosine similarity. We minimize the squared error at training time.\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\n",
"\u003cimg src=\"image/rec_regression_network_en.png\" width=\"90%\" \u003e\u003cbr/\u003e\n",
"Figure 3. A hybrid recommendation model.\n",
"\u003c/p\u003e\n",
"\n",
"## Dataset\n",
"\n",
"We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m.zip) to train our model. This dataset includes 10,000 ratings of 4,000 movies from 6,000 users to 4,000 movies. Each rate is in the range of 1~5. Thanks to GroupLens Research for collecting, processing and publishing the dataset.\n",
"\n",
"`paddle.v2.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens` and `wmt14`, etc. There's no need for us to manually download and preprocess `MovieLens` dataset.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Run this block to show dataset's documentation\n",
"help(paddle.v2.dataset.movielens)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"The raw `MoiveLens` contains movie ratings, relevant features from both movies and users.\n",
"For instance, one movie's feature could be:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"movie_info = paddle.dataset.movielens.movie_info()\n",
"print movie_info.values()[0]\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"```text\n",
"\u003cMovieInfo id(1), title(Toy Story), categories(['Animation', \"Children's\", 'Comedy'])\u003e\n",
"```\n",
"\n",
"One user's feature could be:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"user_info = paddle.dataset.movielens.user_info()\n",
"print user_info.values()[0]\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"```text\n",
"\u003cUserInfo id(1), gender(F), age(1), job(10)\u003e\n",
"```\n",
"\n",
"In this dateset, the distribution of age is shown as follows:\n",
"\n",
"```text\n",
"1: \"Under 18\"\n",
"18: \"18-24\"\n",
"25: \"25-34\"\n",
"35: \"35-44\"\n",
"45: \"45-49\"\n",
"50: \"50-55\"\n",
"56: \"56+\"\n",
"```\n",
"\n",
"User's occupation is selected from the following options:\n",
"\n",
"```text\n",
"0: \"other\" or not specified\n",
"1: \"academic/educator\"\n",
"2: \"artist\"\n",
"3: \"clerical/admin\"\n",
"4: \"college/grad student\"\n",
"5: \"customer service\"\n",
"6: \"doctor/health care\"\n",
"7: \"executive/managerial\"\n",
"8: \"farmer\"\n",
"9: \"homemaker\"\n",
"10: \"K-12 student\"\n",
"11: \"lawyer\"\n",
"12: \"programmer\"\n",
"13: \"retired\"\n",
"14: \"sales/marketing\"\n",
"15: \"scientist\"\n",
"16: \"self-employed\"\n",
"17: \"technician/engineer\"\n",
"18: \"tradesman/craftsman\"\n",
"19: \"unemployed\"\n",
"20: \"writer\"\n",
"```\n",
"\n",
"Each record consists of three main components: user features, movie features and movie ratings.\n",
"Likewise, as a simple example, consider the following:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"train_set_creator = paddle.dataset.movielens.train()\n",
"train_sample = next(train_set_creator())\n",
"uid = train_sample[0]\n",
"mov_id = train_sample[len(user_info[uid].value())]\n",
"print \"User %s rates Movie %s with Score %s\"%(user_info[uid], movie_info[mov_id], train_sample[-1])\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"```text\n",
"User \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e rates Movie \u003cMovieInfo id(1193), title(One Flew Over the Cuckoo's Nest), categories(['Drama'])\u003e with Score [5.0]\n",
"```\n",
"\n",
"The output shows that user 1 gave movie `1193` a rating of 5.\n",
"\n",
"After issuing a command `python train.py`, training will start immediately. The details will be unpacked by the following sessions to see how it works.\n",
"\n",
"## Model Architecture\n",
"\n",
"### Initialize PaddlePaddle\n",
"\n",
"First, we must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc).\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"%matplotlib inline\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from IPython import display\n",
"import cPickle\n",
"\n",
"import paddle.v2 as paddle\n",
"\n",
"paddle.init(use_gpu=False)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### Model Configuration\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"uid = paddle.layer.data(\n",
" name='user_id',\n",
" type=paddle.data_type.integer_value(\n",
" paddle.dataset.movielens.max_user_id() + 1))\n",
"usr_emb = paddle.layer.embedding(input=uid, size=32)\n",
"\n",
"usr_gender_id = paddle.layer.data(\n",
" name='gender_id', type=paddle.data_type.integer_value(2))\n",
"usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)\n",
"\n",
"usr_age_id = paddle.layer.data(\n",
" name='age_id',\n",
" type=paddle.data_type.integer_value(\n",
" len(paddle.dataset.movielens.age_table)))\n",
"usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)\n",
"\n",
"usr_job_id = paddle.layer.data(\n",
" name='job_id',\n",
" type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(\n",
" ) + 1))\n",
"usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"As shown in the above code, the input is four dimension integers for each user, that is, `user_id`,`gender_id`, `age_id` and `job_id`. In order to deal with these features conveniently, we use the language model in NLP to transform these discrete values into embedding vaules `usr_emb`, `usr_gender_emb`, `usr_age_emb` and `usr_job_emb`.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"usr_combined_features = paddle.layer.fc(\n",
" input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],\n",
" size=200,\n",
" act=paddle.activation.Tanh())\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Then, employing user features as input, directly connecting to a fully-connected layer, which is used to reduce dimension to 200.\n",
"\n",
"Furthermore, we do a similar transformation for each movie feature. The model configuration is:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"mov_id = paddle.layer.data(\n",
" name='movie_id',\n",
" type=paddle.data_type.integer_value(\n",
" paddle.dataset.movielens.max_movie_id() + 1))\n",
"mov_emb = paddle.layer.embedding(input=mov_id, size=32)\n",
"\n",
"mov_categories = paddle.layer.data(\n",
" name='category_id',\n",
" type=paddle.data_type.sparse_binary_vector(\n",
" len(paddle.dataset.movielens.movie_categories())))\n",
"\n",
"mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)\n",
"\n",
"\n",
"movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()\n",
"mov_title_id = paddle.layer.data(\n",
" name='movie_title',\n",
" type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))\n",
"mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)\n",
"mov_title_conv = paddle.networks.sequence_conv_pool(\n",
" input=mov_title_emb, hidden_size=32, context_len=3)\n",
"\n",
"mov_combined_features = paddle.layer.fc(\n",
" input=[mov_emb, mov_categories_hidden, mov_title_conv],\n",
" size=200,\n",
" act=paddle.activation.Tanh())\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Movie title, a sequence of words represented by an integer word index sequence, will be feed into a `sequence_conv_pool` layer, which will apply convolution and pooling on time dimension. Because pooling is done on time dimension, the output will be a fixed-length vector regardless the length of the input sequence.\n",
"\n",
"Finally, we can use cosine similarity to calculate the similarity between user characteristics and movie features.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)\n",
"cost = paddle.layer.regression_cost(\n",
" input=inference,\n",
" label=paddle.layer.data(\n",
" name='score', type=paddle.data_type.dense_vector(1)))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Model Training\n",
"\n",
"### Define Parameters\n",
"\n",
"First, we define the model parameters according to the previous model configuration `cost`.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Create parameters\n",
"parameters = paddle.parameters.create(cost)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### Create Trainer\n",
"\n",
"Before jumping into creating a training module, algorithm setting is also necessary. Here we specified Adam optimization algorithm via `paddle.optimizer`.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,\n",
" update_equation=paddle.optimizer.Adam(learning_rate=1e-4))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"```text\n",
"[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n",
"[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__]\n",
"```\n",
"\n",
"### Training\n",
"\n",
"`paddle.dataset.movielens.train` will yield records during each pass, after shuffling, a batch input is generated for training.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"reader=paddle.reader.batch(\n",
" paddle.reader.shuffle(\n",
" paddle.dataset.movielens.trai(), buf_size=8192),\n",
" batch_size=256)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"`feeding` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `movielens.train` corresponds to `user_id` feature.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"feeding = {\n",
" 'user_id': 0,\n",
" 'gender_id': 1,\n",
" 'age_id': 2,\n",
" 'job_id': 3,\n",
" 'movie_id': 4,\n",
" 'category_id': 5,\n",
" 'movie_title': 6,\n",
" 'score': 7\n",
"}\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Callback function `event_handler` will be called during training when a pre-defined event happens.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"step=0\n",
"\n",
"train_costs=[],[]\n",
"test_costs=[],[]\n",
"\n",
"def event_handler(event):\n",
" global step\n",
" global train_costs\n",
" global test_costs\n",
" if isinstance(event, paddle.event.EndIteration):\n",
" need_plot = False\n",
" if step % 10 == 0: # every 10 batches, record a train cost\n",
" train_costs[0].append(step)\n",
" train_costs[1].append(event.cost)\n",
"\n",
" if step % 1000 == 0: # every 1000 batches, record a test cost\n",
" result = trainer.test(reader=paddle.batch(\n",
" paddle.dataset.movielens.test(), batch_size=256))\n",
" test_costs[0].append(step)\n",
" test_costs[1].append(result.cost)\n",
"\n",
" if step % 100 == 0: # every 100 batches, update cost plot\n",
" plt.plot(*train_costs)\n",
" plt.plot(*test_costs)\n",
" plt.legend(['Train Cost', 'Test Cost'], loc='upper left')\n",
" display.clear_output(wait=True)\n",
" display.display(plt.gcf())\n",
" plt.gcf().clear()\n",
" step += 1\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Finally, we can invoke `trainer.train` to start training:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"trainer.train(\n",
" reader=reader,\n",
" event_handler=event_handler,\n",
" feeding=feeding,\n",
" num_passes=200)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
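{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"After training for a few passes, the learned `parameters` can be used to score an arbitrary (user, movie) pair. The following is a minimal sketch of how inference might look with `paddle.infer`; the user and movie IDs are arbitrary examples, and the rescaling `(prediction + 5) / 2` simply maps the cosine-similarity output (scaled by 5) into the [0, 5] range:\n",
"\n",
"```python\n",
"import copy\n",
"\n",
"user_id = 234    # arbitrary example IDs, assumed to exist in the dataset\n",
"movie_id = 345\n",
"\n",
"feature = user_info[user_id].value() + movie_info[movie_id].value()\n",
"\n",
"infer_dict = copy.copy(feeding)\n",
"del infer_dict['score']  # no label is fed at inference time\n",
"\n",
"prediction = paddle.infer(output=inference, parameters=parameters,\n",
"                          input=[feature], feeding=infer_dict)\n",
"score = (prediction[0][0] + 5.0) / 2  # map the [-5, 5] output to [0, 5]\n",
"print \"[Predict] User %d Rating Movie %d With Score %.2f\" % (user_id, movie_id, score)\n",
"```\n"
]
},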
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## Conclusion\n",
"\n",
"This tutorial goes over traditional approaches in recommender system and a deep learning based approach. We also show that how to train and use the model with PaddlePaddle. Deep learning has been well used in computer vision and NLP, we look forward to its new successes in recommender systems.\n",
"\n",
"## Reference\n",
"\n",
"1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.\n",
"2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2.\n",
"3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186.\n",
"4. Sarwar, Badrul, et al. \"[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)\" *Proceedings of the 10th International Conference on World Wide Web*. ACM, 2001.\n",
"5. Kautz, Henry, Bart Selman, and Mehul Shah. \"[Referral Web: Combining Social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)\" Communications of the ACM 40.3 (1997): 63-65. APA\n",
"6. Yuan, Jianbo, et al. [\"Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach.\"](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).\n",
"7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198.\n",
"\n",
"\u003cbr/\u003e\n",
"This tutorial is contributed by \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e, and licensed under a \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003eCreative Commons Attribution-NonCommercial-ShareAlike 4.0 International License\u003c/a\u003e.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 个性化推荐\n",
"\n",
"本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。\n",
"\n",
"## 背景介绍\n",
"\n",
"在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。\n",
"\n",
"个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。\n",
"\n",
"传统的推荐系统方法主要有:\n",
"\n",
"- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。\n",
"- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。\n",
"- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。\n",
"\n",
"其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。\n",
"\n",
"深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。\n",
"\n",
"## 效果展示\n",
"\n",
"我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[1,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。\n",
"\n",
"```\n",
"Input movie_id: 1962\n",
"Input user_id: 1\n",
"Prediction Score is 4.25\n",
"```\n",
"\n",
"## 模型概览\n",
"\n",
"本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。\n",
"\n",
"### YouTube的深度神经网络推荐系统\n",
"\n",
"YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示:\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/YouTube_Overview.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
"图1. YouTube 推荐系统结构\n",
"\u003c/p\u003e\n",
"\n",
"#### 候选生成网络(Candidate Generation Network)\n",
"\n",
"候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个Youtube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。\n",
"\n",
"首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\u003cimg src=\"image/Deep_candidate_generation_model_architecture.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
"图2. 候选生成网络结构\n",
"\u003c/p\u003e\n",
"\n",
"对于一个用户$U$,预测此刻用户要观看的视频$\\omega$为视频$i$的概率公式为:\n",
"\n",
"$$P(\\omega=i|u)=\\frac{e^{v_{i}u}}{\\sum_{j \\in V}e^{v_{j}u}}$$\n",
"\n",
"其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。\n",
"\n",
"考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。\n",
"\n",
"#### 排序网络(Ranking Network)\n",
"排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到底排序后将分数较高的一些视频返回给用户。\n",
"\n",
"### 融合推荐模型\n",
"\n",
"在下文的电影推荐系统中:\n",
"\n",
"1. 首先,使用用户特征和电影特征作为神经网络的输入,其中:\n",
"\n",
" - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。\n",
"\n",
" - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。\n",
"\n",
"2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。\n",
"\n",
"3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络(详见[第5章](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md))得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。\n",
"\n",
"4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。\n",
"\n",
"\u003cp align=\"center\"\u003e\n",
"\n",
"\u003cimg src=\"image/rec_regression_network.png\" width=\"90%\" \u003e\u003cbr/\u003e\n",
"图3. 融合推荐模型\n",
"\u003c/p\u003e\n",
"\n",
"## 数据准备\n",
"\n",
"### 数据介绍与下载\n",
"\n",
"我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。\n",
"\n",
"Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens`\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"import paddle.v2 as paddle\n",
"paddle.init(use_gpu=False)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"# Run this block to show dataset's documentation\n",
"# help(paddle.dataset.movielens)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。\n",
"\n",
"例如,其中某一个电影特征为:\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"movie_info = paddle.dataset.movielens.movie_info()\n",
"print movie_info.values()[0]\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" \u003cMovieInfo id(1), title(Toy Story ), categories(['Animation', \"Children's\", 'Comedy'])\u003e\n",
"\n",
"\n",
"这表示,电影的id是1,标题是《Toy Story》,该电影被分为到三个类别中。这三个类别是动画,儿童,喜剧。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"user_info = paddle.dataset.movielens.user_info()\n",
"print user_info.values()[0]\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e\n",
"\n",
"\n",
"这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。\n",
"\n",
"\n",
"其中,年龄使用下列分布\n",
"* 1: \"Under 18\"\n",
"* 18: \"18-24\"\n",
"* 25: \"25-34\"\n",
"* 35: \"35-44\"\n",
"* 45: \"45-49\"\n",
"* 50: \"50-55\"\n",
"* 56: \"56+\"\n",
"\n",
"职业是从下面几种选项里面选则得出:\n",
"* 0: \"other\" or not specified\n",
"* 1: \"academic/educator\"\n",
"* 2: \"artist\"\n",
"* 3: \"clerical/admin\"\n",
"* 4: \"college/grad student\"\n",
"* 5: \"customer service\"\n",
"* 6: \"doctor/health care\"\n",
"* 7: \"executive/managerial\"\n",
"* 8: \"farmer\"\n",
"* 9: \"homemaker\"\n",
"* 10: \"K-12 student\"\n",
"* 11: \"lawyer\"\n",
"* 12: \"programmer\"\n",
"* 13: \"retired\"\n",
"* 14: \"sales/marketing\"\n",
"* 15: \"scientist\"\n",
"* 16: \"self-employed\"\n",
"* 17: \"technician/engineer\"\n",
"* 18: \"tradesman/craftsman\"\n",
"* 19: \"unemployed\"\n",
"* 20: \"writer\"\n",
"\n",
"而对于每一条训练/测试数据,均为 \u003c用户特征\u003e + \u003c电影特征\u003e + 评分。\n",
"\n",
"例如,我们获得第一条训练数据:\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"train_set_creator = paddle.dataset.movielens.train()\n",
"train_sample = next(train_set_creator())\n",
"uid = train_sample[0]\n",
"mov_id = train_sample[len(user_info[uid].value())]\n",
"print \"User %s rates Movie %s with Score %s\"%(user_info[uid], movie_info[mov_id], train_sample[-1])\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" User \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e rates Movie \u003cMovieInfo id(1193), title(One Flew Over the Cuckoo's Nest ), categories(['Drama'])\u003e with Score [5.0]\n",
"\n",
"\n",
"即用户1对电影1193的评价为5分。\n",
"\n",
"## 模型配置说明\n",
"\n",
"下面我们开始根据输入数据的形式配置模型。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"uid = paddle.layer.data(\n",
" name='user_id',\n",
" type=paddle.data_type.integer_value(\n",
" paddle.dataset.movielens.max_user_id() + 1))\n",
"usr_emb = paddle.layer.embedding(input=uid, size=32)\n",
"\n",
"usr_gender_id = paddle.layer.data(\n",
" name='gender_id', type=paddle.data_type.integer_value(2))\n",
"usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)\n",
"\n",
"usr_age_id = paddle.layer.data(\n",
" name='age_id',\n",
" type=paddle.data_type.integer_value(\n",
" len(paddle.dataset.movielens.age_table)))\n",
"usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)\n",
"\n",
"usr_job_id = paddle.layer.data(\n",
" name='job_id',\n",
" type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(\n",
" ) + 1))\n",
"usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"如上述代码所示,对于每个用户,我们输入4维特征。其中包括`user_id`,`gender_id`,`age_id`,`job_id`。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成`usr_emb`, `usr_gender_emb`, `usr_age_emb`, `usr_job_emb`。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"usr_combined_features = paddle.layer.fc(\n",
" input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],\n",
" size=200,\n",
" act=paddle.activation.Tanh())\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。\n",
"\n",
"进而,我们对每一个电影特征做类似的变换,网络配置为:\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"mov_id = paddle.layer.data(\n",
" name='movie_id',\n",
" type=paddle.data_type.integer_value(\n",
" paddle.dataset.movielens.max_movie_id() + 1))\n",
"mov_emb = paddle.layer.embedding(input=mov_id, size=32)\n",
"\n",
"mov_categories = paddle.layer.data(\n",
" name='category_id',\n",
" type=paddle.data_type.sparse_binary_vector(\n",
" len(paddle.dataset.movielens.movie_categories())))\n",
"\n",
"mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)\n",
"\n",
"\n",
"movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()\n",
"mov_title_id = paddle.layer.data(\n",
" name='movie_title',\n",
" type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))\n",
"mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)\n",
"mov_title_conv = paddle.networks.sequence_conv_pool(\n",
" input=mov_title_emb, hidden_size=32, context_len=3)\n",
"\n",
"mov_combined_features = paddle.layer.fc(\n",
" input=[mov_emb, mov_categories_hidden, mov_title_conv],\n",
" size=200,\n",
" act=paddle.activation.Tanh())\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"电影ID和电影类型分别映射到其对应的特征隐层。对于电影标题名称(title),一个ID序列表示的词语序列,在输入卷积层后,将得到每个时间窗口的特征(序列特征),然后通过在时间维度降采样得到固定维度的特征,整个过程在sequence_conv_pool实现。\n",
"\n",
"最后再将电影的特征融合进`mov_combined_features`中。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"进而,我们使用余弦相似度计算用户特征与电影特征的相似性。并将这个相似性拟合(回归)到用户评分上。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"cost = paddle.layer.regression_cost(\n",
" input=inference,\n",
" label=paddle.layer.data(\n",
" name='score', type=paddle.data_type.dense_vector(1)))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"至此,我们的优化目标就是这个网络配置中的`cost`了。\n",
"\n",
"## 训练模型\n",
"\n",
"### 定义参数\n",
"神经网络的模型,我们可以简单的理解为网络拓朴结构+参数。之前一节,我们定义出了优化目标`cost`。这个`cost`即为网络模型的拓扑结构。我们开始训练模型,需要先定义出参数。定义方法为:\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"parameters = paddle.parameters.create(cost)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" [INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n",
" [INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__regression_cost_0__]\n",
"\n",
"\n",
"`parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"print parameters.keys()\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" [u'___fc_layer_2__.wbias', u'___fc_layer_2__.w2', u'___embedding_layer_3__.w0', u'___embedding_layer_5__.w0', u'___embedding_layer_2__.w0', u'___embedding_layer_1__.w0', u'___fc_layer_1__.wbias', u'___fc_layer_0__.wbias', u'___fc_layer_1__.w0', u'___fc_layer_0__.w2', u'___fc_layer_0__.w3', u'___fc_layer_0__.w0', u'___fc_layer_0__.w1', u'___fc_layer_2__.w1', u'___fc_layer_2__.w0', u'___embedding_layer_4__.w0', u'___sequence_conv_pool_0___conv_fc.w0', u'___embedding_layer_0__.w0', u'___sequence_conv_pool_0___conv_fc.wbias']\n",
"\n",
"\n",
"### 构造训练(trainer)\n",
"\n",
"下面,我们根据网络拓扑结构和模型参数来构造出一个本地训练(trainer)。在构造本地训练的时候,我们还需要指定这个训练的优化方法。这里我们使用Adam来作为优化算法。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,\n",
" update_equation=paddle.optimizer.Adam(learning_rate=1e-4))\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n",
" [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__regression_cost_0__]\n",
"\n",
"\n",
"### 训练\n",
"\n",
"下面我们开始训练过程。\n",
"\n",
"我们直接使用Paddle提供的数据集读取程序。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和预测数据集。并且通过`reader_dict`来指定每一个数据和data_layer的对应关系。\n",
"\n",
"例如,这里的reader_dict表示的是,对于数据层 `user_id`,使用了reader中每一条数据的第0个元素。`gender_id`数据层使用了第1个元素。以此类推。\n",
"\n",
"训练过程是完全自动的。我们可以使用event_handler来观察训练过程,或进行测试等。这里我们在event_handler里面绘制了训练误差曲线和测试误差曲线。并且保存了模型。\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"%matplotlib inline\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from IPython import display\n",
"import cPickle\n",
"\n",
"feeding = {\n",
" 'user_id': 0,\n",
" 'gender_id': 1,\n",
" 'age_id': 2,\n",
" 'job_id': 3,\n",
" 'movie_id': 4,\n",
" 'category_id': 5,\n",
" 'movie_title': 6,\n",
" 'score': 7\n",
"}\n",
"\n",
"step=0\n",
"\n",
"train_costs=[],[]\n",
"test_costs=[],[]\n",
"\n",
"def event_handler(event):\n",
" global step\n",
" global train_costs\n",
" global test_costs\n",
" if isinstance(event, paddle.event.EndIteration):\n",
" need_plot = False\n",
" if step % 10 == 0: # every 10 batches, record a train cost\n",
" train_costs[0].append(step)\n",
" train_costs[1].append(event.cost)\n",
"\n",
" if step % 1000 == 0: # every 1000 batches, record a test cost\n",
" result = trainer.test(reader=paddle.batch(\n",
" paddle.dataset.movielens.test(), batch_size=256))\n",
" test_costs[0].append(step)\n",
" test_costs[1].append(result.cost)\n",
"\n",
" if step % 100 == 0: # every 100 batches, update cost plot\n",
" plt.plot(*train_costs)\n",
" plt.plot(*test_costs)\n",
" plt.legend(['Train Cost', 'Test Cost'], loc='upper left')\n",
" display.clear_output(wait=True)\n",
" display.display(plt.gcf())\n",
" plt.gcf().clear()\n",
" step += 1\n",
"\n",
"trainer.train(\n",
" reader=paddle.batch(\n",
" paddle.reader.shuffle(\n",
" paddle.dataset.movielens.train(), buf_size=8192),\n",
" batch_size=256),\n",
" event_handler=event_handler,\n",
" feeding=feeding,\n",
" num_passes=2)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"![png](./image/output_32_0.png)\n",
"\n",
"## 应用模型\n",
"\n",
"在训练了几轮以后,您可以对模型进行推断。我们可以使用任意一个用户ID和电影ID,来预测该用户对该电影的评分。示例程序为:\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"editable": true
},
"source": [
"import copy\n",
"user_id = 234\n",
"movie_id = 345\n",
"\n",
"user = user_info[user_id]\n",
"movie = movie_info[movie_id]\n",
"\n",
"feature = user.value() + movie.value()\n",
"\n",
"infer_dict = copy.copy(feeding)\n",
"del infer_dict['score']\n",
"\n",
"prediction = paddle.infer(output=inference, parameters=parameters, input=[feature], feeding=infer_dict)\n",
"score = (prediction[0][0] + 5.0) / 2\n",
"print \"[Predict] User %d Rating Movie %d With Score %.2f\"%(user_id, movie_id, score)\n"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"execution_count": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
" [INFO 2017-03-06 17:17:08,132 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title]\n",
" [INFO 2017-03-06 17:17:08,134 networks.py:1478] The output order is [__cos_sim_0__]\n",
"\n",
"\n",
" [Predict] User 234 Rating Movie 345 With Score 4.16\n",
"\n",
"\n",
"## 总结\n",
"\n",
"本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。\n",
"\n",
"## 参考文献\n",
"\n",
"1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.\n",
"2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2.\n",
"3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186.\n",
"4. Sarwar, Badrul, et al. \"[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)\" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001.\n",
"5. Kautz, Henry, Bart Selman, and Mehul Shah. \"[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)\" Communications of the ACM 40.3 (1997): 63-65. APA\n",
"6. Yuan, Jianbo, et al. [\"Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach.\"](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).\n",
"7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198.\n",
"\n",
"\u003cbr/\u003e\n",
"\u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e\u003cimg alt=\"知识共享许可协议\" style=\"border-width:0\" src=\"https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png\" /\u003e\u003c/a\u003e\u003cbr /\u003e\u003cspan xmlns:dct=\"http://purl.org/dc/terms/\" href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" rel=\"dct:type\"\u003e本教程\u003c/span\u003e 由 \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e 创作,采用 \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议\u003c/a\u003e进行许可。\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}