diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..94143827ed065ca0d7d5be1b765d255c5c32cd9a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+Dockerfile
diff --git a/.gitignore b/.gitignore
index db437f8ac27fcfb4e18e4d71ec0f8b18c90fe1a3..e7f8501f2c04d0ddb9a27202b3e91d33c47d9de8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ pandoc.template
 .DS_Store
 .idea
 py_env*
+*.ipynb
+build
diff --git a/.tools/build_docker.sh b/.tools/build_docker.sh
index 55faa715a4fbbda97d8ff6422fb2e3d7fe8668fc..0b7735967ff2aae90a3f6597917e51517a8eba44 100755
--- a/.tools/build_docker.sh
+++ b/.tools/build_docker.sh
@@ -5,36 +5,39 @@ cd $cur_path/../
 #convert md to ipynb
 .tools/convert-markdown-into-ipynb-and-test.sh
 
-paddle_version=0.10.0rc2
+paddle_tag=0.10.0rc2
+book_tag=latest
 
 #generate docker file
 if [ ${USE_UBUNTU_REPO_MIRROR} ]; then
-  UPDATE_MIRROR_CMD="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
+  update_mirror_cmd="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
 else
-  UPDATE_MIRROR_CMD="\\"
+  update_mirror_cmd="\\"
 fi
 
-mkdir -p build
-cat > build/Dockerfile <<EOF
-FROM paddlepaddle/paddle:${paddle_version}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+#build docker image
+echo "paddle_tag:"$paddle_tag
+echo "book_tag:"$book_tag
 
-RUN ${UPDATE_MIRROR_CMD}
-    apt-get install locales
-RUN localedef -f UTF-8 -i en_US en_US.UTF-8
+cat > Dockerfile <<EOF
+FROM paddlepaddle/paddle:${paddle_tag}
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
-RUN apt-get -y install gcc && \
-    apt-get -y clean
+COPY . /book
 
-RUN pip install -U matplotlib jupyter numpy requests scipy
+RUN pip install -U nltk \
+    && python /book/.tools/cache_dataset.py
 
-COPY . /book
-RUN rm -rf /book/build
+RUN ${update_mirror_cmd}
+    apt-get update && \
+    apt-get install -y locales && \
+    apt-get -y install gcc && \
+    apt-get -y clean && \
+    localedef -f UTF-8 -i en_US en_US.UTF-8 && \
+    pip install -U matplotlib jupyter numpy requests scipy
 
 EXPOSE 8888
 CMD ["sh", "-c", "jupyter notebook --ip=0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.disable_check_xsrf=True /book/"]
 EOF
 
-#build docker image
-echo "paddle_version:"$paddle_version
-docker build --no-cache -t paddlepaddle/book:${paddle_version} -t paddlepaddle/book:latest -f ./build/Dockerfile .
+docker build --no-cache -t paddlepaddle/book:${paddle_tag} -t paddlepaddle/book:${book_tag} .
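As a usage sketch of the rewritten `build_docker.sh` (assumptions: Docker and Go are installed; the exact `docker run` flags are illustrative, while the `USE_UBUNTU_REPO_MIRROR` switch, the `paddlepaddle/book` tags, and port 8888 all come from the script above):

```sh
# Optionally route apt through an Ubuntu mirror inside the generated Dockerfile,
# then convert the markdown chapters to notebooks and build the image.
USE_UBUNTU_REPO_MIRROR=1 .tools/build_docker.sh

# Smoke-test the freshly built image: the Dockerfile EXPOSEs 8888 and its CMD
# starts a Jupyter notebook server over the /book directory.
docker run --rm -p 8888:8888 paddlepaddle/book:latest
```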
diff --git a/.tools/cache_dataset.py b/.tools/cache_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..0c552f84f635c8beb4538c08af153f76e5c5cff5
--- /dev/null
+++ b/.tools/cache_dataset.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+import paddle.v2.dataset as dataset
+import nltk
+
+# Cache cifar
+dataset.common.download(dataset.cifar.CIFAR100_URL, 'cifar',
+                        dataset.cifar.CIFAR100_MD5)
+dataset.common.download(dataset.cifar.CIFAR10_URL, 'cifar',
+                        dataset.cifar.CIFAR10_MD5)
+
+# Cache conll05
+dataset.common.download(dataset.conll05.WORDDICT_URL, 'conll05st',
+                        dataset.conll05.WORDDICT_MD5)
+dataset.common.download(dataset.conll05.VERBDICT_URL, 'conll05st',
+                        dataset.conll05.VERBDICT_MD5)
+dataset.common.download(dataset.conll05.TRGDICT_URL, 'conll05st',
+                        dataset.conll05.TRGDICT_MD5)
+
+# Cache imdb
+dataset.common.download(dataset.imdb.URL, "imdb", dataset.imdb.MD5)
+
+# Cache imikolov
+dataset.common.download(dataset.imikolov.URL, "imikolov", dataset.imikolov.MD5)
+
+# Cache movielens
+dataset.common.download('http://files.grouplens.org/datasets/movielens/ml-1m.zip',
+                        'movielens', 'c4d9eecfca2ab87c1945afe126590906')
+
+# Cache nltk
+nltk.download('movie_reviews', download_dir=dataset.common.DATA_HOME)
+
+# Cache uci housing
+dataset.common.download(dataset.uci_housing.URL, "uci_housing",
+                        dataset.uci_housing.MD5)
+
+# Cache wmt14
+dataset.common.download(dataset.wmt14.URL_TRAIN, "wmt14",
+                        dataset.wmt14.MD5_TRAIN)
diff --git a/.tools/convert-markdown-into-ipynb-and-test.sh b/.tools/convert-markdown-into-ipynb-and-test.sh
index 710b61957a82c033531d267aaccafcc616e6e46d..a038a2bc1df218d5fb22e0082cd511e991acb125 100755
--- a/.tools/convert-markdown-into-ipynb-and-test.sh
+++ b/.tools/convert-markdown-into-ipynb-and-test.sh
@@ -5,14 +5,14 @@
 if [ $? -ne 0 ]; then
   exit 1
 fi
 
-GOPATH=~/.go go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
+export GOPATH=~/go; go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
 
 cur_path="$(cd "$(dirname "$0")" && pwd -P)"
 cd $cur_path/../
 
 #convert md to ipynb
 for file in */{README,README\.en}.md ; do
-    /tmp/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
+    ~/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
     if [ $?
-ne 0 ]; then echo >&2 "markdown-to-ipynb $file error" exit 1 diff --git a/03.image_classification/README.ipynb b/03.image_classification/README.ipynb deleted file mode 100644 index e542f32c1fedba5ccd12cc04ad053aa5df4a0dec..0000000000000000000000000000000000000000 --- a/03.image_classification/README.ipynb +++ /dev/null @@ -1,877 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 图像分类\n", - "\n", - "本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。\n", - "\n", - "## 背景介绍\n", - "\n", - "图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。\n", - "\n", - "图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。\n", - "\n", - "\n", - "一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。\n", - "\n", - "而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于\"输入-输出\"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。\n", - "\n", - "本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。\n", - "\n", - "## 效果展示\n", - "\n", - "图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/dog_cat.png \" width=\"350\" \u003e\u003cbr/\u003e\n", - "图1. 通用图像分类展示\n", - "\u003c/p\u003e\n", - "\n", - "\n", - "图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。\n", - "\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/flowers.png\" width=\"400\" \u003e\u003cbr/\u003e\n", - "图2. 细粒度图像分类展示\n", - "\u003c/p\u003e\n", - "\n", - "\n", - "一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/variations.png\" width=\"550\" \u003e\u003cbr/\u003e\n", - "图3. 扰动图片展示[22]\n", - "\u003c/p\u003e\n", - "\n", - "## 模型概览\n", - "\n", - "图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。\n", - "\n", - "在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。\n", - " 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \\[[1](#参考文献)\\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \\[[2](#参考文献)\\]、LBP(Local Bianray Pattern, 局部二值模式) \\[[3](#参考文献)\\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。\n", - " 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \\[[4](#参考文献)\\]、稀疏编码 \\[[5](#参考文献)\\]、局部线性约束编码 \\[[6](#参考文献)\\]、Fisher向量编码 \\[[7](#参考文献)\\] 等。\n", - " 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。\n", - " 4). 
**通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。\n", - "\n", - "这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \\[[18](#参考文献)\\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \\[[8](#参考文献)\\]。\n", - "\n", - "Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \\[[9](#参考文献)\\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/ilsvrc.png\" width=\"500\" \u003e\u003cbr/\u003e\n", - "图4. ILSVRC图像分类Top-5错误率\n", - "\u003c/p\u003e\n", - "\n", - "### CNN\n", - "\n", - "传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/lenet.png\"\u003e\u003cbr/\u003e\n", - "图5. CNN网络示例[20]\n", - "\u003c/p\u003e\n", - "\n", - "- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。\n", - "- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。\n", - "- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。\n", - "- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。\n", - "- Dropout \\[[10](#参考文献)\\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。\n", - "\n", - "另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \\[[14](#参考文献)\\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。\n", - "\n", - "接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。\n", - "\n", - "### VGG\n", - "\n", - "牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \\[[11](#参考文献)\\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\\[[19](#参考文献)\\]就是借鉴VGG模型的结构。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/vgg16.png\" width=\"750\" \u003e\u003cbr/\u003e\n", - "图6. 基于ImageNet的VGG16模型\n", - "\u003c/p\u003e\n", - "\n", - "### GoogleNet\n", - "\n", - "GoogleNet \\[[12](#参考文献)\\] 在2014年ILSVRC的获得了冠军,在介绍该模型之前我们先来了解NIN(Network in Network)模型 \\[[13](#参考文献)\\] 和Inception模块,因为GoogleNet模型由多组Inception模块组成,模型设计借鉴了NIN的一些思想。\n", - "\n", - "NIN模型主要有两个特点:1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络,即在线性卷积后面增加若干层1x1的卷积,这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层,参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图,然后采用全局均值池化(Avg-Pooling)替代全连接层,得到类别维度大小的向量,再进行分类。这种替代全连接层的方式有利于减少参数。\n", - "\n", - "Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/inception.png\" width=\"800\" \u003e\u003cbr/\u003e\n", - "图7. 
Inception模块\n", - "\u003c/p\u003e\n", - "\n", - "GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没有采用传统的多层全连接层,而是像NIN网络一样采用了均值池化层;但与NIN不同的是,池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外,由于网络中间层特征也很有判别性,GoogleNet在中间层添加了两个辅助分类器,在后向传播中增强梯度并且增强正则化,而整个网络的损失函数是这个三个分类器的损失加权求和。\n", - "\n", - "GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/googlenet.jpeg\" \u003e\u003cbr/\u003e\n", - "图8. GoogleNet[12]\n", - "\u003c/p\u003e\n", - "\n", - "\n", - "上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \\[[14](#参考文献)\\] 引入BN层;GoogleNet-v3 \\[[16](#参考文献)\\] 对一些卷积层做了分解,进一步提高网络非线性能力和加深网络;GoogleNet-v4 \\[[17](#参考文献)\\] 引入下面要讲的ResNet设计思路。从v1到v4每一版的改进都会带来准确度的提升,介于篇幅,这里不再详细介绍v2到v4的结构。\n", - "\n", - "\n", - "### ResNet\n", - "\n", - "ResNet(Residual Network) \\[[15](#参考文献)\\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题,ResNet提出了采用残差学习。在已有设计思路(BN, 小卷积核,全卷积网络)的基础上,引入了残差模块。每个残差模块包含两条路径,其中一条路径是输入特征的直连通路,另一条路径对该特征做两到三次卷积操作得到该特征的残差,最后再将两条路径上的特征相加。\n", - "\n", - "残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256-\u003e64),下面的1x1卷积用来升维(图示例即64-\u003e256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64-\u003e64)。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/resnet_block.jpg\" width=\"400\"\u003e\u003cbr/\u003e\n", - "图9. 残差模块\n", - "\u003c/p\u003e\n", - "\n", - "图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/resnet.png\"\u003e\u003cbr/\u003e\n", - "图10. 基于ImageNet的ResNet模型\n", - "\u003c/p\u003e\n", - "\n", - "\n", - "## 数据准备\n", - "\n", - "通用图像分类公开的标准数据集常用的有[CIFAR](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。\n", - "\n", - "由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](\u003chttps://www.cs.toronto.edu/~kriz/cifar.html\u003e)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/cifar.png\" width=\"350\"\u003e\u003cbr/\u003e\n", - "图11. 
CIFAR10数据集[21]\n", - "\u003c/p\u003e\n", - "\n", - "Paddle API提供了自动加载cifar数据集模块 `paddle.dataset.cifar`。\n", - "\n", - "通过输入`python train.py`,就可以开始训练模型了,以下小节将详细介绍`train.py`的相关内容。\n", - "\n", - "### 模型结构\n", - "\n", - "#### Paddle 初始化\n", - "\n", - "通过 `paddle.init`,初始化Paddle是否使用GPU,trainer的数目等等。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "import sys\n", - "import paddle.v2 as paddle\n", - "from vgg import vgg_bn_drop\n", - "from resnet import resnet_cifar10\n", - "\n", - "# PaddlePaddle init\n", - "paddle.init(use_gpu=False, trainer_count=1)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "本教程中我们提供了VGG和ResNet两个模型的配置。\n", - "\n", - "#### VGG\n", - "\n", - "首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。\n", - "\n", - "1. 定义数据输入及其维度\n", - "\n", - " 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - " datadim = 3 * 32 * 32\n", - " classdim = 10\n", - "\n", - " image = paddle.layer.data(\n", - " name=\"image\", type=paddle.data_type.dense_vector(datadim))\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "2. 定义VGG网络核心模块\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - " net = vgg_bn_drop(image)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - " def vgg_bn_drop(input):\n", - " def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):\n", - " return paddle.networks.img_conv_group(\n", - " input=ipt,\n", - " num_channels=num_channels,\n", - " pool_size=2,\n", - " pool_stride=2,\n", - " conv_num_filter=[num_filter] * groups,\n", - " conv_filter_size=3,\n", - " conv_act=paddle.activation.Relu(),\n", - " conv_with_batchnorm=True,\n", - " conv_batchnorm_drop_rate=dropouts,\n", - " pool_type=paddle.pooling.Max())\n", - "\n", - " conv1 = conv_block(input, 64, 2, [0.3, 0], 3)\n", - " conv2 = conv_block(conv1, 128, 2, [0.4, 0])\n", - " conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])\n", - " conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])\n", - " conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])\n", - "\n", - " drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)\n", - " fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())\n", - " bn = paddle.layer.batch_norm(\n", - " input=fc1,\n", - " act=paddle.activation.Relu(),\n", - " layer_attr=paddle.attr.Extra(drop_rate=0.5))\n", - " fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())\n", - " return fc2\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " 2.1. 
首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv-\u003eBN-\u003eReLu-\u003eDropout` 和 一组 `Pooling` 组成,\n", - "\n", - " 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。\n", - "\n", - " 2.3. 最后接两层512维的全连接。\n", - "\n", - "3. 定义分类器\n", - "\n", - " 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - " out = paddle.layer.fc(input=net,\n", - " size=classdim,\n", - " act=paddle.activation.Softmax())\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "4. 定义损失函数和网络输出\n", - "\n", - " 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - " lbl = paddle.layer.data(\n", - " name=\"label\", type=paddle.data_type.integer_value(classdim))\n", - " cost = paddle.layer.classification_cost(input=out, label=lbl)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### ResNet\n", - "\n", - "ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "net = resnet_cifar10(image, depth=56)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。\n", - "\n", - " - `conv_bn_layer` : 带BN的卷积层。\n", - " - `shortcut` : 残差模块的\"直连\"路径,\"直连\"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。\n", - " - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条\"直连\"路径组成。\n", - " - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条\"直连\"路径组成。\n", - " - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "def conv_bn_layer(input,\n", - " ch_out,\n", - " filter_size,\n", - " stride,\n", - " padding,\n", - " active_type=paddle.activation.Relu(),\n", - " ch_in=None):\n", - " tmp = paddle.layer.img_conv(\n", - " input=input,\n", - " filter_size=filter_size,\n", - " num_channels=ch_in,\n", - " num_filters=ch_out,\n", - " stride=stride,\n", - " padding=padding,\n", - " act=paddle.activation.Linear(),\n", - " bias_attr=False)\n", - " return paddle.layer.batch_norm(input=tmp, act=active_type)\n", - "\n", - "def shortcut(ipt, n_in, n_out, stride):\n", - " if n_in != n_out:\n", - " return conv_bn_layer(ipt, n_out, 1, stride, 0,\n", - " paddle.activation.Linear())\n", - " else:\n", - " return ipt\n", - "\n", - "def basicblock(ipt, ch_out, stride):\n", - " ch_in = ch_out * 2\n", - " tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)\n", - " tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())\n", - " short = shortcut(ipt, ch_in, ch_out, stride)\n", - " return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())\n", - "\n", - "def layer_warp(block_func, ipt, 
features, count, stride):\n", - " tmp = block_func(ipt, features, stride)\n", - " for i in range(1, count):\n", - " tmp = block_func(tmp, features, 1)\n", - " return tmp\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "`resnet_cifar10` 的连接结构主要有以下几个过程。\n", - "\n", - "1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。\n", - "2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。\n", - "3. 最后对网络做均值池化并返回该层。\n", - "\n", - "注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "def resnet_cifar10(ipt, depth=32):\n", - " # depth should be one of 20, 32, 44, 56, 110, 1202\n", - " assert (depth - 2) % 6 == 0\n", - " n = (depth - 2) / 6\n", - " nStages = {16, 64, 128}\n", - " conv1 = conv_bn_layer(\n", - " ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)\n", - " res1 = layer_warp(basicblock, conv1, 16, n, 1)\n", - " res2 = layer_warp(basicblock, res1, 32, n, 2)\n", - " res3 = layer_warp(basicblock, res2, 64, n, 2)\n", - " pool = paddle.layer.img_pool(\n", - " input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())\n", - " return pool\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## 训练模型\n", - "\n", - "### 定义参数\n", - "\n", - "首先依据模型配置的`cost`定义模型参数。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "# Create parameters\n", - "parameters = paddle.parameters.create(cost)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "print parameters.keys()\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### 构造训练(Trainer)\n", - "\n", - "根据网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的Momentum方法,同时设定了学习率、正则等。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "# Create optimizer\n", - "momentum_optimizer = paddle.optimizer.Momentum(\n", - " momentum=0.9,\n", - " regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),\n", - " learning_rate=0.1 / 128.0,\n", - " learning_rate_decay_a=0.1,\n", - " learning_rate_decay_b=50000 * 100,\n", - " learning_rate_schedule='discexp',\n", - " batch_size=128)\n", - "\n", - "# Create trainer\n", - "trainer = paddle.trainer.SGD(cost=cost,\n", - " parameters=parameters,\n", - " update_equation=momentum_optimizer)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为 `settings` 里设置的 `learning_rate`。\n", - "\n", - "$$ lr = lr_{0} * a^ 
{\\lfloor \\frac{n}{ b}\\rfloor} $$\n", - "\n", - "\n", - "### 训练\n", - "\n", - "cifar.train10()每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "reader=paddle.batch(\n", - " paddle.reader.shuffle(\n", - " paddle.dataset.cifar.train10(), buf_size=50000),\n", - " batch_size=128)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "通过`feeding`来指定每一个数据和`paddle.layer.data`的对应关系。例如: `cifar.train10()`产生数据的第0列对应image层的特征。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "feeding={'image': 0,\n", - " 'label': 1}\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "# End batch and end pass event handler\n", - "def event_handler(event):\n", - " if isinstance(event, paddle.event.EndIteration):\n", - " if event.batch_id % 100 == 0:\n", - " print \"\\nPass %d, Batch %d, Cost %f, %s\" % (\n", - " event.pass_id, event.batch_id, event.cost, event.metrics)\n", - " else:\n", - " sys.stdout.write('.')\n", - " sys.stdout.flush()\n", - " if isinstance(event, paddle.event.EndPass):\n", - " result = trainer.test(\n", - " reader=paddle.batch(\n", - " paddle.dataset.cifar.test10(), batch_size=128),\n", - " feeding=feeding)\n", - " print \"\\nTest with Pass %d, %s\" % (event.pass_id, result.metrics)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "通过`trainer.train`函数训练:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "trainer.train(\n", - " reader=reader,\n", - " num_passes=200,\n", - " event_handler=event_handler,\n", - " feeding=feeding)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "一轮训练log示例如下所示,经过1个pass, 训练集上平均error为0.6875 ,测试集上平均error为0.8852 。\n", - "\n", - "```text\n", - "Pass 0, Batch 0, Cost 2.473182, {'classification_error_evaluator': 0.9140625}\n", - "...................................................................................................\n", - "Pass 0, Batch 100, Cost 1.913076, {'classification_error_evaluator': 0.78125}\n", - "...................................................................................................\n", - "Pass 0, Batch 200, Cost 1.783041, {'classification_error_evaluator': 0.7421875}\n", - "...................................................................................................\n", - "Pass 0, Batch 300, Cost 1.668833, {'classification_error_evaluator': 0.6875}\n", - "..........................................................................................\n", - "Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}\n", - "```\n", - "\n", - "图12是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - 
"\u003cimg src=\"image/plot.png\" width=\"400\" \u003e\u003cbr/\u003e\n", - "图12. CIFAR10数据集上VGG模型的分类错误率\n", - "\u003c/p\u003e\n", - "\n", - "\n", - "## 总结\n", - "\n", - "传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。\n", - "\n", - "\n", - "## 参考文献\n", - "\n", - "[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.\n", - "\n", - "[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.\n", - "\n", - "[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.\n", - "\n", - "[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.\n", - "\n", - "[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.\n", - "\n", - "[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.\n", - "\n", - "[7] Perronnin, F., Sánchez, J., \u0026 Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).\n", - "\n", - "[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.\n", - "\n", - "[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.\n", - "\n", - "[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.\n", - "\n", - "[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。\n", - "\n", - "[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)\n", - "\n", - "[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.\n", - "\n", - "[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.\n", - "\n", - "[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). 
CVPR 2016.\n", - "\n", - "[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the inception architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).\n", - "\n", - "[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).\n", - "\n", - "[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective](http://link.springer.com/article/10.1007/s11263-014-0733-5). International Journal of Computer Vision, 111(1), 98-136, 2015.\n", - "\n", - "[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.\n", - "\n", - "[20] http://deeplearning.net/tutorial/lenet.html\n", - "\n", - "[21] https://www.cs.toronto.edu/~kriz/cifar.html\n", - "\n", - "[22] http://cs231n.github.io/classification/\n", - "\n", - "\u003cbr/\u003e\n", - "\u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e\u003cimg alt=\"知识共享许可协议\" style=\"border-width:0\" src=\"https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png\" /\u003e\u003c/a\u003e\u003cbr /\u003e\u003cspan xmlns:dct=\"http://purl.org/dc/terms/\" href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" rel=\"dct:type\"\u003e本教程\u003c/span\u003e 由 \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e 创作,采用 \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议\u003c/a\u003e进行许可。\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.0" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/08.recommender_system/README.en.ipynb b/08.recommender_system/README.en.ipynb deleted file mode 100644 index 2eb6ca909d12bc37e8f96cf5ff1ddaa8bf330697..0000000000000000000000000000000000000000 --- a/08.recommender_system/README.en.ipynb +++ /dev/null @@ -1,740 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Personalized Recommendation\n", - "\n", - "The source code of this tutorial is in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system).\n", - "\n", - "For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).\n", - "\n", - "\n", - "## Background\n", - "\n", - "With the fast growth of e-commerce, online videos, and online reading businesses, users have to rely on recommender systems to avoid manually browsing a tremendous volume of choices. Recommender systems understand users' interests by mining user behavior and other properties of users and products.\n", - "\n", - "Some well-known approaches include:\n", - "\n", - "- User behavior-based approach. 
A well-known method is collaborative filtering. The underlying assumption is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.\n", - "\n", - "- Content-based recommendation[[1](#reference)]. This approach infers feature vectors that represent products from their descriptions. It also infers feature vectors that represent users' interests. Then it measures the relevance of users and products by some distances between these feature vectors.\n", - "\n", - "- Hybrid approach[[2](#reference)]: This approach uses the content-based information to help address the cold start problem[[6](#reference)] in the behavior-based approach.\n", - "\n", - "Among these options, collaborative filtering might be the most studied one. Some of its variants include user-based[[3](#reference)], item-based[[4](#reference)], social-network-based[[5](#reference)], and model-based.\n", - "\n", - "This tutorial explains a deep learning based approach and how to implement it using PaddlePaddle. We will train a model using a dataset that includes user information, movie information, and ratings. Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.\n", - "\n", - "\n", - "## Model Overview\n", - "\n", - "To learn more about deep learning based recommendation, let us start by going over the YouTube recommender system[[7](#reference)] before introducing our hybrid model.\n", - "\n", - "\n", - "### YouTube's Deep Learning Recommendation Model\n", - "\n", - "YouTube is a video-sharing Web site with one of the largest user bases in the world. Its recommender system serves more than a billion users. This system is composed of two major parts: candidate generation and ranking. The former selects a few hundred candidates from millions of videos, and the latter ranks and outputs the top 10.\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/YouTube_Overview.en.png\" width=\"70%\" \u003e\u003cbr/\u003e\n", - "Figure 1. YouTube recommender system overview.\n", - "\u003c/p\u003e\n", - "\n", - "#### Candidate Generation Network\n", - "\n", - "YouTube models candidate generation as a multiclass classification problem with a huge number of classes equal to the number of videos. The architecture of the model is as follows:\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\u003cimg src=\"image/Deep_candidate_generation_model_architecture.en.png\" width=\"70%\" \u003e\u003cbr/\u003e\n", - "Figure 2. Deep candidate generation model.\n", - "\u003c/p\u003e\n", - "\n", - "The first stage of this model maps watching history and search queries into fixed-length representative features. Then, an MLP (multi-layer perceptron, as described in the [Recognize Digits](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md) tutorial) takes the concatenation of all representative vectors. The output of the MLP represents the user's *intrinsic interests*. At training time, it is used together with a softmax output layer for minimizing the classification error. At serving time, it is used to compute the relevance of the user to all videos.\n", - "\n", - "For a user $U$, the predicted watching probability of video $i$ is\n", - "\n", - "$$P(\\omega=i|u)=\\frac{e^{v_{i}u}}{\\sum_{j \\in V}e^{v_{j}u}}$$\n", - "\n", - "where $u$ is the representative vector of user $U$, $V$ is the corpus of all videos, $v_i$ is the representative vector of the $i$-th video. 
$u$ and $v_i$ are vectors of the same length, so we can compute their dot product using a fully connected layer.\n", - "\n", - "This model could have a performance issue, as the softmax output covers millions of classification labels. To optimize performance, at training time the authors down-sample the negative samples, so the actual number of classes is reduced to thousands. At serving time, the authors ignore the normalization of the softmax outputs, because the results are used only for ranking.\n", - "\n", - "\n", - "#### Ranking Network\n", - "\n", - "The architecture of the ranking network is similar to that of the candidate generation network. Similar to ranking models widely used in online advertising, it uses rich features like video ID, last watching time, etc. The output layer of the ranking network is a weighted logistic regression, which rates all candidate videos.\n", - "\n", - "\n", - "### Hybrid Model\n", - "\n", - "In this section, let us introduce our movie recommendation system.\n", - "\n", - "In our network, the input includes features of users and movies. The user feature includes four properties: user ID, gender, occupation, and age. Movie features include their IDs, genres, and titles.\n", - "\n", - "We use fully-connected layers to map user features into representative feature vectors and concatenate them. The processing of movie features is similar, except for movie titles -- we feed titles into a text convolution network, as described in the [sentiment analysis tutorial](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md), to get a fixed-length representative feature vector.\n", - "\n", - "Given the feature vectors of users and movies, we compute the relevance using cosine similarity. We minimize the squared error at training time.\n", - "\n", - "\u003cp align=\"center\"\u003e\n", - "\n", - "\u003cimg src=\"image/rec_regression_network_en.png\" width=\"90%\" \u003e\u003cbr/\u003e\n", - "Figure 3. A hybrid recommendation model.\n", - "\u003c/p\u003e\n", - "\n", - "## Dataset\n", - "\n", - "We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m.zip) dataset to train our model. It includes one million ratings given by 6,000 users to 4,000 movies. Each rating is an integer in the range of 1 to 5. Thanks to GroupLens Research for collecting, processing and publishing the dataset.\n", - "\n", - "The `paddle.v2.dataset` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `movielens` and `wmt14`, etc. 
There's no need for us to manually download and preprocess the `MovieLens` dataset.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "# Run this block to show dataset's documentation\n", - "help(paddle.v2.dataset.movielens)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "The raw `MovieLens` data contains movie ratings and relevant features of both movies and users.\n", - "For instance, one movie's feature could be:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "movie_info = paddle.dataset.movielens.movie_info()\n", - "print movie_info.values()[0]\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "```text\n", - "\u003cMovieInfo id(1), title(Toy Story), categories(['Animation', \"Children's\", 'Comedy'])\u003e\n", - "```\n", - "\n", - "One user's feature could be:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "user_info = paddle.dataset.movielens.user_info()\n", - "print user_info.values()[0]\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "```text\n", - "\u003cUserInfo id(1), gender(F), age(1), job(10)\u003e\n", - "```\n", - "\n", - "In this dataset, the distribution of age is shown as follows:\n", - "\n", - "```text\n", - "1: \"Under 18\"\n", - "18: \"18-24\"\n", - "25: \"25-34\"\n", - "35: \"35-44\"\n", - "45: \"45-49\"\n", - "50: \"50-55\"\n", - "56: \"56+\"\n", - "```\n", - "\n", - "User's occupation is selected from the following options:\n", - "\n", - "```text\n", - "0: \"other\" or not specified\n", - "1: \"academic/educator\"\n", - "2: \"artist\"\n", - "3: \"clerical/admin\"\n", - "4: \"college/grad student\"\n", - "5: \"customer service\"\n", - "6: \"doctor/health care\"\n", - "7: \"executive/managerial\"\n", - "8: \"farmer\"\n", - "9: \"homemaker\"\n", - "10: \"K-12 student\"\n", - "11: \"lawyer\"\n", - "12: \"programmer\"\n", - "13: \"retired\"\n", - "14: \"sales/marketing\"\n", - "15: \"scientist\"\n", - "16: \"self-employed\"\n", - "17: \"technician/engineer\"\n", - "18: \"tradesman/craftsman\"\n", - "19: \"unemployed\"\n", - "20: \"writer\"\n", - "```\n", - "\n", - "Each record consists of three main components: user features, movie features and movie ratings.\n", - "As a simple example, consider the following:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "train_set_creator = paddle.dataset.movielens.train()\n", - "train_sample = next(train_set_creator())\n", - "uid = train_sample[0]\n", - "mov_id = train_sample[len(user_info[uid].value())]\n", - "print \"User %s rates Movie %s with Score %s\"%(user_info[uid], movie_info[mov_id], train_sample[-1])\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "```text\n", - "User \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e rates Movie \u003cMovieInfo id(1193), title(One Flew Over the 
Cuckoo's Nest), categories(['Drama'])\u003e with Score [5.0]\n", - "```\n", - "\n", - "The output shows that user 1 gave movie `1193` a rating of 5.\n", - "\n", - "After issuing the command `python train.py`, training will start immediately. The following sections unpack the details of how it works.\n", - "\n", - "## Model Architecture\n", - "\n", - "### Initialize PaddlePaddle\n", - "\n", - "First, we must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc.).\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib.pyplot as plt\n", - "from IPython import display\n", - "import cPickle\n", - "\n", - "import paddle.v2 as paddle\n", - "\n", - "paddle.init(use_gpu=False)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### Model Configuration\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "uid = paddle.layer.data(\n", - " name='user_id',\n", - " type=paddle.data_type.integer_value(\n", - " paddle.dataset.movielens.max_user_id() + 1))\n", - "usr_emb = paddle.layer.embedding(input=uid, size=32)\n", - "\n", - "usr_gender_id = paddle.layer.data(\n", - " name='gender_id', type=paddle.data_type.integer_value(2))\n", - "usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)\n", - "\n", - "usr_age_id = paddle.layer.data(\n", - " name='age_id',\n", - " type=paddle.data_type.integer_value(\n", - " len(paddle.dataset.movielens.age_table)))\n", - "usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)\n", - "\n", - "usr_job_id = paddle.layer.data(\n", - " name='job_id',\n", - " type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(\n", - " ) + 1))\n", - "usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "As shown in the code above, the input for each user consists of four integers, namely `user_id`, `gender_id`, `age_id` and `job_id`. To handle these discrete features conveniently, we borrow the embedding technique from NLP language models and transform them into the embedding values `usr_emb`, `usr_gender_emb`, `usr_age_emb` and `usr_job_emb`.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "usr_combined_features = paddle.layer.fc(\n", - " input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],\n", - " size=200,\n", - " act=paddle.activation.Tanh())\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Then we take these user features as input and connect them to a fully-connected layer, which reduces the dimension to 200.\n", - "\n", - "Furthermore, we do a similar transformation for each movie feature. 
The model configuration is:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "mov_id = paddle.layer.data(\n", - " name='movie_id',\n", - " type=paddle.data_type.integer_value(\n", - " paddle.dataset.movielens.max_movie_id() + 1))\n", - "mov_emb = paddle.layer.embedding(input=mov_id, size=32)\n", - "\n", - "mov_categories = paddle.layer.data(\n", - " name='category_id',\n", - " type=paddle.data_type.sparse_binary_vector(\n", - " len(paddle.dataset.movielens.movie_categories())))\n", - "\n", - "mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)\n", - "\n", - "\n", - "movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()\n", - "mov_title_id = paddle.layer.data(\n", - " name='movie_title',\n", - " type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))\n", - "mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)\n", - "mov_title_conv = paddle.networks.sequence_conv_pool(\n", - " input=mov_title_emb, hidden_size=32, context_len=3)\n", - "\n", - "mov_combined_features = paddle.layer.fc(\n", - " input=[mov_emb, mov_categories_hidden, mov_title_conv],\n", - " size=200,\n", - " act=paddle.activation.Tanh())\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "The movie title, a sequence of words represented by an integer word index sequence, will be fed into a `sequence_conv_pool` layer, which applies convolution and pooling on the time dimension. Because pooling is done on the time dimension, the output will be a fixed-length vector regardless of the length of the input sequence.\n", - "\n", - "Finally, we can use cosine similarity to calculate the similarity between user characteristics and movie features.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)\n", - "cost = paddle.layer.mse_cost(\n", - " input=inference,\n", - " label=paddle.layer.data(\n", - " name='score', type=paddle.data_type.dense_vector(1)))\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Model Training\n", - "\n", - "### Define Parameters\n", - "\n", - "First, we define the model parameters according to the previous model configuration `cost`.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "# Create parameters\n", - "parameters = paddle.parameters.create(cost)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### Create Trainer\n", - "\n", - "Before creating the training module, we also need to choose an optimization algorithm. 
Here we specify the Adam optimization algorithm via `paddle.optimizer`.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,\n", - " update_equation=paddle.optimizer.Adam(learning_rate=1e-4))\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "```text\n", - "[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n", - "[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]\n", - "```\n", - "\n", - "### Training\n", - "\n", - "`paddle.dataset.movielens.train` yields records during each pass; after shuffling, a batch input is generated for training.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "reader=paddle.batch(\n", - " paddle.reader.shuffle(\n", - " paddle.dataset.movielens.train(), buf_size=8192),\n", - " batch_size=256)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "`feeding` specifies the correspondence between each yielded record and `paddle.layer.data`. For instance, the first column of data generated by `movielens.train` corresponds to the `user_id` feature.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "feeding = {\n", - " 'user_id': 0,\n", - " 'gender_id': 1,\n", - " 'age_id': 2,\n", - " 'job_id': 3,\n", - " 'movie_id': 4,\n", - " 'category_id': 5,\n", - " 'movie_title': 6,\n", - " 'score': 7\n", - "}\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "The callback function `event_handler` will be called during training whenever a pre-defined event happens.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "step=0\n", - "\n", - "train_costs=[],[]\n", - "test_costs=[],[]\n", - "\n", - "def event_handler(event):\n", - " global step\n", - " global train_costs\n", - " global test_costs\n", - " if isinstance(event, paddle.event.EndIteration):\n", - " need_plot = False\n", - " if step % 10 == 0: # every 10 batches, record a train cost\n", - " train_costs[0].append(step)\n", - " train_costs[1].append(event.cost)\n", - "\n", - " if step % 1000 == 0: # every 1000 batches, record a test cost\n", - " result = trainer.test(reader=paddle.batch(\n", - " paddle.dataset.movielens.test(), batch_size=256))\n", - " test_costs[0].append(step)\n", - " test_costs[1].append(result.cost)\n", - "\n", - " if step % 100 == 0: # every 100 batches, update cost plot\n", - " plt.plot(*train_costs)\n", - " plt.plot(*test_costs)\n", - " plt.legend(['Train Cost', 'Test Cost'], loc='upper left')\n", - " display.clear_output(wait=True)\n", - " display.display(plt.gcf())\n", - " plt.gcf().clear()\n", - " step += 1\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Finally, we can invoke 
`trainer.train` to start training:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "trainer.train(\n", - " reader=reader,\n", - " event_handler=event_handler,\n", - " feeding=feeding,\n", - " num_passes=200)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Conclusion\n", - "\n", - "This tutorial goes over traditional approaches to recommender systems and a deep learning based approach. We also show how to train and use the model with PaddlePaddle. Deep learning has been widely used in computer vision and NLP; we look forward to its new successes in recommender systems.\n", - "\n", - "## Reference\n", - "\n", - "1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.\n", - "2. Robin Burke, [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Vol. 4321, Springer-Verlag, Berlin, Germany, May 2007, 978-3-540-72078-2.\n", - "3. P. Resnick, N. Iacovou, et al. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp. 175-186.\n", - "4. Sarwar, Badrul, et al. \"[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)\" *Proceedings of the 10th International Conference on World Wide Web*. ACM, 2001.\n", - "5. Kautz, Henry, Bart Selman, and Mehul Shah. \"[Referral Web: Combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)\" Communications of the ACM 40.3 (1997): 63-65.\n", - "6. Yuan, Jianbo, et al. [\"Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach.\"](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).\n", - "7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. 
ACM, 2016: 191-198.\n",
- "\n",
- "\u003cbr/\u003e\n",
- "This tutorial is contributed by \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e, and licensed under a \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003eCreative Commons Attribution-NonCommercial-ShareAlike 4.0 International License\u003c/a\u003e.\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/08.recommender_system/README.ipynb b/08.recommender_system/README.ipynb
deleted file mode 100644
index df6835a5c837c6a2d44e6455bdd19b8b7e040ebc..0000000000000000000000000000000000000000
--- a/08.recommender_system/README.ipynb
+++ /dev/null
@@ -1,795 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Personalized Recommendation\n",
- "\n",
- "The source code of this tutorial lives in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system). For first-time users, please refer to the PaddlePaddle [installation tutorial](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst).\n",
- "\n",
- "## Background\n",
- "\n",
- "As network technology advances and e-commerce keeps expanding, the number and variety of goods grow rapidly, and users must spend a great deal of time finding the goods they want to buy. This is the information overload problem, and recommender systems emerged to address it.\n",
- "\n",
- "A personalized recommender system is a subset of information filtering systems and can be applied in many domains, such as movies, music, e-commerce, and feed ranking. By analyzing and mining user behavior, it discovers each user's personalized needs and interests, and recommends information or goods the user is likely to be interested in. Unlike a search engine, a recommender system does not require the user to describe their needs precisely; it models historical behavior and proactively provides information that satisfies the user's interests and needs.\n",
- "\n",
- "The main traditional recommendation methods are:\n",
- "\n",
- "- Collaborative filtering recommendation: collects and analyzes users' historical behavior, activities, and preferences, computes the similarity between a user and other users, and predicts the target user's preference for a given item from a similarity-weighted combination of the ratings of similar users. One advantage is that it can recommend new items the user has never browsed. The drawbacks are the cold-start problem for new users without any behavior, and the sparsity problem caused by too few user-item interactions, which makes similar users hard to find.\n",
- "- Content-based filtering recommendation [[1](#references)]: extracts meaningful features from item content descriptions and recommends by computing the similarity between the user's interests and the item descriptions. It is simple and direct, requiring no ratings from other users; item similarity is measured from item attributes, so users are recommended items similar to those they already like. Its drawback is the same cold-start problem for users without any behavior.\n",
- "- Hybrid recommendation [[2](#references)]: combines different inputs and techniques so that the weaknesses of the individual methods offset each other.\n",
- "\n",
- "Collaborative filtering is among the most widely applied techniques, and it comes in several flavors: user-based recommendation [[3](#references)], item-based recommendation [[4](#references)], social-network-based recommendation [[5](#references)], and model-based recommendation. The GroupLens system [[3](#references)], released by the University of Minnesota in 1994, is generally regarded as the point where recommender systems became a relatively independent research direction. It was the first to propose completing the recommendation task with collaborative filtering, and collaborative filtering based on this model led the field for more than a decade afterwards.\n",
- "\n",
- "Deep learning excels at automatic feature extraction: it can learn multi-level abstract feature representations and learn from heterogeneous or cross-domain content, which alleviates the cold-start problem to some extent [[6](#references)]. This tutorial presents deep learning models for personalized recommendation and shows how to implement them with PaddlePaddle.\n",
- "\n",
- "## Demonstration\n",
- "\n",
- "We use a dataset containing user information, movie information, and movie ratings as the scenario for personalized recommendation. Once the model is trained, entering a user ID and a movie ID yields a matching score (in the range [1, 5]; a higher score means more interest); ranking all movies by this score gives the movies to recommend to the user.\n",
- "\n",
- "```\n",
- "Input movie_id: 1962\n",
- "Input user_id: 1\n",
- "Prediction Score is 4.25\n",
- "```\n",
- "\n",
- "## Model Overview\n",
- "\n",
- "In this chapter we first introduce YouTube's video recommender system [[7](#references)], then the fusion recommendation model that we implement.\n",
- "\n",
- "### YouTube's Deep Neural Network Recommender System\n",
- "\n",
- "YouTube is the world's largest site for uploading, sharing, and discovering videos; its recommender system serves personalized content to more than a billion users from an ever-growing video library. The whole system consists of two neural networks: a candidate generation network and a ranking network. The candidate generation network produces hundreds of candidates out of a video corpus of millions; the ranking network scores and sorts the candidates and returns the few dozen that rank highest. Figure 1 shows the system structure:\n",
- "\n",
- "\u003cp align=\"center\"\u003e\n",
- "\u003cimg src=\"image/YouTube_Overview.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
- "Figure 1. YouTube recommender system structure\n",
- "\u003c/p\u003e\n",
- "\n",
- "#### Candidate Generation Network\n",
- "\n",
- "The candidate generation network casts recommendation as multiclass classification with an extremely large number of classes. For a YouTube user, it uses the watch history (video IDs), search tokens, demographic information (e.g. geographic location, login device), binary features (e.g. gender, logged-in or not), and continuous features (e.g. age) to classify over all videos in the library, producing a class score for each (i.e. each video's recommendation probability); the few hundred videos with the highest probability are output.\n",
- "\n",
- "First, historical information such as watch history and search tokens is mapped to vectors and averaged to obtain a fixed-length representation. Demographic features are added to improve recommendations for new users, and binary and continuous features are normalized into the range [0, 1]. Next, all feature representations are concatenated into a single vector and fed into a nonlinear multilayer perceptron (MLP; see the [Recognize Digits](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md) tutorial). Finally, during training the MLP's output is fed to a softmax for classification; at prediction time, the similarity between the user's overall representation (the MLP output) and all videos is computed, and the top $k$ are taken as the candidate generation network's output. Figure 2 shows the candidate generation network structure.\n",
- "\n",
- "\u003cp align=\"center\"\u003e\n",
- "\u003cimg src=\"image/Deep_candidate_generation_model_architecture.png\" width=\"70%\" \u003e\u003cbr/\u003e\n",
- "Figure 2. Candidate generation network structure\n",
- "\u003c/p\u003e\n",
- "\n",
- "For a user $U$, the probability that the video $\\omega$ to be watched at this moment is video $i$ is:\n",
- "\n",
- "$$P(\\omega=i|u)=\\frac{e^{v_{i}u}}{\\sum_{j \\in V}e^{v_{j}u}}$$\n",
- "\n",
- "where $u$ is the feature representation of user $U$, $V$ is the video library, and $v_i$ is the feature representation of the $i$-th video. $u$ and $v_i$ are vectors of equal length, so their dot product can be implemented with a fully connected layer.\n",
- "\n",
- "Since the softmax has a very large number of classes, two tricks keep computation tractable: 1) during training, negative class sampling shrinks the number of classes actually computed to a few thousand; 2) at recommendation (prediction) time, the softmax normalization is skipped (it does not change the ranking), reducing class scoring to a nearest-neighbor search in dot-product space: the $k$ videos nearest to $u$ are taken as the generated candidates.\n"
- ]
- },
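- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "In other words, at serving time the softmax denominator can be dropped and candidate selection becomes a top-$k$ search over dot products. The following standalone numpy sketch (illustrative only, with made-up sizes; not part of the original tutorial) shows the idea:\n",
- "\n",
- "```python\n",
- "import numpy as np\n",
- "\n",
- "np.random.seed(0)\n",
- "user_repr = np.random.rand(256)             # u: the user's MLP output\n",
- "video_reprs = np.random.rand(100000, 256)   # v_j: one row per video in the library\n",
- "\n",
- "k = 300\n",
- "scores = video_reprs.dot(user_repr)         # dot-product scores; softmax would not change the order\n",
- "candidates = np.argsort(-scores)[:k]        # indices of the k highest-scoring videos\n",
- "```\n",
- "\n"
- ]
- },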
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "#### Ranking Network\n",
- "\n",
- "The ranking network's structure resembles the candidate generation network, but its goal is to score and order the candidates more finely. Much like feature engineering in traditional ad ranking, a large number of features relevant to video ranking are constructed here (e.g. video ID, time since last watch). They are processed in a similar way to the candidate generation network; the difference is that the top of the ranking network is a weighted logistic regression, which scores every candidate video, and after sorting from high to low the higher-scoring videos are returned to the user.\n",
- "\n",
- "### Fusion Recommendation Model\n",
- "\n",
- "In the movie recommender system below:\n",
- "\n",
- "1. First, user features and movie features are used as the neural network's input, where:\n",
- "\n",
- "   - the user features fuse four attributes: user ID, gender, occupation, and age;\n",
- "\n",
- "   - the movie features fuse three attributes: movie ID, movie genre IDs, and movie title.\n",
- "\n",
- "2. For the user features, the user ID is mapped to a vector representation (an embedding; 32-dimensional in the code below), which is fed into a fully connected layer, and the other three attributes are handled likewise. The four feature representations are then each fully connected and summed.\n",
- "\n",
- "3. For the movie features, the movie ID is handled the same way as the user ID; the genre IDs are fed into a fully connected layer directly as a sparse vector; and the movie title is passed through a text convolutional neural network (see [Chapter 5](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md)) to obtain a fixed-length vector representation. The three feature representations are then each fully connected and summed.\n",
- "\n",
- "4. Once the user and movie vector representations are obtained, their cosine similarity serves as the recommender system's score, and the squared difference between this similarity score and the user's actual rating is the loss of this regression model (see the sketch after Figure 3).\n",
- "\n",
- "\u003cp align=\"center\"\u003e\n",
- "\n",
- "\u003cimg src=\"image/rec_regression_network.png\" width=\"90%\" \u003e\u003cbr/\u003e\n",
- "Figure 3. Fusion recommendation model\n",
- "\u003c/p\u003e\n"
- ]
- },
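- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "Step 4 deserves a closer look: cosine similarity lies in [-1, 1], and the configuration below scales it by 5 so that predictions can span the rating scale. A toy numpy sketch of the scoring (illustrative only; not the tutorial's actual scoring code):\n",
- "\n",
- "```python\n",
- "import numpy as np\n",
- "\n",
- "def fused_score(user_vec, movie_vec, scale=5.0):\n",
- "    # cosine similarity in [-1, 1], scaled to [-scale, scale]\n",
- "    cos = user_vec.dot(movie_vec) / (\n",
- "        np.linalg.norm(user_vec) * np.linalg.norm(movie_vec))\n",
- "    return scale * cos\n",
- "\n",
- "u = np.random.rand(200)    # fused user representation (output of the user fc layer)\n",
- "m = np.random.rand(200)    # fused movie representation\n",
- "print fused_score(u, m)    # the regression target is the real rating\n",
- "```\n",
- "\n",
- "This corresponds to the `paddle.layer.cos_sim(..., scale=5)` layer used in the configuration below.\n",
- "\n"
- ]
- },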
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "## Data Preparation\n",
- "\n",
- "### Data Introduction and Download\n",
- "\n",
- "We illustrate with the [MovieLens 1M dataset (ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip). It contains 1,000,000 ratings (integers from 1 to 5) given by 6,000 users to 4,000 movies, collected and curated by the GroupLens Research lab.\n",
- "\n",
- "Paddle's API provides a module that loads the data automatically: `paddle.dataset.movielens`\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "import paddle.v2 as paddle\n",
- "paddle.init(use_gpu=False)\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "# Run this block to show dataset's documentation\n",
- "# help(paddle.dataset.movielens)\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "The raw data contains movie features, user features, and the users' ratings of movies.\n",
- "\n",
- "For example, one movie's features look like:\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "movie_info = paddle.dataset.movielens.movie_info()\n",
- "print movie_info.values()[0]\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "    \u003cMovieInfo id(1), title(Toy Story ), categories(['Animation', \"Children's\", 'Comedy'])\u003e\n",
- "\n",
- "\n",
- "This says the movie's id is 1, its title is Toy Story, and it falls into three categories: Animation, Children's, and Comedy.\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "user_info = paddle.dataset.movielens.user_info()\n",
- "print user_info.values()[0]\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "    \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e\n",
- "\n",
- "\n",
- "This says the user's ID is 1; she is female, younger than 18, and her occupation ID is 10.\n",
- "\n",
- "\n",
- "The age field takes one of the following values:\n",
- "* 1: \"Under 18\"\n",
- "* 18: \"18-24\"\n",
- "* 25: \"25-34\"\n",
- "* 35: \"35-44\"\n",
- "* 45: \"45-49\"\n",
- "* 50: \"50-55\"\n",
- "* 56: \"56+\"\n",
- "\n",
- "The occupation is one of the following choices:\n",
- "* 0: \"other\" or not specified\n",
- "* 1: \"academic/educator\"\n",
- "* 2: \"artist\"\n",
- "* 3: \"clerical/admin\"\n",
- "* 4: \"college/grad student\"\n",
- "* 5: \"customer service\"\n",
- "* 6: \"doctor/health care\"\n",
- "* 7: \"executive/managerial\"\n",
- "* 8: \"farmer\"\n",
- "* 9: \"homemaker\"\n",
- "* 10: \"K-12 student\"\n",
- "* 11: \"lawyer\"\n",
- "* 12: \"programmer\"\n",
- "* 13: \"retired\"\n",
- "* 14: \"sales/marketing\"\n",
- "* 15: \"scientist\"\n",
- "* 16: \"self-employed\"\n",
- "* 17: \"technician/engineer\"\n",
- "* 18: \"tradesman/craftsman\"\n",
- "* 19: \"unemployed\"\n",
- "* 20: \"writer\"\n",
- "\n",
- "Every training/test record is \u003cuser features\u003e + \u003cmovie features\u003e + rating.\n",
- "\n",
- "For example, the first training record:\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "train_set_creator = paddle.dataset.movielens.train()\n",
- "train_sample = next(train_set_creator())\n",
- "uid = train_sample[0]\n",
- "# the movie id comes right after the user features\n",
- "mov_id = train_sample[len(user_info[uid].value())]\n",
- "print \"User %s rates Movie %s with Score %s\"%(user_info[uid], movie_info[mov_id], train_sample[-1])\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "    User \u003cUserInfo id(1), gender(F), age(1), job(10)\u003e rates Movie \u003cMovieInfo id(1193), title(One Flew Over the Cuckoo's Nest ), categories(['Drama'])\u003e with Score [5.0]\n",
- "\n",
- "\n",
- "That is, user 1 rated movie 1193 with a score of 5.\n"
- ]
- },
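- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "The eight columns of each record line up exactly with the `feeding` dictionary defined later (user features first, then movie features, then the rating), so a record can also be unpacked by position. Illustrative only:\n",
- "\n",
- "```python\n",
- "# 4 user columns, 3 movie columns, then the rating\n",
- "(user_id, gender_id, age_id, job_id,\n",
- " movie_id, category_ids, title_ids, score) = train_sample\n",
- "```\n",
- "\n"
- ]
- },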
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "## Model Configuration\n",
- "\n",
- "Now we configure the model according to the form of the input data.\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "uid = paddle.layer.data(\n",
- "    name='user_id',\n",
- "    type=paddle.data_type.integer_value(\n",
- "        paddle.dataset.movielens.max_user_id() + 1))\n",
- "usr_emb = paddle.layer.embedding(input=uid, size=32)\n",
- "\n",
- "usr_gender_id = paddle.layer.data(\n",
- "    name='gender_id', type=paddle.data_type.integer_value(2))\n",
- "usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)\n",
- "\n",
- "usr_age_id = paddle.layer.data(\n",
- "    name='age_id',\n",
- "    type=paddle.data_type.integer_value(\n",
- "        len(paddle.dataset.movielens.age_table)))\n",
- "usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)\n",
- "\n",
- "usr_job_id = paddle.layer.data(\n",
- "    name='job_id',\n",
- "    type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id(\n",
- "    ) + 1))\n",
- "usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "As the code above shows, we input four features for every user: `user_id`, `gender_id`, `age_id`, and `job_id`, all of which are plain integer values. To make them convenient for the network's later layers, we borrow the language-model idea from NLP and turn each discrete integer into an embedding lookup, giving `usr_emb`, `usr_gender_emb`, `usr_age_emb`, and `usr_job_emb`.\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "usr_combined_features = paddle.layer.fc(\n",
- "    input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],\n",
- "    size=200,\n",
- "    act=paddle.activation.Tanh())\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "Then all user features are fed into one fully connected layer (fc), fusing them into a single 200-dimensional feature.\n",
- "\n",
- "We apply a similar transformation to each movie feature; the network configuration is:\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "mov_id = paddle.layer.data(\n",
- "    name='movie_id',\n",
- "    type=paddle.data_type.integer_value(\n",
- "        paddle.dataset.movielens.max_movie_id() + 1))\n",
- "mov_emb = paddle.layer.embedding(input=mov_id, size=32)\n",
- "\n",
- "mov_categories = paddle.layer.data(\n",
- "    name='category_id',\n",
- "    type=paddle.data_type.sparse_binary_vector(\n",
- "        len(paddle.dataset.movielens.movie_categories())))\n",
- "\n",
- "mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)\n",
- "\n",
- "\n",
- "movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()\n",
- "mov_title_id = paddle.layer.data(\n",
- "    name='movie_title',\n",
- "    type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))\n",
- "mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)\n",
"mov_title_conv = paddle.networks.sequence_conv_pool(\n", - " input=mov_title_emb, hidden_size=32, context_len=3)\n", - "\n", - "mov_combined_features = paddle.layer.fc(\n", - " input=[mov_emb, mov_categories_hidden, mov_title_conv],\n", - " size=200,\n", - " act=paddle.activation.Tanh())\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "电影ID和电影类型分别映射到其对应的特征隐层。对于电影标题名称(title),一个ID序列表示的词语序列,在输入卷积层后,将得到每个时间窗口的特征(序列特征),然后通过在时间维度降采样得到固定维度的特征,整个过程在sequence_conv_pool实现。\n", - "\n", - "最后再将电影的特征融合进`mov_combined_features`中。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "进而,我们使用余弦相似度计算用户特征与电影特征的相似性。并将这个相似性拟合(回归)到用户评分上。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "cost = paddle.layer.mse_cost(\n", - " input=inference,\n", - " label=paddle.layer.data(\n", - " name='score', type=paddle.data_type.dense_vector(1)))\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "至此,我们的优化目标就是这个网络配置中的`cost`了。\n", - "\n", - "## 训练模型\n", - "\n", - "### 定义参数\n", - "神经网络的模型,我们可以简单的理解为网络拓朴结构+参数。之前一节,我们定义出了优化目标`cost`。这个`cost`即为网络模型的拓扑结构。我们开始训练模型,需要先定义出参数。定义方法为:\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "parameters = paddle.parameters.create(cost)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " [INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n", - " [INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__mse_cost_0__]\n", - "\n", - "\n", - "`parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "print parameters.keys()\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " [u'___fc_layer_2__.wbias', u'___fc_layer_2__.w2', u'___embedding_layer_3__.w0', u'___embedding_layer_5__.w0', u'___embedding_layer_2__.w0', u'___embedding_layer_1__.w0', u'___fc_layer_1__.wbias', u'___fc_layer_0__.wbias', u'___fc_layer_1__.w0', u'___fc_layer_0__.w2', u'___fc_layer_0__.w3', u'___fc_layer_0__.w0', u'___fc_layer_0__.w1', u'___fc_layer_2__.w1', u'___fc_layer_2__.w0', u'___embedding_layer_4__.w0', u'___sequence_conv_pool_0___conv_fc.w0', u'___embedding_layer_0__.w0', u'___sequence_conv_pool_0___conv_fc.wbias']\n", - "\n", - "\n", - "### 构造训练(trainer)\n", - "\n", - 
"下面,我们根据网络拓扑结构和模型参数来构造出一个本地训练(trainer)。在构造本地训练的时候,我们还需要指定这个训练的优化方法。这里我们使用Adam来作为优化算法。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,\n", - " update_equation=paddle.optimizer.Adam(learning_rate=1e-4))\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score]\n", - " [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__mse_cost_0__]\n", - "\n", - "\n", - "### 训练\n", - "\n", - "下面我们开始训练过程。\n", - "\n", - "我们直接使用Paddle提供的数据集读取程序。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和预测数据集。并且通过`reader_dict`来指定每一个数据和data_layer的对应关系。\n", - "\n", - "例如,这里的reader_dict表示的是,对于数据层 `user_id`,使用了reader中每一条数据的第0个元素。`gender_id`数据层使用了第1个元素。以此类推。\n", - "\n", - "训练过程是完全自动的。我们可以使用event_handler来观察训练过程,或进行测试等。这里我们在event_handler里面绘制了训练误差曲线和测试误差曲线。并且保存了模型。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib.pyplot as plt\n", - "from IPython import display\n", - "import cPickle\n", - "\n", - "feeding = {\n", - " 'user_id': 0,\n", - " 'gender_id': 1,\n", - " 'age_id': 2,\n", - " 'job_id': 3,\n", - " 'movie_id': 4,\n", - " 'category_id': 5,\n", - " 'movie_title': 6,\n", - " 'score': 7\n", - "}\n", - "\n", - "step=0\n", - "\n", - "train_costs=[],[]\n", - "test_costs=[],[]\n", - "\n", - "def event_handler(event):\n", - " global step\n", - " global train_costs\n", - " global test_costs\n", - " if isinstance(event, paddle.event.EndIteration):\n", - " need_plot = False\n", - " if step % 10 == 0: # every 10 batches, record a train cost\n", - " train_costs[0].append(step)\n", - " train_costs[1].append(event.cost)\n", - "\n", - " if step % 1000 == 0: # every 1000 batches, record a test cost\n", - " result = trainer.test(reader=paddle.batch(\n", - " paddle.dataset.movielens.test(), batch_size=256))\n", - " test_costs[0].append(step)\n", - " test_costs[1].append(result.cost)\n", - "\n", - " if step % 100 == 0: # every 100 batches, update cost plot\n", - " plt.plot(*train_costs)\n", - " plt.plot(*test_costs)\n", - " plt.legend(['Train Cost', 'Test Cost'], loc='upper left')\n", - " display.clear_output(wait=True)\n", - " display.display(plt.gcf())\n", - " plt.gcf().clear()\n", - " step += 1\n", - "\n", - "trainer.train(\n", - " reader=paddle.batch(\n", - " paddle.reader.shuffle(\n", - " paddle.dataset.movielens.train(), buf_size=8192),\n", - " batch_size=256),\n", - " event_handler=event_handler,\n", - " feeding=feeding,\n", - " num_passes=2)\n" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "execution_count": 1 - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "![png](./image/output_32_0.png)\n", - "\n", - "## 应用模型\n", - "\n", - "在训练了几轮以后,您可以对模型进行推断。我们可以使用任意一个用户ID和电影ID,来预测该用户对该电影的评分。示例程序为:\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "editable": true - }, - "source": [ - "import copy\n", - "user_id = 234\n", - "movie_id = 345\n", - "\n", - "user = user_info[user_id]\n", - "movie = movie_info[movie_id]\n", - "\n", - "feature = user.value() + 
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "![png](./image/output_32_0.png)\n",
- "\n",
- "## Applying the Model\n",
- "\n",
- "After a few passes of training, you can run inference with the model: given any user ID and movie ID, predict that user's rating for the movie. For example:\n",
- "\n",
- "\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "editable": true
- },
- "source": [
- "import copy\n",
- "user_id = 234\n",
- "movie_id = 345\n",
- "\n",
- "user = user_info[user_id]\n",
- "movie = movie_info[movie_id]\n",
- "\n",
- "feature = user.value() + movie.value()\n",
- "\n",
- "infer_dict = copy.copy(feeding)\n",
- "del infer_dict['score']\n",
- "\n",
- "prediction = paddle.infer(output=inference, parameters=parameters, input=[feature], feeding=infer_dict)\n",
- "# cos_sim with scale=5 lies in [-5, 5]; map it onto the [0, 5] rating range\n",
- "score = (prediction[0][0] + 5.0) / 2\n",
- "print \"[Predict] User %d Rating Movie %d With Score %.2f\"%(user_id, movie_id, score)\n"
- ],
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "execution_count": 1
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "    [INFO 2017-03-06 17:17:08,132 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title]\n",
- "    [INFO 2017-03-06 17:17:08,134 networks.py:1478] The output order is [__cos_sim_0__]\n",
- "\n",
- "\n",
- "    [Predict] User 234 Rating Movie 345 With Score 4.16\n",
- "\n",
- "\n",
- "## Summary\n",
- "\n",
- "This chapter introduced traditional recommendation methods and YouTube's deep neural network recommender system, and used movie recommendation as an example to train a personalized recommendation neural network with PaddlePaddle. Recommender systems touch nearly every aspect of e-commerce, social networks, advertising, and search; deep learning, which already plays an important role in image processing and natural language processing, is poised to shine in recommender systems as well.\n",
- "\n",
- "## References\n",
- "\n",
- "1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325.\n",
- "2. Robin Burke, [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Vol. 4321, Springer-Verlag, Berlin, Germany, May 2007, 978-3-540-72078-2.\n",
- "3. P. Resnick, N. Iacovou, et al. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186.\n",
- "4. Sarwar, Badrul, et al. \"[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)\" *Proceedings of the 10th International Conference on World Wide Web*. ACM, 2001.\n",
- "5. Kautz, Henry, Bart Selman, and Mehul Shah. \"[Referral Web: Combining Social Networks and Collaborative Filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)\" Communications of the ACM 40.3 (1997): 63-65.\n",
- "6. Yuan, Jianbo, et al. [\"Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach.\"](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016).\n",
- "7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems.
ACM, 2016: 191-198.\n",
- "\n",
- "\u003cbr/\u003e\n",
- "\u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003e\u003cimg alt=\"Creative Commons License\" style=\"border-width:0\" src=\"https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png\" /\u003e\u003c/a\u003e\u003cbr /\u003e\u003cspan xmlns:dct=\"http://purl.org/dc/terms/\" href=\"http://purl.org/dc/dcmitype/Text\" property=\"dct:title\" rel=\"dct:type\"\u003eThis tutorial\u003c/span\u003e is contributed by \u003ca xmlns:cc=\"http://creativecommons.org/ns#\" href=\"http://book.paddlepaddle.org\" property=\"cc:attributionName\" rel=\"cc:attributionURL\"\u003ePaddlePaddle\u003c/a\u003e, and licensed under a \u003ca rel=\"license\" href=\"http://creativecommons.org/licenses/by-nc-sa/4.0/\"\u003eCreative Commons Attribution-NonCommercial-ShareAlike 4.0 International License\u003c/a\u003e.\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}