提交 1409897c 编写于 作者: Y Yang Yu

Merge branch 'develop' of https://github.com/PaddlePaddle/book into feature/remove_lambda

Dockerfile
.git/
.gitignore
deprecated
*~
pandoc.template
.DS_Store
\ No newline at end of file
.DS_Store
.idea
py_env*
*.ipynb
build
[submodule "paddle"]
path = paddle
url = https://github.com/PaddlePaddle/Paddle.git
branch = develop
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
sha: c25201a00e6b0514370501050cf2a8538ac12270
- repo: https://github.com/pre-commit/mirrors-yapf.git
sha: v0.16.0
hooks:
- id: remove-crlf
- repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2
hooks:
- id: yapf
files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ # Bazel BUILD files follow Python syntax.
- id: yapf
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
sha: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*paddle)^.*$
- id: end-of-file-fixer
files: \.md$
- id: trailing-whitespace
files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
sha: v1.0.1
hooks:
- id: forbid-crlf
files: \.md$
- id: remove-crlf
files: \.md$
- id: forbid-tabs
files: \.md$
- id: remove-tabs
files: \.md$
- repo: https://github.com/reyoung/pre-commit-hooks-jinja-compile.git
sha: 4a369cc72a4a2b8d3813ab8cc17abb5f5b21ef6c
hooks:
- id: convert-jinja2-into-html
# The argument means repleace filename from pattern `.*/([^/]*)\.tmpl` to `\1`
args: ['--filename_pattern=.*/([^/]*)\.tmpl', '--filename_repl=\1']
- repo: local
hooks:
- id: convert-markdown-into-html
name: convert-markdown-into-html
description: Convert README.md into index.html and README.cn.md into index.cn.html
entry: python .pre-commit-hooks/convert_markdown_into_html.py
language: system
files: .+README(\.cn)?\.md$
markdown_file=$1
import argparse
import re
import sys
# Notice: the single-quotes around EOF below make outputs
# verbatium. c.f. http://stackoverflow.com/a/9870274/724872
cat <<'EOF'
HEAD = """
<html>
<head>
<script type="text/x-mathjax-config">
......@@ -10,21 +10,21 @@ cat <<'EOF'
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tmpl/marked.js">
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tmpl/github-markdown.css" rel='stylesheet'>
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
......@@ -39,16 +39,14 @@ cat <<'EOF'
<body>
<div id="context" class="container markdown-body">
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
EOF
"""
cat $markdown_file
cat <<'EOF'
TAIL = """
</div>
<!-- You can change the lines below now. -->
......@@ -67,7 +65,31 @@ marked.setOptions({
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
document.getElementById("markdown").innerHTML)
</script>
</body>
EOF
"""
def convert_markdown_into_html(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument('filenames', nargs='*', help='Filenames to fix')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
with open(
re.sub(r"README", "index", re.sub(r"\.md$", ".html", filename)),
"w") as output:
output.write(HEAD)
with open(filename) as input:
for line in input:
output.write(line)
output.write(TAIL)
return retv
if __name__ == '__main__':
sys.exit(convert_markdown_into_html())
#!/bin/sh
for file in $@ ; do
markdown-to-ipynb < $file > ${file%.*}".ipynb"
if [ $? -ne 0 ]; then
echo >&2 "markdown-to-ipynb $file error"
exit 1
fi
done
#!/bin/bash
set -xe
cur_path="$(cd "$(dirname "$0")" && pwd -P)"
cd "$cur_path"/../
#paddle production image name
if [ ! -n "$1" ]; then
paddle_image=paddlepaddle/paddle
else
paddle_image=$1
fi
#paddle production image tag
if [ ! -n "$2" ]; then
paddle_tag=0.10.0
else
paddle_tag=$2
fi
#paddle book image name
if [ ! -n "$3" ]; then
book_image=paddlepaddle/book
else
book_image=$3
fi
#paddle book image tag
if [ ! -n "$4" ]; then
book_tag=latest
else
book_tag=$4
fi
#generate docker file
if [ ${USE_UBUNTU_REPO_MIRROR} ]; then
update_mirror_cmd="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
else
update_mirror_cmd="\\"
fi
#build docker image
echo "paddle_tag:"$paddle_tag
echo "book_tag:"$book_tag
cat > Dockerfile <<EOF
FROM ${paddle_image}:${paddle_tag}
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
COPY . /book
EOF
if [ -n "${http_proxy}" ]; then
cat >> Dockerfile <<EOF
ENV http_proxy ${http_proxy}
ENV https_proxy ${http_proxy}
EOF
fi
cat >> Dockerfile <<EOF
RUN pip install -U nltk \
&& python /book/.tools/cache_dataset.py
RUN ${update_mirror_cmd}
apt-get update && \
apt-get install -y locales patch && \
apt-get -y install gcc curl git vim && \
apt-get -y clean && \
localedef -f UTF-8 -i en_US en_US.UTF-8 && \
pip install --upgrade pip && \
pip install -U notedown pillow matplotlib jupyter numpy requests scipy
RUN curl https://storage.googleapis.com/golang/go1.8.linux-amd64.tar.gz -o go1.8.linux-amd64.tar.gz && \
tar -zxvf go1.8.linux-amd64.tar.gz -C /usr/local/ && \
rm go1.8.linux-amd64.tar.gz
ENV GOROOT /usr/local/go
ENV PATH \${GOROOT}/bin:\${PATH}
#convert md to ipynb
RUN /bin/bash /book/.tools/convert-markdown-into-ipynb-and-test.sh
EXPOSE 8888
CMD ["sh", "-c", "jupyter notebook --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.disable_check_xsrf=True /book/"]
EOF
docker build --no-cache -t ${book_image}:${paddle_tag} -t ${book_image}:${book_tag} .
#!/bin/env python
import paddle.v2.dataset as dataset
import nltk
#cifar
dataset.common.download(dataset.cifar.CIFAR100_URL, 'cifar',
dataset.cifar.CIFAR100_MD5)
dataset.common.download(dataset.cifar.CIFAR10_URL, 'cifar',
dataset.cifar.CIFAR10_MD5)
# Cache conll05
dataset.common.download(dataset.conll05.WORDDICT_URL, 'conll05st', \
dataset.conll05.WORDDICT_MD5)
dataset.common.download(dataset.conll05.VERBDICT_URL, 'conll05st', \
dataset.conll05.VERBDICT_MD5)
dataset.common.download(dataset.conll05.TRGDICT_URL, 'conll05st', \
dataset.conll05.TRGDICT_MD5)
dataset.common.download(dataset.conll05.EMB_URL, 'conll05st',
dataset.conll05.EMB_MD5)
dataset.common.download(dataset.conll05.DATA_URL, 'conll05st',
dataset.conll05.DATA_MD5)
# Cache imdb
dataset.common.download(dataset.imdb.URL, "imdb", dataset.imdb.MD5)
# Cache imikolov
dataset.common.download(dataset.imikolov.URL, "imikolov", dataset.imikolov.MD5)
# Cache movielens
dataset.common.download('http://files.grouplens.org/datasets/movielens/ml-1m.zip',\
'movielens','c4d9eecfca2ab87c1945afe126590906')
# Cache nltk
nltk.download('movie_reviews', download_dir=dataset.common.DATA_HOME)
# Cache uci housing
dataset.common.download(dataset.uci_housing.URL, "uci_housing", \
dataset.uci_housing.MD5)
# Cache vmt14
dataset.common.download(dataset.wmt14.URL_TRAIN, "wmt14",\
dataset.wmt14.MD5_TRAIN)
#mnist
dataset.common.download(dataset.mnist.TRAIN_IMAGE_URL, 'mnist',
dataset.mnist.TRAIN_IMAGE_MD5)
dataset.common.download(dataset.mnist.TRAIN_LABEL_URL, 'mnist',
dataset.mnist.TRAIN_LABEL_MD5)
dataset.common.download(dataset.mnist.TEST_IMAGE_URL, 'mnist',
dataset.mnist.TEST_IMAGE_MD5)
dataset.common.download(dataset.mnist.TEST_LABEL_URL, 'mnist',
dataset.mnist.TEST_LABEL_MD5)
#!/bin/bash
command -v go >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo >&2 "Please install go https://golang.org/doc/install#install"
exit 1
fi
export GOPATH=~/go; go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
cur_path="$(cd "$(dirname "$0")" && pwd -P)"
cd $cur_path/../
#convert md to ipynb
for file in */{README,README\.cn}.md ; do
~/go/bin/markdown-to-ipynb < $file > ${file%.*}".ipynb"
if [ $? -ne 0 ]; then
echo >&2 "markdown-to-ipynb $file error"
exit 1
fi
done
if [[ -z $TEST_EMBEDDED_PYTHON_SCRIPTS ]]; then
exit 0
fi
#exec ipynb's py file
for file in */{README,README\.cn}.ipynb ; do
pushd $PWD > /dev/null
cd $(dirname $file) > /dev/null
echo "begin test $file"
jupyter nbconvert --to python $(basename $file) --stdout | python
popd > /dev/null
#break
done
#!/bin/bash
set -xe
cd /book
#convert md to ipynb
for file in */{README,README\.cn}.md ; do
notedown $file > ${file%.*}.ipynb
done
{
"is_en": false,
"chapters": [
{
"name": "新手入门",
"link": "./01.fit_a_line/index.cn.html"
},
{
"name": "识别数字",
"link": "./02.recognize_digits/index.cn.html"
},
{
"name": "图像分类",
"link": "./03.image_classification/index.cn.html"
},
{
"name": "词向量",
"link": "./04.word2vec/index.cn.html"
},
{
"name": "个性化推荐",
"link": "./05.recommender_system/index.cn.html"
},
{
"name": "情感分析",
"link": "./06.understand_sentiment/index.cn.html"
},
{
"name": "语义角色标注",
"link": "./07.label_semantic_roles/index.cn.html"
},
{
"name": "机器翻译",
"link": "./08.machine_translation/index.cn.html"
}
]
}
index.html.tmpl
\ No newline at end of file
{
"is_en": true,
"chapters": [
{
"name": "Linear Regression",
"link": "./01.fit_a_line/index.html"
},
{
"name": "Recognize Digits",
"link": "./02.recognize_digits/index.html"
},
{
"name": "Image Classification",
"link": "./03.image_classification/index.html"
},
{
"name": "Word2Vec",
"link": "./04.word2vec/index.html"
},
{
"name": "Personalized Recommendation",
"link": "./05.recommender_system/index.html"
},
{
"name": "Sentiment Analysis",
"link": "./06.understand_sentiment/index.html"
},
{
"name": "Semantic Role Labeling",
"link": "./07.label_semantic_roles/index.html"
},
{
"name": "Machine Translation",
"link": "./08.machine_translation/index.html"
}
]
}
<!DOCTYPE html>
<html lang="{% if is_en %}en{%else%}zh-Hans{% endif %}">
<head>
<meta charset="UTF-8">
<title>{% if not is_en %}深度学习入门{%else%}Deep Learning 101{% endif %}</title>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/tether/1.4.0/js/tether.js"></script>
<script src="http://cdn.bootcss.com/jquery/3.1.0/jquery.slim.js"></script>
<script src="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js"></script>
<style>
* {
font-family:"Roboto","Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
}
.left-panel {
background: #E5E6EA;
}
.left-panel .card-block a.click_active {
background-color: #597cf1;
color: #fff;
}
.left-panel .card-header {
background: #ecedee;
}
{% if not is_en %}
.left-panel .card-header a {
font-size: 14px;
}
{% endif %}
.left-panel .card-block a:not(.click_active) {
background: #e4e6e9;
}
.left-panel .card-block a {
border-radius: 0px;
font-size: 13px;
color: #2F323A;
padding-left: 40px;
border: 1px solid rgba(0,0,0,.125);
}
.left-panel .card-block a.active_color {
color: rgb(70, 74, 76);
font-weight: bolder;
}
.left-panel .list-group-item {
-moz-box-align: center;
align-items: center;
display: flex;
flex-flow: row wrap;
margin-bottom: -1px;
padding: 0.75rem 1.25rem;
position: relative;
color: #2f323a;
}
.navbar img {
height: 90%;
width: 90%;
}
</style>
</head>
<body>
<nav class="navbar navbar-toggleable-md navbar-inverse bg-inverse">
<a class="navbar-brand mr-auto" href="#">
<img alt="PaddlePaddle" src="./.tools/theme/PP_w.png">
</a>
<ul class="nav navbar-nav">
<li class="nav-item">
<a class="nav-link" href="http://paddlepaddle.org">
{% if is_en %}Home{% else %}首页{% endif %}
</a>
</li>
<li class="nav-item {% if not is_en %}active{% endif %}">
<a class="nav-link" href="{% if is_en %}./index.cn.html{% else %}#{%endif%}">
{% if is_en %}中文{% else %}深度学习入门{% endif %}
</a>
</li>
<li class="nav-item {% if is_en %}active{% endif %}">
<a class="nav-link" href="{% if is_en %}#{% else %}./index.html{% endif %}">
{% if is_en %}Deep Learning 101{% else %}English{% endif %}
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/PaddlePaddle/book/">Fork on Github</a>
</li>
</ul>
</nav>
<div class="container-fluid">
<div class="row">
<div class="col-3 pl-1 pr-1 left-panel">
<div id="accordion" role="tablist" aria-multiselectable="true">
<div class="card">
<div class="card-header" role="tab" id="headingOne">
<h5 class="mb-0">
<a aria-expanded="true" aria-controls="collapseOne">
{% if is_en %}Deep Learning 101<span class="sr-only">(current)</span>{% else %}深度学习入门{% endif %}
</a>
</h5>
</div>
<div id="collapseOne" class="rounded-0 collapse show" role="tabpanel" aria-labelledby="headingOne">
<div class="card-block pl-0 pr-0 pt-0 pb-0">
<div class="list-group ">
{% for chapter in chapters %}
<a href="{{ chapter.link }}" target="content_iframe" class="list-group-item list-group-item-action" style="border: 0px; border-bottom: 2px solid #dddfe3;">
{{ chapter.name }}
</a>
{% endfor %}
</div>
</div>
</div>
</div>
</div>
</div>
<div class="col">
<iframe src="./01.fit_a_line/index{% if not is_en %}.cn{% endif %}.html" style="border: none; overflow-y : hidden" width="100%" height="100%" name="content_iframe" id="content_iframe">
</iframe>
</div>
</div>
</div>
<script>
$('#content_iframe').on('load', function(){
$("#content_iframe").height(200) // trick code to shrink iframe size
var body = $('#content_iframe').contents().find("body")
body.css("overflow-y", "hidden")
$("#content_iframe").height(body.height()+20)
var alllinks = $('#content_iframe').contents().find("a")
for (var i =0; i<alllinks.length; ++i) {
alllinks[i].setAttribute("target", "_blank")
}
});
$(".list-group a").click(function(){
$(".list-group a.click_active").removeClass("click_active");
$(this).addClass("click_active");
})
$($(".list-group a")[0]).addClass("click_active")
</script>
</body>
</html>
......@@ -188,7 +188,7 @@
}
.markdown-body a {
color: #4078c0;
color: #597cf1;
text-decoration: none;
}
......
......@@ -1093,7 +1093,7 @@ function escape(html, encode) {
}
function unescape(html) {
// explicitly match decimal, hex, and named HTML entities
// explicitly match decimal, hex, and named HTML entities
return html.replace(/&(#(?:\d+)|(?:#x[0-9A-Fa-f]+)|(?:\w+));?/g, function(_, n) {
n = n.toLowerCase();
if (n === 'colon') return ':';
......
......@@ -14,10 +14,20 @@ addons:
- python
- python-pip
- python2.7-dev
- golang
ssh_known_hosts: 52.76.173.135
before_install:
- pip install virtualenv pre-commit
- sudo pip install -U virtualenv pre-commit pip
- GOPATH=/tmp/go go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb
script:
- travis/precommit.sh
- PATH=/tmp/go/bin:$PATH .travis/precommit.sh
- |
if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit 0; fi;
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export BOOK_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $BOOK_DIR
notifications:
email:
on_success: change
......
......@@ -13,7 +13,9 @@ export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a ; then
ls -lh
git diff --exit-code
exit 1
fi
trap : 0
# 线性回归
让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。
本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)
本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)
## 背景介绍
给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即
......@@ -15,8 +15,8 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
<p align="center">
<img src = "image/predictions.png" width=400><br/>
图1. 预测值 V.S. 真实值
<img src = "image/predictions.png" width=400><br/>
图1. 预测值 V.S. 真实值
</p>
## 模型概览
......@@ -29,7 +29,7 @@ $$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
$\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。
建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实通常用来反映模型误差的大小。
建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实通常用来反映模型误差的大小。
对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是:
......@@ -45,14 +45,25 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
## 数据集
## 数据准备
执行以下命令来准备数据:
```bash
cd data && python prepare_data.py
### 数据集接口的封装
首先加载需要的包
```python
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
```
这段代码将从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)下载数据并进行[预处理](#数据预处理),最后数据将被分为训练集和测试集。
我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)
其中,在uci_housing模块中封装了:
1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。
2. [数据预处理](#数据预处理)的过程。
### 数据集介绍
这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下:
| 属性名 | 解释 | 类型 |
......@@ -85,94 +96,171 @@ cd data && python prepare_data.py
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
<p align="center">
<img src = "image/ranges.png" width=550><br/>
图2. 各维属性的取值范围
<img src = "image/ranges.png" width=550><br/>
图2. 各维属性的取值范围
</p>
#### 整理训练集与测试集
我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。一种常见的分割比例为$8:2$,感兴趣的读者朋友们也可以尝试不同的设置来观察这两种误差的变化。
我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$
在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。
## 训练
`fit_a_line/trainer.py`演示了训练的整体过程。
### 初始化PaddlePaddle
执行如下命令可以分割数据集,并将训练集和测试集的地址分别写入train.list 和 test.list两个文件中,供PaddlePaddle读取。
```python
python prepare_data.py -r 0.8 #默认使用8:2的比例进行分割
paddle.init(use_gpu=False, trainer_count=1)
```
在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。
### 模型配置
线性回归的模型其实就是一个采用线性激活函数(linear activation,`LinearActivation`)的全连接层(fully-connected layer,`fc_layer`):
```python
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
```
### 提供数据给PaddlePaddle
准备好数据之后,我们使用一个Python data provider来为PaddlePaddle的训练过程提供数据。一个 data provider 就是一个Python函数,它会被PaddlePaddle的训练过程调用。在这个例子里,只需要读取已经保存好的数据,然后一行一行地返回给PaddlePaddle的训练进程即可。
### 保存网络拓扑
```python
from paddle.trainer.PyDataProvider2 import *
import numpy as np
#定义数据的类型和维度
@provider(input_types=[dense_vector(13), dense_vector(1)])
def process(settings, input_file):
data = np.load(input_file.strip())
for row in data:
yield row[:-1].tolist(), row[-1:].tolist()
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
```
### 创建参数
```python
parameters = paddle.parameters.create(cost)
```
## 模型配置说明
### 创建Trainer
### 数据定义
首先,通过 `define_py_data_sources2` 来配置PaddlePaddle从上面的`dataprovider.py`里读入训练数据和测试数据。 PaddlePaddle接受从命令行读入的配置信息,例如这里我们传入一个名为`is_predict`的变量来控制模型在训练和测试时的不同结构。
```python
from paddle.trainer_config_helpers import *
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
```
is_predict = get_config_arg('is_predict', bool, False)
### 读取数据且打印训练的中间信息
define_py_data_sources2(
train_list='data/train.list',
test_list='data/test.list',
module='dataprovider',
obj='process')
PaddlePaddle提供一个
[reader机制](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
来读取数据。 Reader返回的数据可以包括多列,我们需要一个Python dict把列
序号映射到网络里的数据层。
```python
feeding={'x': 0, 'y': 1}
```
### 算法配置
接着,指定模型优化算法的细节。由于线性回归模型比较简单,我们只要设置基本的`batch_size`即可,它指定每次更新参数的时候使用多少条数据计算梯度信息。
此外,我们还可以提供一个 event handler,来打印训练的进度:
```python
settings(batch_size=2)
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
### 网络结构
最后,使用`fc_layer``LinearActivation`来表示线性回归的模型本身。
```python
#输入数据,13维的房屋信息
x = data_layer(name='x', size=13)
y_predict = fc_layer(
input=x,
param_attr=ParamAttr(name='w'),
size=1,
act=LinearActivation(),
bias_attr=ParamAttr(name='b'))
if not is_predict: #训练时,我们使用MSE,即regression_cost作为损失函数
y = data_layer(name='y', size=1)
cost = regression_cost(input=y_predict, label=y)
outputs(cost) #训练时输出MSE来监控损失的变化
else: #测试时,输出预测值
outputs(y_predict)
# event_handler to print training and testing info
from paddle.v2.plot import Ploter
train_title = "Train cost"
test_title = "Test cost"
cost_ploter = Ploter(train_title, test_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
if step % 10 == 0: # every 10 batches, record a train cost
cost_ploter.append(train_title, step, event.cost)
if step % 100 == 0: # every 100 batches, record a test cost
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
cost_ploter.append(test_title, step, result.cost)
if step % 100 == 0: # every 100 batches, update cost plot
cost_ploter.plot()
step += 1
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
```
## 训练模型
在对应代码的根目录下执行PaddlePaddle的命令行训练程序。这里指定模型配置文件为`trainer_config.py`,训练30轮,结果保存在`output`路径下。
```bash
./train.sh
### 开始训练
```python
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
feeding=feeding,
event_handler=event_handler_plot,
num_passes=30)
```
## 应用模型
现在来看下如何使用已经训练好的模型进行预测。
```bash
python predict.py
![png](./image/train_and_test.png)
### 应用模型
#### 1. 生成测试数据
```python
test_data_creator = paddle.dataset.uci_housing.test()
test_data = []
test_label = []
for item in test_data_creator():
test_data.append((item[0],))
test_label.append(item[1])
if len(test_data) == 5:
break
```
这里默认使用`output/pass-00029`中保存的模型进行预测,并将数据中的房价与预测结果进行对比,结果保存在 `predictions.png`中。
如果你想使用别的模型或者其它的数据进行预测,只要传入新的路径即可:
```bash
python predict.py -m output/pass-00020 -t data/housing.test.npy
#### 2. 推测 inference
```python
# load parameters from tar file.
# users can remove the comments and change the model name
# with open('params_pass_20.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=y_predict, parameters=parameters, input=test_data)
for i in xrange(len(probs)):
print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0])
```
## 总结
......@@ -186,4 +274,4 @@ python predict.py -m output/pass-00020 -t data/housing.test.npy
4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
<br/>
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议</a>进行许可。
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
# Linear Regression
Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict home prices. Some important concepts in Machine Learning will be covered through this example.
The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
## Problem Setup
Suppose we have a dataset of $n$ real estate properties. Each real estate property will be referred to as **homes** in this chapter for clarity.
Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely,
$$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$
where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
## Results Demonstration
We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the closer the point is to the dotted line, better the model's prediction.
<p align="center">
<img src = "image/predictions_en.png" width=400><br/>
Figure 1. Predicted Value V.S. Actual Value
</p>
## Model Overview
### Model Definition
In the UCI Housing Data Set, there are 13 home attributes $\{x_{i,j}\}$ that are related to the median home price $y_i$, which we aim to predict. Thus, our model can be written as:
$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
where $\hat{Y}$ is the predicted value used to differentiate from actual value $Y$. The model learns parameters $\omega_1, \ldots, \omega_{13}, b$, where the entries of $\vec{\omega}$ are **weights** and $b$ is **bias**.
Now we need an objective to optimize, so that the learned parameters can make $\hat{Y}$ as close to $Y$ as possible. Let's refer to the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). A loss function must output a non-negative value, given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$. This value reflects the magnitutude of the model error.
For Linear Regression, the most common loss function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form:
$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
That is, for a dataset of size $n$, MSE is the average value of the the prediction sqaure errors.
### Training
After setting up our model, there are several major steps to go through to train it:
1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s.
2. Feedforward. Evaluate the network output and compute the corresponding loss.
3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of epochs is reached.
## Dataset
### Python Dataset Modules
Our program starts with importing necessary packages:
```python
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
```
We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`. This module can
1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if you haven't yet, and
2. [preprocess](#preprocessing) the dataset.
### An Introduction of the Dataset
The UCI housing dataset has 506 instances. Each instance describes the attributes of a house in surburban Boston. The attributes are explained below:
| Attribute Name | Characteristic | Data Type |
| ------| ------ | ------ |
| CRIM | per capita crime rate by town | Continuous|
| ZN | proportion of residential land zoned for lots over 25,000 sq.ft. | Continuous |
| INDUS | proportion of non-retail business acres per town | Continuous |
| CHAS | Charles River dummy variable | Discrete, 1 if tract bounds river; 0 otherwise|
| NOX | nitric oxides concentration (parts per 10 million) | Continuous |
| RM | average number of rooms per dwelling | Continuous |
| AGE | proportion of owner-occupied units built prior to 1940 | Continuous |
| DIS | weighted distances to five Boston employment centres | Continuous |
| RAD | index of accessibility to radial highways | Continuous |
| TAX | full-value property-tax rate per $10,000 | Continuous |
| PTRATIO | pupil-teacher ratio by town | Continuous |
| B | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | Continuous |
| LSTAT | % lower status of the population | Continuous |
| MEDV | Median value of owner-occupied homes in $1000's | Continuous |
The last entry is the median home price.
### Preprocessing
#### Continuous and Discrete Data
We define a feature vector of length 13 for each home, where each entry corresponds to an attribute. Our first observation is that, among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension.
Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically. The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
#### Feature Normalization
We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* has a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range.
There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling):
- A value range that is too large or too small might cause floating number overflow or underflow during computation.
- Different value ranges might result in varying *importances* of different features to the model (at least in the beginning of the training process). This assumption about the data is often unreasonable, making the optimization difficult, which in turn results in increased training time.
- Many machine learning techniques or models (e.g., *L1/L2 regularization* and *Vector Space Model*) assumes that all the features have roughly zero means and their value ranges are similar.
<p align="center">
<img src = "image/ranges_en.png" width=550><br/>
Figure 2. The value ranges of the features
</p>
#### Prepare Training and Test Sets
We split the dataset in two, one for adjusting the model parameters, namely, for training the model, and the other for testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process.
## Training
`fit_a_line/trainer.py` demonstrates the training using [PaddlePaddle](http://paddlepaddle.org).
### Initialize PaddlePaddle
```python
paddle.init(use_gpu=False, trainer_count=1)
```
### Model Configuration
Linear regression is essentially a fully-connected layer with linear activation:
```python
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
```
### Save Topology
```python
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
```
### Create Parameters
```python
parameters = paddle.parameters.create(cost)
```
### Create Trainer
```python
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
```
### Feeding Data
PaddlePaddle provides the
[reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
for loading the training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
```python
feeding={'x': 0, 'y': 1}
```
Moreover, an event handler is provided to print the training progress:
```python
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
```python
# event_handler to plot training and testing info
from paddle.v2.plot import Ploter
train_title = "Train cost"
test_title = "Test cost"
plot_cost = Ploter(train_title, test_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
if step % 10 == 0: # every 10 batches, record a train cost
plot_cost.append(train_title, step, event.cost)
if step % 100 == 0: # every 100 batches, record a test cost
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
plot_cost.append(test_title, step, result.cost)
if step % 100 == 0: # every 100 batches, update cost plot
plot_cost.plot()
step += 1
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
```
### Start Training
```python
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
feeding=feeding,
event_handler=event_handler_plot,
num_passes=30)
```
![png](./image/train_and_test.png)
### Apply model
#### 1. generate testing data
```python
test_data_creator = paddle.dataset.uci_housing.test()
test_data = []
test_label = []
for item in test_data_creator():
test_data.append((item[0],))
test_label.append(item[1])
if len(test_data) == 5:
break
```
#### 2. inference
```python
# load parameters from tar file.
# users can remove the comments and change the model name
# with open('params_pass_20.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=y_predict, parameters=parameters, input=test_data)
for i in xrange(len(probs)):
print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0])
```
## Summary
This chapter introduces *Linear Regression* and how to train and test this model with PaddlePaddle, using the UCI Housing Data Set. Because a large number of more complex models and techniques are derived from linear regression, it is important to understand its underlying theory and limitation.
## References
1. https://en.wikipedia.org/wiki/Linear_regression
2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001.
3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012.
4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
<br/>
This tutorial is contributed by <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a>, and licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
<html>
<head>
<script type="text/x-mathjax-config">
......@@ -5,21 +6,21 @@
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'], ["\\(","\\)"] ],
displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tmpl/marked.js">
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tmpl/github-markdown.css" rel='stylesheet'>
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
......@@ -34,7 +35,7 @@
<body>
<div id="context" class="container markdown-body">
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
......@@ -42,7 +43,7 @@
# 线性回归
让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。
本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。
本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)。
## 背景介绍
给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即
......@@ -56,8 +57,8 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
<p align="center">
<img src = "image/predictions.png" width=400><br/>
图1. 预测值 V.S. 真实值
<img src = "image/predictions.png" width=400><br/>
图1. 预测值 V.S. 真实值
</p>
## 模型概览
......@@ -70,7 +71,7 @@ $$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
$\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。
建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实通常用来反映模型误差的大小。
建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实通常用来反映模型误差的大小。
对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是:
......@@ -86,14 +87,25 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
## 数据集
### 数据集接口的封装
首先加载需要的包
## 数据准备
执行以下命令来准备数据:
```bash
cd data && python prepare_data.py
```python
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
```
这段代码将从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)下载数据并进行[预处理](#数据预处理),最后数据将被分为训练集和测试集。
我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)
其中,在uci_housing模块中封装了:
1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。
2. [数据预处理](#数据预处理)的过程。
### 数据集介绍
这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下:
| 属性名 | 解释 | 类型 |
......@@ -126,94 +138,171 @@ cd data && python prepare_data.py
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
<p align="center">
<img src = "image/ranges.png" width=550><br/>
图2. 各维属性的取值范围
<img src = "image/ranges.png" width=550><br/>
图2. 各维属性的取值范围
</p>
#### 整理训练集与测试集
我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。一种常见的分割比例为$8:2$,感兴趣的读者朋友们也可以尝试不同的设置来观察这两种误差的变化。
我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$
在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。
## 训练
`fit_a_line/trainer.py`演示了训练的整体过程。
### 初始化PaddlePaddle
执行如下命令可以分割数据集,并将训练集和测试集的地址分别写入train.list 和 test.list两个文件中,供PaddlePaddle读取。
```python
python prepare_data.py -r 0.8 #默认使用8:2的比例进行分割
paddle.init(use_gpu=False, trainer_count=1)
```
在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。
### 模型配置
### 提供数据给PaddlePaddle
准备好数据之后,我们使用一个Python data provider来为PaddlePaddle的训练过程提供数据。一个 data provider 就是一个Python函数,它会被PaddlePaddle的训练过程调用。在这个例子里,只需要读取已经保存好的数据,然后一行一行地返回给PaddlePaddle的训练进程即可。
线性回归的模型其实就是一个采用线性激活函数(linear activation,`LinearActivation`)的全连接层(fully-connected layer,`fc_layer`):
```python
from paddle.trainer.PyDataProvider2 import *
import numpy as np
#定义数据的类型和维度
@provider(input_types=[dense_vector(13), dense_vector(1)])
def process(settings, input_file):
data = np.load(input_file.strip())
for row in data:
yield row[:-1].tolist(), row[-1:].tolist()
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
```
### 保存网络拓扑
```python
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
```
## 模型配置说明
### 创建参数
### 数据定义
首先,通过 `define_py_data_sources2` 来配置PaddlePaddle从上面的`dataprovider.py`里读入训练数据和测试数据。 PaddlePaddle接受从命令行读入的配置信息,例如这里我们传入一个名为`is_predict`的变量来控制模型在训练和测试时的不同结构。
```python
from paddle.trainer_config_helpers import *
parameters = paddle.parameters.create(cost)
```
### 创建Trainer
```python
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
```
is_predict = get_config_arg('is_predict', bool, False)
### 读取数据且打印训练的中间信息
define_py_data_sources2(
train_list='data/train.list',
test_list='data/test.list',
module='dataprovider',
obj='process')
PaddlePaddle提供一个
[reader机制](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
来读取数据。 Reader返回的数据可以包括多列,我们需要一个Python dict把列
序号映射到网络里的数据层。
```python
feeding={'x': 0, 'y': 1}
```
### 算法配置
接着,指定模型优化算法的细节。由于线性回归模型比较简单,我们只要设置基本的`batch_size`即可,它指定每次更新参数的时候使用多少条数据计算梯度信息。
此外,我们还可以提供一个 event handler,来打印训练的进度:
```python
settings(batch_size=2)
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
### 网络结构
最后,使用`fc_layer`和`LinearActivation`来表示线性回归的模型本身。
```python
#输入数据,13维的房屋信息
x = data_layer(name='x', size=13)
y_predict = fc_layer(
input=x,
param_attr=ParamAttr(name='w'),
size=1,
act=LinearActivation(),
bias_attr=ParamAttr(name='b'))
if not is_predict: #训练时,我们使用MSE,即regression_cost作为损失函数
y = data_layer(name='y', size=1)
cost = regression_cost(input=y_predict, label=y)
outputs(cost) #训练时输出MSE来监控损失的变化
else: #测试时,输出预测值
outputs(y_predict)
# event_handler to print training and testing info
from paddle.v2.plot import Ploter
train_title = "Train cost"
test_title = "Test cost"
cost_ploter = Ploter(train_title, test_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
if step % 10 == 0: # every 10 batches, record a train cost
cost_ploter.append(train_title, step, event.cost)
if step % 100 == 0: # every 100 batches, record a test cost
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
cost_ploter.append(test_title, step, result.cost)
if step % 100 == 0: # every 100 batches, update cost plot
cost_ploter.plot()
step += 1
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
```
## 训练模型
在对应代码的根目录下执行PaddlePaddle的命令行训练程序。这里指定模型配置文件为`trainer_config.py`,训练30轮,结果保存在`output`路径下。
```bash
./train.sh
### 开始训练
```python
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
feeding=feeding,
event_handler=event_handler_plot,
num_passes=30)
```
## 应用模型
现在来看下如何使用已经训练好的模型进行预测。
```bash
python predict.py
![png](./image/train_and_test.png)
### 应用模型
#### 1. 生成测试数据
```python
test_data_creator = paddle.dataset.uci_housing.test()
test_data = []
test_label = []
for item in test_data_creator():
test_data.append((item[0],))
test_label.append(item[1])
if len(test_data) == 5:
break
```
这里默认使用`output/pass-00029`中保存的模型进行预测,并将数据中的房价与预测结果进行对比,结果保存在 `predictions.png`中。
如果你想使用别的模型或者其它的数据进行预测,只要传入新的路径即可:
```bash
python predict.py -m output/pass-00020 -t data/housing.test.npy
#### 2. 推测 inference
```python
# load parameters from tar file.
# users can remove the comments and change the model name
# with open('params_pass_20.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=y_predict, parameters=parameters, input=test_data)
for i in xrange(len(probs)):
print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0])
```
## 总结
......@@ -227,7 +316,8 @@ python predict.py -m output/pass-00020 -t data/housing.test.npy
4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
<br/>
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议</a>进行许可。
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
</div>
<!-- You can change the lines below now. -->
......@@ -246,6 +336,6 @@ marked.setOptions({
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
document.getElementById("markdown").innerHTML)
</script>
</body>
<html>
<head>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
box-sizing: border-box;
min-width: 200px;
max-width: 980px;
margin: 0 auto;
padding: 45px;
}
</style>
<body>
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
# Linear Regression
Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict home prices. Some important concepts in Machine Learning will be covered through this example.
The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
## Problem Setup
Suppose we have a dataset of $n$ real estate properties. Each real estate property will be referred to as **homes** in this chapter for clarity.
Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely,
$$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$
where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
## Results Demonstration
We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the closer the point is to the dotted line, better the model's prediction.
<p align="center">
<img src = "image/predictions_en.png" width=400><br/>
Figure 1. Predicted Value V.S. Actual Value
</p>
## Model Overview
### Model Definition
In the UCI Housing Data Set, there are 13 home attributes $\{x_{i,j}\}$ that are related to the median home price $y_i$, which we aim to predict. Thus, our model can be written as:
$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$
where $\hat{Y}$ is the predicted value used to differentiate from actual value $Y$. The model learns parameters $\omega_1, \ldots, \omega_{13}, b$, where the entries of $\vec{\omega}$ are **weights** and $b$ is **bias**.
Now we need an objective to optimize, so that the learned parameters can make $\hat{Y}$ as close to $Y$ as possible. Let's refer to the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). A loss function must output a non-negative value, given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$. This value reflects the magnitutude of the model error.
For Linear Regression, the most common loss function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form:
$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
That is, for a dataset of size $n$, MSE is the average value of the the prediction sqaure errors.
### Training
After setting up our model, there are several major steps to go through to train it:
1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s.
2. Feedforward. Evaluate the network output and compute the corresponding loss.
3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of epochs is reached.
## Dataset
### Python Dataset Modules
Our program starts with importing necessary packages:
```python
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
```
We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`. This module can
1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if you haven't yet, and
2. [preprocess](#preprocessing) the dataset.
### An Introduction of the Dataset
The UCI housing dataset has 506 instances. Each instance describes the attributes of a house in surburban Boston. The attributes are explained below:
| Attribute Name | Characteristic | Data Type |
| ------| ------ | ------ |
| CRIM | per capita crime rate by town | Continuous|
| ZN | proportion of residential land zoned for lots over 25,000 sq.ft. | Continuous |
| INDUS | proportion of non-retail business acres per town | Continuous |
| CHAS | Charles River dummy variable | Discrete, 1 if tract bounds river; 0 otherwise|
| NOX | nitric oxides concentration (parts per 10 million) | Continuous |
| RM | average number of rooms per dwelling | Continuous |
| AGE | proportion of owner-occupied units built prior to 1940 | Continuous |
| DIS | weighted distances to five Boston employment centres | Continuous |
| RAD | index of accessibility to radial highways | Continuous |
| TAX | full-value property-tax rate per $10,000 | Continuous |
| PTRATIO | pupil-teacher ratio by town | Continuous |
| B | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | Continuous |
| LSTAT | % lower status of the population | Continuous |
| MEDV | Median value of owner-occupied homes in $1000's | Continuous |
The last entry is the median home price.
### Preprocessing
#### Continuous and Discrete Data
We define a feature vector of length 13 for each home, where each entry corresponds to an attribute. Our first observation is that, among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension.
Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically. The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
#### Feature Normalization
We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* has a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range.
There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling):
- A value range that is too large or too small might cause floating number overflow or underflow during computation.
- Different value ranges might result in varying *importances* of different features to the model (at least in the beginning of the training process). This assumption about the data is often unreasonable, making the optimization difficult, which in turn results in increased training time.
- Many machine learning techniques or models (e.g., *L1/L2 regularization* and *Vector Space Model*) assumes that all the features have roughly zero means and their value ranges are similar.
<p align="center">
<img src = "image/ranges_en.png" width=550><br/>
Figure 2. The value ranges of the features
</p>
#### Prepare Training and Test Sets
We split the dataset in two, one for adjusting the model parameters, namely, for training the model, and the other for testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process.
## Training
`fit_a_line/trainer.py` demonstrates the training using [PaddlePaddle](http://paddlepaddle.org).
### Initialize PaddlePaddle
```python
paddle.init(use_gpu=False, trainer_count=1)
```
### Model Configuration
Linear regression is essentially a fully-connected layer with linear activation:
```python
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
size=1,
act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
```
### Save Topology
```python
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
```
### Create Parameters
```python
parameters = paddle.parameters.create(cost)
```
### Create Trainer
```python
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
```
### Feeding Data
PaddlePaddle provides the
[reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
for loading the training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
```python
feeding={'x': 0, 'y': 1}
```
Moreover, an event handler is provided to print the training progress:
```python
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
```
```python
# event_handler to plot training and testing info
from paddle.v2.plot import Ploter
train_title = "Train cost"
test_title = "Test cost"
plot_cost = Ploter(train_title, test_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
if step % 10 == 0: # every 10 batches, record a train cost
plot_cost.append(train_title, step, event.cost)
if step % 100 == 0: # every 100 batches, record a test cost
result = trainer.test(
reader=paddle.batch(
uci_housing.test(), batch_size=2),
feeding=feeding)
plot_cost.append(test_title, step, result.cost)
if step % 100 == 0: # every 100 batches, update cost plot
plot_cost.plot()
step += 1
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
```
### Start Training
```python
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
feeding=feeding,
event_handler=event_handler_plot,
num_passes=30)
```
![png](./image/train_and_test.png)
### Apply model
#### 1. generate testing data
```python
test_data_creator = paddle.dataset.uci_housing.test()
test_data = []
test_label = []
for item in test_data_creator():
test_data.append((item[0],))
test_label.append(item[1])
if len(test_data) == 5:
break
```
#### 2. inference
```python
# load parameters from tar file.
# users can remove the comments and change the model name
# with open('params_pass_20.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=y_predict, parameters=parameters, input=test_data)
for i in xrange(len(probs)):
print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0])
```
## Summary
This chapter introduces *Linear Regression* and how to train and test this model with PaddlePaddle, using the UCI Housing Data Set. Because a large number of more complex models and techniques are derived from linear regression, it is important to understand its underlying theory and limitation.
## References
1. https://en.wikipedia.org/wiki/Linear_regression
2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001.
3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012.
4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
<br/>
This tutorial is contributed by <a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a>, and licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
</div>
<!-- You can change the lines below now. -->
<script type="text/javascript">
marked.setOptions({
renderer: new marked.Renderer(),
gfm: true,
breaks: false,
smartypants: true,
highlight: function(code, lang) {
code = code.replace(/&amp;/g, "&")
code = code.replace(/&gt;/g, ">")
code = code.replace(/&lt;/g, "<")
code = code.replace(/&nbsp;/g, " ")
return hljs.highlightAuto(code, [lang]).value;
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
</script>
</body>
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
import os
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
with_gpu = os.getenv('WITH_GPU', '0') != '0'
def main():
# init
paddle.init(use_gpu=with_gpu, trainer_count=1)
# network config
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.square_error_cost(input=y_predict, label=y)
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=y_predict)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
feeding = {'x': 0, 'y': 1}
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f" % (
event.pass_id, event.batch_id, event.cost)
if isinstance(event, paddle.event.EndPass):
if event.pass_id % 10 == 0:
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(
reader=paddle.batch(uci_housing.test(), batch_size=2),
feeding=feeding)
print "Test %d, Cost %f" % (event.pass_id, result.cost)
# training
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(uci_housing.train(), buf_size=500),
batch_size=2),
feeding=feeding,
event_handler=event_handler,
num_passes=30)
# inference
test_data_creator = paddle.dataset.uci_housing.test()
test_data = []
test_label = []
for item in test_data_creator():
test_data.append((item[0], ))
test_label.append(item[1])
if len(test_data) == 5:
break
# load parameters from tar file.
# users can remove the comments and change the model name
# with open('params_pass_20.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=y_predict, parameters=parameters, input=test_data)
for i in xrange(len(probs)):
print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0])
if __name__ == '__main__':
main()
import requests
from PIL import Image
import numpy as np
import os
# this client is used by Paddle serve: https://github.com/PaddlePaddle/book/tree/develop/serve
# please do not use it directly
def load_image(file):
im = Image.open(file).convert('L')
im = im.resize((28, 28), Image.ANTIALIAS)
im = np.array(im).astype(np.float32).flatten()
im = im / 255.0
return im
cur_dir = os.path.dirname(os.path.realpath(__file__))
data = load_image(cur_dir + '/../image/infer_3.png')
data = data.tolist()
r = requests.post("http://0.0.0.0:8000", json={'img': data})
print(r.text)
import os
from PIL import Image
import numpy as np
import paddle.v2 as paddle
with_gpu = os.getenv('WITH_GPU', '0') != '0'
def softmax_regression(img):
predict = paddle.layer.fc(
input=img, size=10, act=paddle.activation.Softmax())
return predict
def multilayer_perceptron(img):
# The first fully-connected layer
hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
# The second fully-connected layer and the according activation function
hidden2 = paddle.layer.fc(
input=hidden1, size=64, act=paddle.activation.Relu())
# The thrid fully-connected layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(
input=hidden2, size=10, act=paddle.activation.Softmax())
return predict
def convolutional_neural_network(img):
# first conv layer
conv_pool_1 = paddle.networks.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
num_channel=1,
pool_size=2,
pool_stride=2,
act=paddle.activation.Relu())
# second conv layer
conv_pool_2 = paddle.networks.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
num_channel=20,
pool_size=2,
pool_stride=2,
act=paddle.activation.Relu())
# fully-connected layer
predict = paddle.layer.fc(
input=conv_pool_2, size=10, act=paddle.activation.Softmax())
return predict
def main():
paddle.init(use_gpu=with_gpu, trainer_count=1)
# define network topology
images = paddle.layer.data(
name='pixel', type=paddle.data_type.dense_vector(784))
label = paddle.layer.data(
name='label', type=paddle.data_type.integer_value(10))
# Here we can build the prediction network in different ways. Please
# choose one by uncomment corresponding line.
# predict = softmax_regression(images)
# predict = multilayer_perceptron(images)
predict = convolutional_neural_network(images)
cost = paddle.layer.classification_cost(input=predict, label=label)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.1 / 128.0,
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
lists = []
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(reader=paddle.batch(
paddle.dataset.mnist.test(), batch_size=128))
print "Test with Pass %d, Cost %f, %s\n" % (
event.pass_id, result.cost, result.metrics)
lists.append((event.pass_id, result.cost,
result.metrics['classification_error_evaluator']))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192),
batch_size=128),
event_handler=event_handler,
num_passes=5)
# find the best pass
best = sorted(lists, key=lambda list: float(list[1]))[0]
print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
def load_image(file):
im = Image.open(file).convert('L')
im = im.resize((28, 28), Image.ANTIALIAS)
im = np.array(im).astype(np.float32).flatten()
im = im / 255.0 * 2.0 - 1.0
return im
test_data = []
cur_dir = os.path.dirname(os.path.realpath(__file__))
test_data.append((load_image(cur_dir + '/image/infer_3.png'), ))
probs = paddle.infer(
output_layer=predict, parameters=parameters, input=test_data)
lab = np.argsort(-probs) # probs and lab are the results of one batch data
print "Label of image/infer_3.png is: %d" % lab[0][0]
if __name__ == '__main__':
main()
图像分类
=======
本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)
# 图像分类
## 背景介绍
本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)
## 背景介绍
图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。
......@@ -51,7 +51,7 @@
2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。
3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。
4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。
这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\][NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
......@@ -67,8 +67,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
<p align="center">
<img src="image/lenet.png"><br/>
图5. CNN网络示例[20]
</p>
图5. CNN网络示例[20]
</p>
- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。
- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。
......@@ -108,7 +108,7 @@ GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普
<p align="center">
<img src="image/googlenet.jpeg" ><br/>
图8. GoogleNet[12]
图8. GoogleNet[12]
</p>
......@@ -136,7 +136,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
## 数据准备
通用图像分类公开的标准数据集常用的有[CIFAR](<https://www.cs.toronto.edu/~kriz/cifar.html)、[ImageNet](http://image-net.org/)、[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)、[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)、[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。
通用图像分类公开的标准数据集常用的有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)[ImageNet](http://image-net.org/)[COCO](http://mscoco.org/)等,常用的细粒度图像分类数据集包括[CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html)[Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/)[Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/)等。其中ImageNet数据集规模相对较大,如[模型概览](#模型概览)一章所讲,大量研究成果基于ImageNet。ImageNet数据从2010年来稍有变化,常用的是ImageNet-2012数据集,该数据集包含1000个类别:训练集包含1,281,167张图片,每个类别数据732至1300张不等,验证集包含50,000张图片,平均每个类别50张图片。
由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。
......@@ -173,24 +173,24 @@ paddle.init(use_gpu=False, trainer_count=1)
1. 定义数据输入及其维度
网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。
```python
网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。
```python
datadim = 3 * 32 * 32
classdim = 10
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(datadim))
```
```
2. 定义VGG网络核心模块
```python
net = vgg_bn_drop(image)
```
VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下:
```python
```python
net = vgg_bn_drop(image)
```
VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下:
```python
def vgg_bn_drop(input):
def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
return paddle.networks.img_conv_group(
......@@ -219,40 +219,40 @@ paddle.init(use_gpu=False, trainer_count=1)
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
return fc2
```
2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成,
2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。
2.3. 最后接两层512维的全连接。
```
2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。
2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。
2.3. 最后接两层512维的全连接。
3. 定义分类器
通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。
通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。
```python
```python
out = paddle.layer.fc(input=net,
size=classdim,
act=paddle.activation.Softmax())
```
```
4. 定义损失函数和网络输出
在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。
```python
在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。
```python
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(classdim))
cost = paddle.layer.classification_cost(input=out, label=lbl)
```
```
### ResNet
ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。
```python
net = resnet_cifar10(data, depth=56)
net = resnet_cifar10(image, depth=56)
```
先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。
......@@ -305,9 +305,9 @@ def layer_warp(block_func, ipt, features, count, stride):
`resnet_cifar10` 的连接结构主要有以下几个过程。
1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。
1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。
2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。
3. 最后对网络做均值池化并返回该层。
3. 最后对网络做均值池化并返回该层。
注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。
......@@ -356,8 +356,7 @@ momentum_optimizer = paddle.optimizer.Momentum(
learning_rate=0.1 / 128.0,
learning_rate_decay_a=0.1,
learning_rate_decay_b=50000 * 100,
learning_rate_schedule='discexp',
batch_size=128)
learning_rate_schedule='discexp')
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
......@@ -375,7 +374,7 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$
cifar.train10()每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。
```python
reader=paddle.reader.batch(
reader=paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=50000),
batch_size=128)
......@@ -390,6 +389,36 @@ feeding={'image': 0,
可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。
`event_handler_plot`可以用来利用回调数据来打点画图:
![png](./image/train_and_test.png)
```python
from paddle.v2.plot import Ploter
train_title = "Train cost"
test_title = "Test cost"
cost_ploter = Ploter(train_title, test_title)
step = 0
def event_handler_plot(event):
global step
if isinstance(event, paddle.event.EndIteration):
if step % 1 == 0:
cost_ploter.append(train_title, step, event.cost)
cost_ploter.plot()
step += 1
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
paddle.dataset.cifar.test10(), batch_size=128),
feeding=feeding)
cost_ploter.append(test_title, step, result.cost)
```
`event_handler` 用来在训练过程中输出文本日志
```python
# End batch and end pass event handler
def event_handler(event):
......@@ -401,11 +430,14 @@ def event_handler(event):
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
# save parameters
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(
reader=paddle.reader.batch(
reader=paddle.batch(
paddle.dataset.cifar.test10(), batch_size=128),
reader_dict={'image': 0,
'label': 1})
feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
......@@ -415,7 +447,7 @@ def event_handler(event):
trainer.train(
reader=reader,
num_passes=200,
event_handler=event_handler,
event_handler=event_handler_plot,
feeding=feeding)
```
......@@ -440,6 +472,41 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}
图12. CIFAR10数据集上VGG模型的分类错误率
</p>
## 应用模型
可以使用训练好的模型对图片进行分类,下面程序展示了如何使用`paddle.infer`接口进行推断,可以打开注释,更改加载的模型。
```python
from PIL import Image
import numpy as np
import os
def load_image(file):
im = Image.open(file)
im = im.resize((32, 32), Image.ANTIALIAS)
im = np.array(im).astype(np.float32)
# PIL打开图片存储顺序为H(高度),W(宽度),C(通道)。
# PaddlePaddle要求数据顺序为CHW,所以需要转换顺序。
im = im.transpose((2, 0, 1)) # CHW
# CIFAR训练图片通道顺序为B(蓝),G(绿),R(红),
# 而PIL打开图片默认通道顺序为RGB,因为需要交换通道。
im = im[(2, 1, 0),:,:] # BGR
im = im.flatten()
im = im / 255.0
return im
test_data = []
cur_dir = os.getcwd()
test_data.append((load_image(cur_dir + '/image/dog.png'),))
# with open('params_pass_50.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=out, parameters=parameters, input=test_data)
lab = np.argsort(-probs) # probs and lab are the results of one batch data
print "Label of image/dog.png is: %d" % lab[0][0]
```
## 总结
......@@ -452,7 +519,7 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}
[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.
[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.
......@@ -493,4 +560,4 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}
[22] http://cs231n.github.io/classification/
<br/>
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议</a>进行许可。
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="知识共享许可协议" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" href="http://purl.org/dc/dcmitype/Text" property="dct:title" rel="dct:type">本教程</span><a xmlns:cc="http://creativecommons.org/ns#" href="http://book.paddlepaddle.org" property="cc:attributionName" rel="cc:attributionURL">PaddlePaddle</a> 创作,采用 <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">知识共享 署名-相同方式共享 4.0 国际 许可协议</a>进行许可。
此差异已折叠。
......@@ -36,26 +36,25 @@ def conv_bn_layer(input,
return paddle.layer.batch_norm(input=tmp, act=active_type)
def shortcut(ipt, n_in, n_out, stride):
if n_in != n_out:
return conv_bn_layer(ipt, n_out, 1, stride, 0,
def shortcut(ipt, ch_in, ch_out, stride):
if ch_in != ch_out:
return conv_bn_layer(ipt, ch_out, 1, stride, 0,
paddle.activation.Linear())
else:
return ipt
def basicblock(ipt, ch_out, stride):
ch_in = ch_out * 2
def basicblock(ipt, ch_in, ch_out, stride):
tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
short = shortcut(ipt, ch_in, ch_out, stride)
return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
def layer_warp(block_func, ipt, features, count, stride):
tmp = block_func(ipt, features, stride)
def layer_warp(block_func, ipt, ch_in, ch_out, count, stride):
tmp = block_func(ipt, ch_in, ch_out, stride)
for i in range(1, count):
tmp = block_func(tmp, features, 1)
tmp = block_func(tmp, ch_out, ch_out, 1)
return tmp
......@@ -66,9 +65,9 @@ def resnet_cifar10(ipt, depth=32):
nStages = {16, 64, 128}
conv1 = conv_bn_layer(
ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = paddle.layer.img_pool(
input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
return pool
......@@ -12,20 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License
import sys
import sys, os
import paddle.v2 as paddle
from vgg import vgg_bn_drop
from resnet import resnet_cifar10
with_gpu = os.getenv('WITH_GPU', '0') != '0'
def main():
datadim = 3 * 32 * 32
classdim = 10
# PaddlePaddle init
paddle.init(use_gpu=False, trainer_count=1)
paddle.init(use_gpu=with_gpu, trainer_count=1)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(datadim))
......@@ -36,9 +38,8 @@ def main():
# option 2. vgg
net = vgg_bn_drop(image)
out = paddle.layer.fc(input=net,
size=classdim,
act=paddle.activation.Softmax())
out = paddle.layer.fc(
input=net, size=classdim, act=paddle.activation.Softmax())
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(classdim))
......@@ -54,8 +55,11 @@ def main():
learning_rate=0.1 / 128.0,
learning_rate_decay_a=0.1,
learning_rate_decay_b=50000 * 100,
learning_rate_schedule='discexp',
batch_size=128)
learning_rate_schedule='discexp')
# Create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=momentum_optimizer)
# End batch and end pass event handler
def event_handler(event):
......@@ -67,6 +71,10 @@ def main():
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
# save parameters
with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
trainer.save_parameter_to_tar(f)
result = trainer.test(
reader=paddle.batch(
paddle.dataset.cifar.test10(), batch_size=128),
......@@ -74,10 +82,11 @@ def main():
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=momentum_optimizer)
# Save the inference topology to protobuf.
inference_topology = paddle.topology.Topology(layers=out)
with open("inference_topology.pkl", 'wb') as f:
inference_topology.serialize_for_inference(f)
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
......@@ -88,6 +97,40 @@ def main():
feeding={'image': 0,
'label': 1})
# inference
from PIL import Image
import numpy as np
import os
def load_image(file):
im = Image.open(file)
im = im.resize((32, 32), Image.ANTIALIAS)
im = np.array(im).astype(np.float32)
# The storage order of the loaded image is W(widht),
# H(height), C(channel). PaddlePaddle requires
# the CHW order, so transpose them.
im = im.transpose((2, 0, 1)) # CHW
# In the training phase, the channel order of CIFAR
# image is B(Blue), G(green), R(Red). But PIL open
# image in RGB mode. It must swap the channel order.
im = im[(2, 1, 0), :, :] # BGR
im = im.flatten()
im = im / 255.0
return im
test_data = []
cur_dir = os.path.dirname(os.path.realpath(__file__))
test_data.append((load_image(cur_dir + '/image/dog.png'), ))
# users can remove the comments and change the model name
# with open('params_pass_50.tar', 'r') as f:
# parameters = paddle.parameters.Parameters.from_tar(f)
probs = paddle.infer(
output_layer=out, parameters=parameters, input=test_data)
lab = np.argsort(-probs) # probs and lab are the results of one batch data
print "Label of image/dog.png is: %d" % lab[0][0]
if __name__ == '__main__':
main()
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册