diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..2e95e732c58ecdeb092d3cae16cece794ae1ff6d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +Dockerfile +.git/ +.gitignore diff --git a/.gitignore b/.gitignore index 0a0dd02414c32ede8d58d2556709827f9a98bf5c..e7f8501f2c04d0ddb9a27202b3e91d33c47d9de8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ +deprecated +*~ pandoc.template -.DS_Store \ No newline at end of file +.DS_Store +.idea +py_env* +*.ipynb +build diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..dcf88e09e5b8ece0b6360d8ed5bab544e068b6cb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "paddle"] + path = paddle + url = https://github.com/PaddlePaddle/Paddle.git + branch = develop diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d8234b69cb092a25eb884a754600168f9a67f75..e06e02f312e8c594a28249e7e9d32eb5a60bf7e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,42 @@ -- repo: https://github.com/Lucas-C/pre-commit-hooks.git - sha: c25201a00e6b0514370501050cf2a8538ac12270 +- repo: https://github.com/pre-commit/mirrors-yapf.git + sha: v0.16.0 hooks: - - id: remove-crlf -- repo: https://github.com/reyoung/mirrors-yapf.git - sha: v0.13.2 - hooks: - - id: yapf - files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ # Bazel BUILD files follow Python syntax. + - id: yapf + files: \.py$ - repo: https://github.com/pre-commit/pre-commit-hooks - sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469 + sha: a11d9314b22d8f8c7556443875b731ef05965464 hooks: - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*paddle)^.*$ - id: end-of-file-fixer + files: \.md$ + - id: trailing-whitespace + files: \.md$ +- repo: https://github.com/Lucas-C/pre-commit-hooks + sha: v1.0.1 + hooks: + - id: forbid-crlf + files: \.md$ + - id: remove-crlf + files: \.md$ + - id: forbid-tabs + files: \.md$ + - id: remove-tabs + files: \.md$ +- repo: https://github.com/reyoung/pre-commit-hooks-jinja-compile.git + sha: 4a369cc72a4a2b8d3813ab8cc17abb5f5b21ef6c + hooks: + - id: convert-jinja2-into-html + # The argument means repleace filename from pattern `.*/([^/]*)\.tmpl` to `\1` + args: ['--filename_pattern=.*/([^/]*)\.tmpl', '--filename_repl=\1'] +- repo: local + hooks: + - id: convert-markdown-into-html + name: convert-markdown-into-html + description: Convert README.md into index.html and README.cn.md into index.cn.html + entry: python .pre-commit-hooks/convert_markdown_into_html.py + language: system + files: .+README(\.cn)?\.md$ + diff --git a/.tmpl/convert-markdown-into-html.sh b/.pre-commit-hooks/convert_markdown_into_html.py old mode 100755 new mode 100644 similarity index 61% rename from .tmpl/convert-markdown-into-html.sh rename to .pre-commit-hooks/convert_markdown_into_html.py index 149c686bc502b7fed97453e0769a7ef6ee841b76..66f44ef23c5d9a82436dfbe4b6bcdfc4e69ab55a --- a/.tmpl/convert-markdown-into-html.sh +++ b/.pre-commit-hooks/convert_markdown_into_html.py @@ -1,8 +1,8 @@ -markdown_file=$1 +import argparse +import re +import sys -# Notice: the single-quotes around EOF below make outputs -# verbatium. c.f. http://stackoverflow.com/a/9870274/724872 -cat <<'EOF' +HEAD = """ - - + + + + +
+ + + diff --git a/.tools/theme/PP_w.png b/.tools/theme/PP_w.png new file mode 100644 index 0000000000000000000000000000000000000000..bc58b0b458135773fcde5ee941ea095e3d4d07a0 Binary files /dev/null and b/.tools/theme/PP_w.png differ diff --git a/.tmpl/github-markdown.css b/.tools/theme/github-markdown.css similarity index 99% rename from .tmpl/github-markdown.css rename to .tools/theme/github-markdown.css index 42e38eba8cf4a5bc93c256843fbd765dc80facea..97615ce2c76b02ce6b4f95e11a71b54f7c313014 100644 --- a/.tmpl/github-markdown.css +++ b/.tools/theme/github-markdown.css @@ -188,7 +188,7 @@ } .markdown-body a { - color: #4078c0; + color: #597cf1; text-decoration: none; } diff --git a/.tmpl/marked.js b/.tools/theme/marked.js similarity index 99% rename from .tmpl/marked.js rename to .tools/theme/marked.js index 3c4fbe885422d11cdfdea4dfcdb71c3f42ef2022..0499d1d4e383ee3f866b9f9eed91ae775fe3da10 100644 --- a/.tmpl/marked.js +++ b/.tools/theme/marked.js @@ -1093,7 +1093,7 @@ function escape(html, encode) { } function unescape(html) { - // explicitly match decimal, hex, and named HTML entities + // explicitly match decimal, hex, and named HTML entities return html.replace(/&(#(?:\d+)|(?:#x[0-9A-Fa-f]+)|(?:\w+));?/g, function(_, n) { n = n.toLowerCase(); if (n === 'colon') return ':'; diff --git a/.travis.yml b/.travis.yml index f5409798d0baed1d518ff8c85c84fcad0d0fa701..f212f1b82a619d13a1277f8c2193b3892528cae5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,10 +14,20 @@ addons: - python - python-pip - python2.7-dev + - golang + ssh_known_hosts: 52.76.173.135 before_install: - - pip install virtualenv pre-commit + - sudo pip install -U virtualenv pre-commit pip + - GOPATH=/tmp/go go get -u github.com/wangkuiyi/ipynb/markdown-to-ipynb script: - - travis/precommit.sh + - PATH=/tmp/go/bin:$PATH .travis/precommit.sh + - | + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then echo "not develop branch, no deploy"; exit 0; fi; + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh + export BOOK_DIR=`pwd` + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $BOOK_DIR notifications: email: on_success: change diff --git a/travis/precommit.sh b/.travis/precommit.sh similarity index 94% rename from travis/precommit.sh rename to .travis/precommit.sh index 204ee0533ebfdf93dd19c7a6af2353c30a10abb8..bcbfb2bb530ca6fecd1ac4c9e049c292a61e5e64 100755 --- a/travis/precommit.sh +++ b/.travis/precommit.sh @@ -13,7 +13,9 @@ export PATH=/usr/bin:$PATH pre-commit install if ! 
pre-commit run -a ; then + ls -lh git diff --exit-code + exit 1 fi trap : 0 diff --git a/fit_a_line/.gitignore b/01.fit_a_line/.gitignore similarity index 100% rename from fit_a_line/.gitignore rename to 01.fit_a_line/.gitignore diff --git a/fit_a_line/README.md b/01.fit_a_line/README.cn.md similarity index 59% rename from fit_a_line/README.md rename to 01.fit_a_line/README.cn.md index c8becc9de54649648ba27654c607d9ca68ae53fa..e197408caed7ffce907c1d31df644df63e7c4e0a 100644 --- a/fit_a_line/README.md +++ b/01.fit_a_line/README.cn.md @@ -1,7 +1,7 @@ # 线性回归 让我们从经典的线性回归(Linear Regression \[[1](#参考文献)\])模型开始这份教程。在这一章里,你将使用真实的数据集建立起一个房价预测模型,并且了解到机器学习中的若干重要概念。 -本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +本教程源代码目录在[book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/137.html)。 ## 背景介绍 给定一个大小为$n$的数据集 ${\{y_{i}, x_{i1}, ..., x_{id}\}}_{i=1}^{n}$,其中$x_{i1}, \ldots, x_{id}$是第$i$个样本$d$个属性上的取值,$y_i$是该样本待预测的目标。线性回归模型假设目标$y_i$可以被属性间的线性组合描述,即 @@ -15,8 +15,8 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo ## 效果展示 我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。

+ 图1. 预测值 V.S. 真实值

## 模型概览 @@ -29,7 +29,7 @@ $$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ $\hat{Y}$ 表示模型的预测结果,用来和真实值$Y$区分。模型要学习的参数即:$\omega_1, \ldots, \omega_{13}, b$。 -建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实质通常用来反映模型误差的大小。 +建立模型后,我们需要给模型一个优化目标,使得学到的参数能够让预测值$\hat{Y}$尽可能地接近真实值$Y$。这里我们引入损失函数([Loss Function](https://en.wikipedia.org/wiki/Loss_function),或Cost Function)这个概念。 输入任意一个数据样本的目标值$y_{i}$和模型给出的预测值$\hat{y_{i}}$,损失函数输出一个非负的实值。这个实值通常用来反映模型误差的大小。 对于线性回归模型来讲,最常见的损失函数就是均方误差(Mean Squared Error, [MSE](https://en.wikipedia.org/wiki/Mean_squared_error))了,它的形式是: @@ -45,14 +45,25 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ 3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。 4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。 +## 数据集 -## 数据准备 -执行以下命令来准备数据: -```bash -cd data && python prepare_data.py +### 数据集接口的封装 +首先加载需要的包 + +```python +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing ``` -这段代码将从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)下载数据并进行[预处理](#数据预处理),最后数据将被分为训练集和测试集。 +我们通过uci_housing模块引入了数据集合[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) + +其中,在uci_housing模块中封装了: + +1. 数据下载的过程。下载数据保存在~/.cache/paddle/dataset/uci_housing/housing.data。 +2. [数据预处理](#数据预处理)的过程。 + + +### 数据集介绍 这份数据集共506行,每行包含了波士顿郊区的一类房屋的相关信息及该类房屋价格的中位数。其各维属性的意义如下: | 属性名 | 解释 | 类型 | @@ -85,94 +96,171 @@ cd data && python prepare_data.py - 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。

+ 图2. 各维属性的取值范围
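A minimal NumPy sketch of the normalization described above (subtract the mean, divide by the width of the original value range); this is an editor's illustration, and `features` is a hypothetical toy matrix, not data from the tutorial:

```python
import numpy as np

# Hypothetical matrix: rows are samples, columns are the 13 attributes.
features = np.random.rand(100, 13) * 50.0

# Subtract the per-column mean and divide by the per-column range,
# so every attribute ends up on a comparable scale.
value_range = features.max(axis=0) - features.min(axis=0)
normalized = (features - features.mean(axis=0)) / value_range
```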

#### 整理训练集与测试集 -我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。一种常见的分割比例为$8:2$,感兴趣的读者朋友们也可以尝试不同的设置来观察这两种误差的变化。 +我们将数据集分割为两份:一份用于调整模型的参数,即进行模型的训练,模型在这份数据集上的误差被称为**训练误差**;另外一份被用来测试,模型在这份数据集上的误差被称为**测试误差**。我们训练模型的目的是为了通过从训练数据中找到规律来预测未知的新数据,所以测试误差是更能反映模型表现的指标。分割数据的比例要考虑到两个因素:更多的训练数据会降低参数估计的方差,从而得到更可信的模型;而更多的测试数据会降低测试误差的方差,从而得到更可信的测试误差。我们这个例子中设置的分割比例为$8:2$ + + +在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 + +## 训练 + +`fit_a_line/trainer.py`演示了训练的整体过程。 + +### 初始化PaddlePaddle -执行如下命令可以分割数据集,并将训练集和测试集的地址分别写入train.list 和 test.list两个文件中,供PaddlePaddle读取。 ```python -python prepare_data.py -r 0.8 #默认使用8:2的比例进行分割 +paddle.init(use_gpu=False, trainer_count=1) ``` -在更复杂的模型训练过程中,我们往往还会多使用一种数据集:验证集。因为复杂的模型中常常还有一些超参数([Hyperparameter](https://en.wikipedia.org/wiki/Hyperparameter_optimization))需要调节,所以我们会尝试多种超参数的组合来分别训练多个模型,然后对比它们在验证集上的表现选择相对最好的一组超参数,最后才使用这组参数下训练的模型在测试集上评估测试误差。由于本章训练的模型比较简单,我们暂且忽略掉这个过程。 +### 模型配置 + +线性回归的模型其实就是一个采用线性激活函数(linear activation,`LinearActivation`)的全连接层(fully-connected layer,`fc_layer`): + +```python +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) +y_predict = paddle.layer.fc(input=x, + size=1, + act=paddle.activation.Linear()) +y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) +``` -### 提供数据给PaddlePaddle -准备好数据之后,我们使用一个Python data provider来为PaddlePaddle的训练过程提供数据。一个 data provider 就是一个Python函数,它会被PaddlePaddle的训练过程调用。在这个例子里,只需要读取已经保存好的数据,然后一行一行地返回给PaddlePaddle的训练进程即可。 +### 保存网络拓扑 ```python -from paddle.trainer.PyDataProvider2 import * -import numpy as np -#定义数据的类型和维度 -@provider(input_types=[dense_vector(13), dense_vector(1)]) -def process(settings, input_file): - data = np.load(input_file.strip()) - for row in data: - yield row[:-1].tolist(), row[-1:].tolist() +# Save the inference topology to protobuf. 
+inference_topology = paddle.topology.Topology(layers=y_predict) +with open("inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) +``` + +### 创建参数 +```python +parameters = paddle.parameters.create(cost) ``` -## 模型配置说明 +### 创建Trainer -### 数据定义 -首先,通过 `define_py_data_sources2` 来配置PaddlePaddle从上面的`dataprovider.py`里读入训练数据和测试数据。 PaddlePaddle接受从命令行读入的配置信息,例如这里我们传入一个名为`is_predict`的变量来控制模型在训练和测试时的不同结构。 ```python -from paddle.trainer_config_helpers import * +optimizer = paddle.optimizer.Momentum(momentum=0) + +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) +``` -is_predict = get_config_arg('is_predict', bool, False) +### 读取数据且打印训练的中间信息 -define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process') +PaddlePaddle提供一个 +[reader机制](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader) +来读取数据。 Reader返回的数据可以包括多列,我们需要一个Python dict把列 +序号映射到网络里的数据层。 +```python +feeding={'x': 0, 'y': 1} ``` -### 算法配置 -接着,指定模型优化算法的细节。由于线性回归模型比较简单,我们只要设置基本的`batch_size`即可,它指定每次更新参数的时候使用多少条数据计算梯度信息。 +此外,我们还可以提供一个 event handler,来打印训练的进度: + ```python -settings(batch_size=2) +# event_handler to print training and testing info +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) ``` -### 网络结构 -最后,使用`fc_layer`和`LinearActivation`来表示线性回归的模型本身。 ```python -#输入数据,13维的房屋信息 -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: #训练时,我们使用MSE,即regression_cost作为损失函数 - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) #训练时输出MSE来监控损失的变化 -else: #测试时,输出预测值 - outputs(y_predict) +# event_handler to print training and testing info +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 + +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 10 == 0: # every 10 batches, record a train cost + cost_ploter.append(train_title, step, event.cost) + + if step % 100 == 0: # every 100 batches, record a test cost + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + cost_ploter.append(test_title, step, result.cost) + + if step % 100 == 0: # every 100 batches, update cost plot + cost_ploter.plot() + + step += 1 + + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) ``` -## 训练模型 -在对应代码的根目录下执行PaddlePaddle的命令行训练程序。这里指定模型配置文件为`trainer_config.py`,训练30轮,结果保存在`output`路径下。 -```bash -./train.sh +### 开始训练 + +```python +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding=feeding, + event_handler=event_handler_plot, + num_passes=30) ``` -## 应用模型 -现在来看下如何使用已经训练好的模型进行预测。 -```bash -python predict.py +![png](./image/train_and_test.png) + +### 应用模型 + +#### 1. 
生成测试数据 + +```python +test_data_creator = paddle.dataset.uci_housing.test() +test_data = [] +test_label = [] + +for item in test_data_creator(): + test_data.append((item[0],)) + test_label.append(item[1]) + if len(test_data) == 5: + break ``` -这里默认使用`output/pass-00029`中保存的模型进行预测,并将数据中的房价与预测结果进行对比,结果保存在 `predictions.png`中。 -如果你想使用别的模型或者其它的数据进行预测,只要传入新的路径即可: -```bash -python predict.py -m output/pass-00020 -t data/housing.test.npy + +#### 2. 推测 inference + +```python +# load parameters from tar file. +# users can remove the comments and change the model name +# with open('params_pass_20.tar', 'r') as f: +# parameters = paddle.parameters.Parameters.from_tar(f) + +probs = paddle.infer( + output_layer=y_predict, parameters=parameters, input=test_data) + +for i in xrange(len(probs)): + print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0]) ``` ## 总结 @@ -186,4 +274,4 @@ python predict.py -m output/pass-00020 -t data/housing.test.npy 4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128.
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/01.fit_a_line/README.md b/01.fit_a_line/README.md new file mode 100644 index 0000000000000000000000000000000000000000..31d64d897cd53347c0866def355fc68650e28bf9 --- /dev/null +++ b/01.fit_a_line/README.md @@ -0,0 +1,283 @@ +# Linear Regression +Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict home prices. Some important concepts in Machine Learning will be covered through this example. + +The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book). + +## Problem Setup +Suppose we have a dataset of $n$ real estate properties. Each real estate property will be referred to as **homes** in this chapter for clarity. + +Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby. + +In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely, + +$$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$ + +where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\]. + +## Results Demonstration +We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the closer the point is to the dotted line, better the model's prediction. +

+ Figure 1. Predicted Value V.S. Actual Value
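As a small numeric illustration of the linear combination defined in the problem setup above (an editor's sketch with made-up weights and attributes, not tutorial code):

```python
import numpy as np

# Toy home with d = 3 attributes (all values are made up).
x = np.array([0.5, 1.2, -0.3])
w = np.array([2.0, -1.0, 0.5])  # omega_1 .. omega_d
b = 4.0                         # bias

# y = omega_1 * x_1 + ... + omega_d * x_d + b
y = np.dot(w, x) + b
print(y)  # 3.65
```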

+ +## Model Overview + +### Model Definition + +In the UCI Housing Data Set, there are 13 home attributes $\{x_{i,j}\}$ that are related to the median home price $y_i$, which we aim to predict. Thus, our model can be written as: + +$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ + +where $\hat{Y}$ is the predicted value used to differentiate from actual value $Y$. The model learns parameters $\omega_1, \ldots, \omega_{13}, b$, where the entries of $\vec{\omega}$ are **weights** and $b$ is **bias**. + +Now we need an objective to optimize, so that the learned parameters can make $\hat{Y}$ as close to $Y$ as possible. Let's refer to the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). A loss function must output a non-negative value, given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$. This value reflects the magnitutude of the model error. + +For Linear Regression, the most common loss function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form: + +$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ + +That is, for a dataset of size $n$, MSE is the average value of the the prediction sqaure errors. + +### Training + +After setting up our model, there are several major steps to go through to train it: +1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s. +2. Feedforward. Evaluate the network output and compute the corresponding loss. +3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors. +4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of epochs is reached. + +## Dataset + +### Python Dataset Modules + +Our program starts with importing necessary packages: + +```python +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing +``` + +We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`. This module can + +1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if you haven't yet, and +2. [preprocess](#preprocessing) the dataset. + +### An Introduction of the Dataset + +The UCI housing dataset has 506 instances. Each instance describes the attributes of a house in surburban Boston. The attributes are explained below: + +| Attribute Name | Characteristic | Data Type | +| ------| ------ | ------ | +| CRIM | per capita crime rate by town | Continuous| +| ZN | proportion of residential land zoned for lots over 25,000 sq.ft. 
| Continuous | +| INDUS | proportion of non-retail business acres per town | Continuous | +| CHAS | Charles River dummy variable | Discrete, 1 if tract bounds river; 0 otherwise| +| NOX | nitric oxides concentration (parts per 10 million) | Continuous | +| RM | average number of rooms per dwelling | Continuous | +| AGE | proportion of owner-occupied units built prior to 1940 | Continuous | +| DIS | weighted distances to five Boston employment centres | Continuous | +| RAD | index of accessibility to radial highways | Continuous | +| TAX | full-value property-tax rate per $10,000 | Continuous | +| PTRATIO | pupil-teacher ratio by town | Continuous | +| B | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | Continuous | +| LSTAT | % lower status of the population | Continuous | +| MEDV | Median value of owner-occupied homes in $1000's | Continuous | + +The last entry is the median home price. + +### Preprocessing +#### Continuous and Discrete Data +We define a feature vector of length 13 for each home, where each entry corresponds to an attribute. Our first observation is that, among the 13 dimensions, there are 12 continuous dimensions and 1 discrete dimension. + +Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically. The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing. + +#### Feature Normalization +We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* has a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range. + +There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling): +- A value range that is too large or too small might cause floating number overflow or underflow during computation. +- Different value ranges might result in varying *importances* of different features to the model (at least in the beginning of the training process). This assumption about the data is often unreasonable, making the optimization difficult, which in turn results in increased training time. 
+- Many machine learning techniques or models (e.g., *L1/L2 regularization* and *Vector Space Model*) assume that all the features have roughly zero means and that their value ranges are similar.

+ Figure 2. The value ranges of the features
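As a quick numeric check of the MSE loss defined in the Model Overview above (an editor's sketch with made-up numbers, not code from the tutorial):

```python
import numpy as np

# Made-up target prices and model outputs for n = 3 samples.
y_actual = np.array([24.0, 21.6, 34.7])
y_predicted = np.array([22.0, 23.6, 33.7])

# MSE = (1/n) * sum((y_hat_i - y_i)^2)
mse = np.mean((y_predicted - y_actual) ** 2)
print(mse)  # (4 + 4 + 1) / 3 = 3.0
```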

+ +#### Prepare Training and Test Sets +We split the dataset in two, one for adjusting the model parameters, namely, for training the model, and the other for testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$. + + +When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process. + + +## Training + +`fit_a_line/trainer.py` demonstrates the training using [PaddlePaddle](http://paddlepaddle.org). + +### Initialize PaddlePaddle + +```python +paddle.init(use_gpu=False, trainer_count=1) +``` + +### Model Configuration + +Linear regression is essentially a fully-connected layer with linear activation: + +```python +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) +y_predict = paddle.layer.fc(input=x, + size=1, + act=paddle.activation.Linear()) +y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) +cost = paddle.layer.square_error_cost(input=y_predict, label=y) +``` + +### Save Topology + +```python +# Save the inference topology to protobuf. +inference_topology = paddle.topology.Topology(layers=y_predict) +with open("inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) +``` + + +### Create Parameters + +```python +parameters = paddle.parameters.create(cost) +``` + +### Create Trainer + +```python +optimizer = paddle.optimizer.Momentum(momentum=0) + +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) +``` + +### Feeding Data + +PaddlePaddle provides the +[reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader) +for loading the training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers. 
+ +```python +feeding={'x': 0, 'y': 1} +``` + +Moreover, an event handler is provided to print the training progress: + +```python +# event_handler to print training and testing info +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) +``` + +```python +# event_handler to plot training and testing info +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +plot_cost = Ploter(train_title, test_title) + +step = 0 + +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 10 == 0: # every 10 batches, record a train cost + plot_cost.append(train_title, step, event.cost) + + if step % 100 == 0: # every 100 batches, record a test cost + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding=feeding) + plot_cost.append(test_title, step, result.cost) + + if step % 100 == 0: # every 100 batches, update cost plot + plot_cost.plot() + + step += 1 + + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) +``` + +### Start Training + +```python +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding=feeding, + event_handler=event_handler_plot, + num_passes=30) +``` + +![png](./image/train_and_test.png) + +### Apply model + +#### 1. generate testing data + +```python +test_data_creator = paddle.dataset.uci_housing.test() +test_data = [] +test_label = [] + +for item in test_data_creator(): + test_data.append((item[0],)) + test_label.append(item[1]) + if len(test_data) == 5: + break +``` + +#### 2. inference + +```python +# load parameters from tar file. +# users can remove the comments and change the model name +# with open('params_pass_20.tar', 'r') as f: +# parameters = paddle.parameters.Parameters.from_tar(f) + +probs = paddle.infer( + output_layer=y_predict, parameters=parameters, input=test_data) + +for i in xrange(len(probs)): + print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0]) +``` + +## Summary +This chapter introduces *Linear Regression* and how to train and test this model with PaddlePaddle, using the UCI Housing Data Set. Because a large number of more complex models and techniques are derived from linear regression, it is important to understand its underlying theory and limitation. + + +## References +1. https://en.wikipedia.org/wiki/Linear_regression +2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. +3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. +4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. + +
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/01.fit_a_line/fit_a_line.tar b/01.fit_a_line/fit_a_line.tar new file mode 100644 index 0000000000000000000000000000000000000000..4a0e79a02d43bdede95e25a7570c12f3104da185 Binary files /dev/null and b/01.fit_a_line/fit_a_line.tar differ diff --git a/fit_a_line/image/predictions.png b/01.fit_a_line/image/predictions.png similarity index 100% rename from fit_a_line/image/predictions.png rename to 01.fit_a_line/image/predictions.png diff --git a/fit_a_line/image/predictions_en.png b/01.fit_a_line/image/predictions_en.png similarity index 100% rename from fit_a_line/image/predictions_en.png rename to 01.fit_a_line/image/predictions_en.png diff --git a/01.fit_a_line/image/ranges.png b/01.fit_a_line/image/ranges.png new file mode 100644 index 0000000000000000000000000000000000000000..5d86b12715f46afbafb7d50e2938e184219b5b95 Binary files /dev/null and b/01.fit_a_line/image/ranges.png differ diff --git a/fit_a_line/image/ranges_en.png b/01.fit_a_line/image/ranges_en.png similarity index 100% rename from fit_a_line/image/ranges_en.png rename to 01.fit_a_line/image/ranges_en.png diff --git a/01.fit_a_line/image/train_and_test.png b/01.fit_a_line/image/train_and_test.png new file mode 100644 index 0000000000000000000000000000000000000000..bcd304a6a0baf30ecfbc43e08fc0aca179d05958 Binary files /dev/null and b/01.fit_a_line/image/train_and_test.png differ diff --git a/fit_a_line/index.html b/01.fit_a_line/index.cn.html similarity index 62% rename from fit_a_line/index.html rename to 01.fit_a_line/index.cn.html index 7bb9e8a2f6b69b766f17eb72d5cd9d9844138b2d..3154f30f1d0701dfd1f84d5ada65641faf49c19c 100644 --- a/fit_a_line/index.html +++ b/01.fit_a_line/index.cn.html @@ -1,3 +1,4 @@ + - - + + + + + +
+ + + + + + + diff --git a/01.fit_a_line/infer.py b/01.fit_a_line/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..1a271aa662cd4eb59d4d500224abee8fb8b48cb3 --- /dev/null +++ b/01.fit_a_line/infer.py @@ -0,0 +1,17 @@ +import paddle.v2 as paddle + +# Initialize PaddlePaddle. +paddle.init(use_gpu=False, trainer_count=1) + +# Configure the neural network. +x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) +y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + +# Infer using provided test data. +probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + +for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) diff --git a/01.fit_a_line/train.py b/01.fit_a_line/train.py new file mode 100644 index 0000000000000000000000000000000000000000..38f267aaedfab98db9451de577ba9a33a91e0262 --- /dev/null +++ b/01.fit_a_line/train.py @@ -0,0 +1,83 @@ +import os +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + + +def main(): + # init + paddle.init(use_gpu=with_gpu, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.square_error_cost(input=y_predict, label=y) + + # Save the inference topology to protobuf. + inference_topology = paddle.topology.Topology(layers=y_predict) + with open("inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=optimizer) + + feeding = {'x': 0, 'y': 1} + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + if event.pass_id % 10 == 0: + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( + reader=paddle.batch(uci_housing.test(), batch_size=2), + feeding=feeding) + print "Test %d, Cost %f" % (event.pass_id, result.cost) + + # training + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle(uci_housing.train(), buf_size=500), + batch_size=2), + feeding=feeding, + event_handler=event_handler, + num_passes=30) + + # inference + test_data_creator = paddle.dataset.uci_housing.test() + test_data = [] + test_label = [] + + for item in test_data_creator(): + test_data.append((item[0], )) + test_label.append(item[1]) + if len(test_data) == 5: + break + + # load parameters from tar file. 
+ # users can remove the comments and change the model name + # with open('params_pass_20.tar', 'r') as f: + # parameters = paddle.parameters.Parameters.from_tar(f) + + probs = paddle.infer( + output_layer=y_predict, parameters=parameters, input=test_data) + + for i in xrange(len(probs)): + print "label=" + str(test_label[i][0]) + ", predict=" + str(probs[i][0]) + + +if __name__ == '__main__': + main() diff --git a/recognize_digits/.gitignore b/02.recognize_digits/.gitignore similarity index 100% rename from recognize_digits/.gitignore rename to 02.recognize_digits/.gitignore diff --git a/recognize_digits/README.md b/02.recognize_digits/README.cn.md similarity index 71% rename from recognize_digits/README.md rename to 02.recognize_digits/README.cn.md index 6423f6af121e8797f1f0999e093e5d320f20ad02..0d4d8c474bb2e4fbef3a41458f071f111c3b67cb 100644 --- a/recognize_digits/README.md +++ b/02.recognize_digits/README.cn.md @@ -1,6 +1,6 @@ # 识别数字 -本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/recognize_digits), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +本教程源代码目录在[book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/167.html)。 ## 背景介绍 当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。 @@ -12,7 +12,7 @@ MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。 -Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 +Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程中提出了卷积神经网络(Convolutional Neural Network),大幅度地提高了手写字符的识别能力,也因此成为了深度学习领域的奠基人之一。如今的深度学习领域,卷积神经网络占据了至关重要的地位,从最早Yann LeCun提出的简单LeNet,到如今ImageNet大赛上的优胜模型VGGNet、GoogLeNet、ResNet等(请参见[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) 教程),人们在图像分类领域,利用卷积神经网络得到了一系列惊人的结果。 有很多算法在MNIST上进行实验。1998年,LeCun分别用单层线性分类器、多层感知器(Multilayer Perceptron, MLP)和多层卷积神经网络LeNet进行实验,使得测试集上的误差不断下降(从12%下降到0.7%)\[[1](#参考文献)\]。此后,科学家们又基于K近邻(K-Nearest Neighbors)算法\[[2](#参考文献)\]、支持向量机(SVM)\[[3](#参考文献)\]、神经网络\[[4-7](#参考文献)\]和Boosting方法\[[8](#参考文献)\]等做了大量实验,并采用多种预处理方法(如去除歪曲、去噪、模糊等)来提高识别的准确率。 @@ -32,15 +32,15 @@ Yann LeCun早先在手写字符识别上做了很多研究,并在研究过程 输入层的数据$X$传到输出层,在激活操作之前,会乘以相应的权重 $W$ ,并加上偏置变量 $b$ ,具体如下: -$$ y_i = softmax(\sum_j W_{i,j}x_j + b_i) $$ +$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ -其中 $ softmax(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ +其中 $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ -对于有 $N$ 个类别的多分类问题,指定 $N$ 个输出节点,$N$ 维输入特征经过softmax将归一化为 $N$ 个[0,1]范围内的实数值,分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。 +对于有 $N$ 个类别的多分类问题,指定 $N$ 个输出节点,$N$ 维结果向量经过softmax将归一化为 $N$ 个[0,1]范围内的实数值,分别表示该样本属于这 $N$ 个类别的概率。此处的 $y_i$ 即对应该图片为数字 $i$ 的预测概率。 在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy),公式如下: 
-$$ crossentropy(label, y) = -\sum_i label_ilog(y_i) $$ +$$ \text{crossentropy}(label, y) = -\sum_i label_ilog(y_i) $$ 图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 @@ -55,8 +55,8 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。 2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。 -3. 最后,再经过输出层,得到的$Y=softmax(W_3H_2 + b_3)$,即为最后的分类结果向量。 - +3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。 + 图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。 @@ -67,48 +67,40 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层 ### 卷积神经网络(Convolutional Neural Network, CNN) -在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图6显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。 +在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。


+图4. LeNet-5卷积神经网络结构

#### 卷积层 -卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。 +卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。

+图5. 卷积层图片

-图4给出一个卷积计算过程的示例图,输入图像大小为$H=5,W=5,D=3$,即$5x5$大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用$K$表示)组卷积核,即图中$Filter W_0$和$Filter W_1$,在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含3($D$)个$3x3$(用$FXF$表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向($W$方向)和垂直方向($H$方向)的滑动步长为2(用$S$表示);对输入图像周围各填充1(用$P$表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为$3x3x2$(用$H_{o}xW_{o}xK$表示)大小的特征图,即$3x3$大小的2通道特征图,其中$H_o$计算公式为:$H_o = (H - F + 2*P)/S + 1$,$W_o$同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置($b_o$),偏置通常对于每个输出特征图是共享的。例如图中输出特征图`o[:,:,0]`中的第一个$2$计算如下: - -\begin{align} -o[0,0,0] = \sum x[0:3,0:3,0] * w_{0}[:,:,0]] + \sum x[0:3,0:3,1] * w_{0}[:,:,1]] + \sum x[0:3,0:3,2] * w_{0}[:,:,2]] + b_0 = 2 \\\\ -\sum x[0:3,0:3,0] * w_{0}[:,:,0]] & = 0*1 + 0*1 + 0*1 + 0*1 + 1*1 + 2*(-1) + 0*(-1) + 0*1 + 0*(-1) = -1 \\\\ -\sum x[0:3,0:3,1] * w_{0}[:,:,1]] & = 0*0 + 0*1 + 0*1 + 0*(-1) + 0*0 + 1*1 + 0*1 + 2*0 + 1*1 = 2 \\\\ -\sum x[0:3,0:3,2] * w_{0}[:,:,2]] & = 0*(-1) + 0*1 + 0*(-1) + 0*0 + 1*1 + 1*0 + 0*(-1) + 1*0 + 1*(-1) = 0 \\\\ -b_0 & = 1\\\\ -\end{align} +图5给出一个卷积计算过程的示例图,输入图像大小为$H=5,W=5,D=3$,即$5 \times 5$大小的3通道(RGB,也称作深度)彩色图像。这个示例图中包含两(用$K$表示)组卷积核,即图中滤波器$W_0$和$W_1$。在卷积计算中,通常对不同的输入通道采用不同的卷积核,如图示例中每组卷积核包含($D=3)$个$3 \times 3$(用$F \times F$表示)大小的卷积核。另外,这个示例中卷积核在图像的水平方向($W$方向)和垂直方向($H$方向)的滑动步长为2(用$S$表示);对输入图像周围各填充1(用$P$表示)个0,即图中输入层原始数据为蓝色部分,灰色部分是进行了大小为1的扩展,用0来进行扩展。经过卷积操作得到输出为$3 \times 3 \times 2$(用$H_{o} \times W_{o} \times K$表示)大小的特征图,即$3 \times 3$大小的2通道特征图,其中$H_o$计算公式为:$H_o = (H - F + 2 \times P)/S + 1$,$W_o$同理。 而输出特征图中的每个像素,是每组滤波器与输入图像每个特征图的内积再求和,再加上偏置$b_o$,偏置通常对于每个输出特征图是共享的。输出特征图$o[:,:,0]$中的最后一个$-2$计算如图5右下角公式所示。 -在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为$DxFxFxK$。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 +在卷积操作中卷积核是可学习的参数,经过上面示例介绍,每层卷积的参数大小为$D \times F \times F \times K$。在多层感知器模型中,神经元通常是全部连接,参数较多。而卷积层的参数较少,这也是由卷积层的主要特性即局部连接和共享权重所决定。 -- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 +- 局部连接:每个神经元仅与输入神经元的一块区域连接,这块局部区域称作感受野(receptive field)。在图像卷积操作中,即神经元在空间维度(spatial dimension,即上图示例H和W所在的平面)是局部连接,但在深度上是全部连接。对于二维图像本身而言,也是局部像素关联较强。这种局部连接保证了学习后的过滤器能够对于局部的输入特征有最强的响应。局部连接的思想,也是受启发于生物学里面的视觉系统结构,视觉皮层的神经元就是局部接受信息的。 - 权重共享:计算同一个深度切片的神经元时采用的滤波器是共享的。例如图4中计算$o[:,:,0]$的每个每个神经元的滤波器均相同,都为$W_0$,这样可以很大程度上减少参数。共享权重在一定程度上讲是有意义的,例如图片的底层边缘特征与特征在图中的具体位置无关。但是在一些场景中是无意的,比如输入的图片是人脸,眼睛和头发位于不同的位置,希望在不同的位置学到不同的特征 (参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/))。请注意权重只是对于同一深度切片的神经元是共享的,在卷积层,通常采用多组卷积核提取不同特征,即对应不同深度切片的特征,不同深度切片的神经元权重是不共享。另外,偏重对同一深度切片的所有神经元都是共享的。 -通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 +通过介绍卷积计算过程及其特性,可以看出卷积是线性操作,并具有平移不变性(shift-invariant),平移不变性即在图像每个位置执行相同的操作。卷积层的局部连接和权重共享使得需要学习的参数大大减小,这样也有利于训练较大卷积神经网络。 #### 池化层


+图6. 池化层图片

-池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图5所示。 +池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对于每个矩形框的数取最大值作为输出层,如图6所示。 更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。 @@ -181,7 +173,7 @@ def convolutional_neural_network(img): num_channel=1, pool_size=2, pool_stride=2, - act=paddle.activation.Tanh()) + act=paddle.activation.Relu()) # 第二个卷积-池化层 conv_pool_2 = paddle.networks.simple_img_conv_pool( input=conv_pool_1, @@ -190,13 +182,9 @@ def convolutional_neural_network(img): num_channel=20, pool_size=2, pool_stride=2, - act=paddle.activation.Tanh()) - # 全连接层 - fc1 = paddle.layer.fc(input=conv_pool_2, - size=128, - act=paddle.activation.Tanh()) + act=paddle.activation.Relu()) # 以softmax为激活函数的全连接输出层,输出层的大小必须为数字的个数10 - predict = paddle.layer.fc(input=fc1, + predict = paddle.layer.fc(input=conv_pool_2, size=10, act=paddle.activation.Softmax()) return predict @@ -213,9 +201,9 @@ images = paddle.layer.data( label = paddle.layer.data( name='label', type=paddle.data_type.integer_value(10)) -predict = softmax_regression(images) # Softmax回归 -#predict = multilayer_perceptron(images) #多层感知器 -#predict = convolutional_neural_network(images) #LeNet5卷积神经网络 +# predict = softmax_regression(images) # Softmax回归 +# predict = multilayer_perceptron(images) #多层感知器 +predict = convolutional_neural_network(images) #LeNet5卷积神经网络 cost = paddle.layer.classification_cost(input=predict, label=label) ``` @@ -242,8 +230,40 @@ trainer = paddle.trainer.SGD(cost=cost, 下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B —— reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。 -`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader —— 在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minbatch。 +`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader —— 在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。 + +`event_handler_plot`可以用来在训练过程中画图如下: + +![png](./image/train_and_test.png) +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 + +# event_handler to plot a figure +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 100 == 0: + cost_ploter.append(train_title, step, event.cost) + cost_ploter.plot() + step += 1 + if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=paddle.batch( + paddle.dataset.mnist.test(), batch_size=128)) + cost_ploter.append(test_title, step, result.cost) +``` + +`event_handler` 用来在训练过程中输出训练结果 ```python lists = [] @@ -253,20 +273,26 @@ def event_handler(event): print "Pass %d, Batch %d, Cost %f, %s" % ( event.pass_id, event.batch_id, event.cost, event.metrics) if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=paddle.reader.batched( + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=paddle.batch( paddle.dataset.mnist.test(), batch_size=128)) print "Test with Pass %d, Cost %f, %s\n" % ( event.pass_id, result.cost, result.metrics) lists.append((event.pass_id, 
result.cost, result.metrics['classification_error_evaluator'])) +``` +```python trainer.train( - reader=paddle.reader.batched( + reader=paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=8192), batch_size=128), - event_handler=event_handler, - num_passes=100) + event_handler=event_handler_plot, + num_passes=5) ``` 训练过程是完全自动的,event_handler里打印的日志类似如下所示: @@ -282,6 +308,32 @@ trainer.train( 训练之后,检查模型的预测准确度。用 MNIST 训练的时候,一般 softmax回归模型的分类准确率为约为 92.34%,多层感知器为97.66%,卷积神经网络可以达到 99.20%。 + +## 应用模型 + +可以使用训练好的模型对手写体数字图片进行分类,下面程序展示了如何使用paddle.infer接口进行推断。 + +```python +from PIL import Image +import numpy as np +import os +def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).astype(np.float32).flatten() + im = im / 255.0 * 2.0 - 1.0 + return im + +test_data = [] +cur_dir = os.getcwd() +test_data.append((load_image(cur_dir + '/image/infer_3.png'),)) + +probs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data) +lab = np.argsort(-probs) # probs and lab are the results of one batch data +print "Label of image/infer_3.png is: %d" % lab[0][0] +``` + ## 总结 本教程的softmax回归、多层感知器和卷积神经网络是最基础的深度学习模型,后续章节中复杂的神经网络都是从它们衍生出来的,因此这几个模型对之后的学习大有裨益。同时,我们也观察到从最简单的softmax回归变换到稍复杂的卷积神经网络的时候,MNIST数据集上的识别准确率有了大幅度的提升,原因是卷积层具有局部连接和共享权重的特性。在之后学习新模型的时候,希望大家也要深入到新模型相比原模型带来效果提升的关键之处。此外,本教程还介绍了PaddlePaddle模型搭建的基本流程,从dataprovider的编写、网络层的构建,到最后的训练和预测。对这个流程熟悉以后,大家就可以用自己的数据,定义自己的网络模型,并完成自己的训练和预测任务了。 @@ -297,7 +349,7 @@ trainer.train( 7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. 8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. 9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. -10. Bishop, Christopher M. ["Pattern recognition."](http://s3.amazonaws.com/academia.edu.documents/30428242/bg0137.pdf?AWSAccessKeyId=AKIAJ56TQJRTWSMTNPEA&Expires=1484816640&Signature=85Ad6%2Fca8T82pmHzxaSXermovIA%3D&response-content-disposition=inline%3B%20filename%3DPattern_recognition_and_machine_learning.pdf) Machine Learning 128 (2006): 1-58. +10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58.
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/02.recognize_digits/README.md b/02.recognize_digits/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6292be0df2f99e75f5d6c74e70e526552d14d36b --- /dev/null +++ b/02.recognize_digits/README.md @@ -0,0 +1,365 @@ +# Recognize Digits + +The source code for this tutorial is here: [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits). For instructions on getting started with Paddle, please refer to [installation instructions](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book). + +## Introduction +When one learns to program, the first task is usually to write a program that prints "Hello World!". In Machine Learning or Deep Learning, an equivalent task is to train a model to recognize hand-written digits using the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. Handwriting recognition is a classic image classification problem. The problem is relatively easy and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a $28\times28$ matrix, and the label is one of the digits from $0$ to $9$. All images are normalized, meaning that they are both rescaled and centered. + +

+Fig. 1. Examples of MNIST images

+ +The MNIST dataset is from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). The SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students. Therefore the SD-3 is cleaner and easier to recognize than the SD-1 dataset. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set of 60,000 samples and test set of 10,000 samples. 250 annotators labeled the training set, thus guaranteed that there wasn't a complete overlap of annotators of training set and test set. + +The MNIST dataset has been used for evaluating many image recognition algorithms such as a single layer linear classifier, Multilayer Perceptron (MLP) and Multilayer CNN LeNet\[[1](#references)\], K-Nearest Neighbors (k-NN) \[[2](#references)\], Support Vector Machine (SVM) \[[3](#references)\], Neural Networks \[[4-7](#references)\], Boosting \[[8](#references)\] and preprocessing methods like distortion removal, noise removal, and blurring. Among these algorithms, the *Convolutional Neural Network* (CNN) has achieved a series of impressive results in Image Classification tasks, including VGGNet, GoogLeNet, and ResNet (See [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) tutorial). + +In this tutorial, we start with a simple **softmax** regression model and go on with MLP and CNN. Readers will see how these methods improve the recognition accuracy step-by-step. + + +## Model Overview + +Before introducing classification algorithms and training procedure, we define the following symbols: +- $X$ is the input: Input is a $28\times 28$ MNIST image. It is flattened to a $784$ dimensional vector. $X=\left (x_0, x_1, \dots, x_{783} \right )$. +- $Y$ is the output: Output of the classifier is 1 of the 10 classes (digits from 0 to 9). $Y=\left (y_0, y_1, \dots, y_9 \right )$. Each dimension $y_i$ represents the probability that the input image belongs to class $i$. +- $L$ is the ground truth label: $L=\left ( l_0, l_1, \dots, l_9 \right )$. It is also 10 dimensional, but only one entry is $1$ and all others are $0$s. + +### Softmax Regression + +In a simple softmax regression model, the input is first fed to fully connected layers. Then, a softmax function is applied to output probabilities of multiple output classes\[[9](#references)\]. + +The input $X$ is multiplied by weights $W$ and then added to the bias $b$ to generate activations. + +$$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$ + +where $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ + +For an $N$-class classification problem with $N$ output nodes, Softmax normalizes the resulting $N$ dimensional vector so that each of its entries falls in the range $[0,1]\in {R}$, representing the probability that the sample belongs to a certain class. Here $y_i$ denotes the predicted probability that an image is of digit $i$. + +In such a classification problem, we usually use the cross entropy loss function: + +$$ \text{crossentropy}(label, y) = -\sum_i label_ilog(y_i) $$ + +Fig. 2 illustrates a softmax regression network, with the weights in blue, and the bias in red. `+1` indicates that the bias is $1$. + +

+Fig. 2. Softmax regression network architecture
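To make the two formulas above concrete, here is a minimal NumPy sketch (an editor's illustration with made-up activations; not code from the tutorial):

```python
import numpy as np

def softmax(z):
    # Subtracting the max is a standard trick for numerical stability;
    # it does not change the result.
    e = np.exp(z - z.max())
    return e / e.sum()

z = np.array([2.0, 1.0, 0.1])      # made-up pre-softmax activations
label = np.array([1.0, 0.0, 0.0])  # one-hot ground-truth label

y = softmax(z)                     # probabilities that sum to 1
loss = -np.sum(label * np.log(y))  # cross entropy; here just -log(y[0])
```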

+ +### Multilayer Perceptron + +The softmax regression model described above uses the simplest two-layer neural network. That is, it only contains an input layer and an output layer, with limited regression capability. To achieve better recognition results, consider adding several hidden layers\[[10](#references)\] between the input layer and the output layer. + +1. After the first hidden layer, we get $ H_1 = \phi(W_1X + b_1) $, where $\phi$ denotes the activation function. Some [common ones](###list-of-common-activation-functions) are sigmoid, tanh and ReLU. +2. After the second hidden layer, we get $ H_2 = \phi(W_2H_1 + b_2) $. +3. Finally, the output layer outputs $Y=\text{softmax}(W_3H_2 + b_3)$, the vector denoting our classification result. + +Fig. 3. shows a Multilayer Perceptron network, with the weights in blue, and the bias in red. +1 indicates that the bias is $1$. + +

+
+Fig. 3. Multilayer Perceptron network architecture
+ +
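+
+The following minimal NumPy sketch traces the three equations above, using the same hidden-layer sizes as the `multilayer_perceptron` network configured later in this tutorial (the weights here are random and untrained, for illustration only):
+
+```python
+import numpy as np
+
+def relu(x):
+    return np.maximum(0.0, x)
+
+def softmax(x):
+    e = np.exp(x - np.max(x))
+    return e / e.sum()
+
+rng = np.random.RandomState(0)
+X = rng.rand(784)                          # a flattened 28x28 input image
+W1, b1 = 0.01 * rng.randn(128, 784), np.zeros(128)
+W2, b2 = 0.01 * rng.randn(64, 128), np.zeros(64)
+W3, b3 = 0.01 * rng.randn(10, 64), np.zeros(10)
+
+H1 = relu(W1.dot(X) + b1)                  # first hidden layer
+H2 = relu(W2.dot(H1) + b2)                 # second hidden layer
+Y = softmax(W3.dot(H2) + b3)               # 10 class probabilities
+print(Y.sum())                             # 1.0
+```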

+ +### Convolutional Neural Network + +#### Convolutional Layer + +

+
+Fig. 4. Convolutional layer
+

+
+The **convolutional layer** is the core of a Convolutional Neural Network. The parameters in this layer are composed of a set of filters, also called kernels. We can visualize the convolution step as follows: each kernel slides horizontally and vertically over the input until it has covered the whole image. At every window position, we compute the dot product of the kernel and the input, then add the bias and apply an activation function. The result is a two-dimensional activation map. For example, one kernel may respond strongly to corners while another responds to circles; each kernel learns to detect its corresponding feature.
+
+Fig. 4 illustrates the computation of a convolutional layer, with the depth dimension flattened for simplicity. The input is $W_1=5$, $H_1=5$, $D_1=3$. In fact, this is a common representation for color images: $W_1$ and $H_1$ correspond to the width and height, and $D_1$ corresponds to the three RGB color channels. The parameters of the convolutional layer are $K=2$, $F=3$, $S=2$, $P=1$. $K$ denotes the number of kernels; here, the filters $W_0$ and $W_1$ are the kernels. $F$ is the kernel size: $W_0$ and $W_1$ are both $F \times F = 3 \times 3$ matrices in all depths. $S$ is the stride, the step size of the sliding window; here, kernels move rightwards or downwards by two units at a time. $P$ is the width of the padding, an extension of the input; here, the gray area shows zero padding of size 1.
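+
+As a sanity check of these hyper-parameters, note that the output width obeys the standard formula $W_2 = (W_1 - F + 2P)/S + 1$ (and likewise for the height). The minimal Python sketch below (illustrative helper code, not part of the PaddlePaddle API) applies it to the example in Fig. 4:
+
+```python
+def conv_output_size(input_size, kernel_size, stride, padding):
+    # standard output-size formula: (W - F + 2P) / S + 1
+    return (input_size - kernel_size + 2 * padding) // stride + 1
+
+# Fig. 4: W1 = H1 = 5, F = 3, S = 2, P = 1
+print(conv_output_size(5, 3, 2, 1))  # 3, so each activation map is 3x3
+```
+
+#### Pooling Layer
+
+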

+
+Fig. 5 Pooling layer using max-pooling
+

+
+A **pooling layer** performs downsampling. Its main purpose is to reduce the amount of computation by shrinking the feature maps, and it also helps prevent over-fitting to some extent. Usually, a pooling layer is added after a convolutional layer. A pooling layer can use various techniques, such as max pooling and average pooling. As shown in Fig. 5, max pooling partitions the input into rectangular regions and outputs the maximum value of each region.
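+
+The minimal NumPy sketch below (illustrative helper code, not part of the PaddlePaddle API) implements 2x2 max pooling with stride 2, the operation shown in Fig. 5:
+
+```python
+import numpy as np
+
+def max_pool_2x2(x):
+    # x: an (H, W) feature map with even H and W; take the max of each 2x2 block
+    h, w = x.shape
+    return x.reshape(h // 2, 2, w // 2, 2).max(axis=(1, 3))
+
+fmap = np.array([[1, 3, 2, 4],
+                 [5, 6, 1, 0],
+                 [7, 2, 9, 8],
+                 [3, 1, 4, 2]], dtype=np.float32)
+print(max_pool_2x2(fmap))
+# [[6. 4.]
+#  [7. 9.]]
+```
+
+#### LeNet-5 Network
+
+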

+
+Fig. 6. LeNet-5 Convolutional Neural Network architecture
+

+
+[**LeNet-5**](http://yann.lecun.com/exdb/lenet/) is one of the simplest Convolutional Neural Networks. Fig. 6 shows its architecture: a two-dimensional input image is fed into two sets of convolutional and pooling layers, whose output is then fed to a fully connected layer and a softmax classifier. Compared to a fully connected multilayer perceptron, LeNet-5 recognizes images better. This is due to the following three properties of convolution:
+
+- The 3D nature of the neurons: a convolutional layer is organized by width, height, and depth. Neurons in each layer are connected to only a small region in the previous layer, called the receptive field.
+- Local connectivity: a CNN exploits local spatial correlation by connecting nearby neurons. This design guarantees that the learned filters respond strongly to local input features. Stacking many such layers produces a non-linear filter that becomes increasingly global. This enables the network to first obtain good representations of small parts of the input and then combine them to represent larger regions.
+- Weight sharing: in a CNN, computation is iterated with shared parameters (weights and bias) to form a feature map. This means that all the neurons in the same depth slice of the output respond to the same feature. This allows the network to detect a feature regardless of its position in the input.
+
+For more details on Convolutional Neural Networks, please refer to the tutorial on [Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) and the [relevant lecture](http://cs231n.github.io/convolutional-networks/) from a Stanford course.
+
+### List of Common Activation Functions
+- Sigmoid activation function: $ f(x) = \text{sigmoid}(x) = \frac{1}{1+e^{-x}} $
+
+- Tanh activation function: $ f(x) = \tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
+
+  In fact, tanh is just a rescaled and shifted sigmoid: $\tanh(x) = 2\,\text{sigmoid}(2x) - 1$, so its output range is $(-1, 1)$ instead of $(0, 1)$.
+
+- ReLU activation function: $ f(x) = \max(0, x) $
+
+For more information, please refer to [Activation functions on Wikipedia](https://en.wikipedia.org/wiki/Activation_function).
+
+## Data Preparation
+
+PaddlePaddle provides a Python module, `paddle.dataset.mnist`, which downloads and caches the [MNIST dataset](http://yann.lecun.com/exdb/mnist/). 
The cache is under `/home/username/.cache/paddle/dataset/mnist`:
+
+
+| File name | Description | Number of samples |
+|----------------------|--------------|-----------|
+|train-images-idx3-ubyte| Training images | 60,000 |
+|train-labels-idx1-ubyte| Training labels | 60,000 |
+|t10k-images-idx3-ubyte | Evaluation images | 10,000 |
+|t10k-labels-idx1-ubyte | Evaluation labels | 10,000 |
+
+
+## Model Configuration
+
+A PaddlePaddle program starts by importing the API package:
+
+```python
+import paddle.v2 as paddle
+```
+
+We want to use this program to demonstrate three different classifiers, each defined as a Python function:
+
+- Softmax regression: the network has a fully-connected layer with softmax activation:
+
+```python
+def softmax_regression(img):
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+```
+
+- Multilayer Perceptron: this network has two hidden fully-connected layers with ReLU activation, followed by an output layer with softmax activation:
+
+```python
+def multilayer_perceptron(img):
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+```
+
+- Convolutional network LeNet-5: the input image is fed through two convolution-pooling layers, a fully-connected layer, and the softmax output layer:
+
+```python
+def convolutional_neural_network(img):
+    # first convolution-pooling layer
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Relu())
+    # second convolution-pooling layer
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Relu())
+    # fully-connected output layer with softmax activation
+    predict = paddle.layer.fc(input=conv_pool_2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+```
+
+PaddlePaddle provides a special layer, `layer.data`, for reading data. Let us create a data layer for reading images and connect it to a classification network created with one of the above three functions. We also need a cost layer for training the model.
+
+```python
+paddle.init(use_gpu=False, trainer_count=1)
+
+images = paddle.layer.data(
+    name='pixel', type=paddle.data_type.dense_vector(784))
+label = paddle.layer.data(
+    name='label', type=paddle.data_type.integer_value(10))
+
+# predict = softmax_regression(images)       # uncomment to use softmax regression
+# predict = multilayer_perceptron(images)    # uncomment to use the MLP
+predict = convolutional_neural_network(images)  # LeNet-5 is used here
+
+cost = paddle.layer.classification_cost(input=predict, label=label)
+```
+
+Now it is time to specify the training parameters. In the following `Momentum` optimizer, `momentum=0.9` means that the update keeps 90% of the velocity from the previous iteration. The learning rate relates to the speed at which training converges. Regularization is meant to prevent over-fitting; here we use L2 regularization.
+
+```python
+parameters = paddle.parameters.create(cost)
+
+optimizer = paddle.optimizer.Momentum(
+    learning_rate=0.1 / 128.0,
+    momentum=0.9,
+    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+trainer = paddle.trainer.SGD(cost=cost,
+                             parameters=parameters,
+                             update_equation=optimizer)
+```
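+
+Before wiring up the data, here is a minimal sketch in plain Python (not a PaddlePaddle API) of the momentum update rule configured above; the velocity retains 90% of its previous value and adds the new gradient step:
+
+```python
+velocity = 0.0
+momentum, lr = 0.9, 0.1 / 128.0
+for gradient in [4.0, 3.0, 2.5]:  # made-up gradient values for illustration
+    velocity = momentum * velocity - lr * gradient
+    # in practice: parameter += velocity, applied to every weight
+    print(velocity)
+```
+
+Then we specify the training data `paddle.dataset.mnist.train()` and testing data `paddle.dataset.mnist.test()`. 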
These two methods are *reader creators*. Once called, a reader creator returns a *reader*. A reader is a Python method which, once called, returns a Python generator that yields instances of data.
+
+`shuffle` is a reader decorator. It takes a reader A as input and returns a new reader B. Under the hood, B calls A to read data in the following fashion: it copies `buffer_size` instances at a time into a buffer, shuffles them, and then yields the shuffled instances one at a time. The larger the buffer, the more thoroughly shuffled the data.
+
+`batch` is a special decorator that takes a reader and returns a *batch reader*, which yields a minibatch at a time instead of a single instance.
+
+`event_handler_plot` is used to plot a figure like the one below:
+
+![png](./image/train_and_test.png)
+
+```python
+from paddle.v2.plot import Ploter
+
+train_title = "Train cost"
+test_title = "Test cost"
+cost_ploter = Ploter(train_title, test_title)
+
+step = 0
+
+# event_handler to plot a figure
+def event_handler_plot(event):
+    global step
+    if isinstance(event, paddle.event.EndIteration):
+        if step % 100 == 0:
+            cost_ploter.append(train_title, step, event.cost)
+            cost_ploter.plot()
+        step += 1
+    if isinstance(event, paddle.event.EndPass):
+        # save parameters
+        with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+
+        result = trainer.test(reader=paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=128))
+        cost_ploter.append(test_title, step, result.cost)
+```
+
+`event_handler` is used to print the training progress as text:
+
+```python
+lists = []
+
+# event handler to print the progress
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 100 == 0:
+            print "Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+    if isinstance(event, paddle.event.EndPass):
+        # save parameters
+        with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+
+        result = trainer.test(reader=paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=128))
+        print "Test with Pass %d, Cost %f, %s\n" % (
+            event.pass_id, result.cost, result.metrics)
+        lists.append((event.pass_id, result.cost,
+                      result.metrics['classification_error_evaluator']))
+```
+
+```python
+# Train the model now
+trainer.train(
+    reader=paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=8192),
+        batch_size=128),
+    event_handler=event_handler_plot,
+    num_passes=5)
+```
+
+During training, `trainer.train` invokes the registered event handler (here `event_handler_plot`) upon certain events. This gives us a chance to report and plot the training progress.
+
+```
+# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
+# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
+# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
+# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
+# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
+# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
+```
+
+After the training, we can check the model's prediction accuracy.
+
+```python
+# find the best pass
+best = sorted(lists, key=lambda item: float(item[1]))[0]
+print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+```
+
+Usually, with MNIST data, the softmax regression model achieves an accuracy of around 92.34%, the MLP around 97.66%, and the convolutional network around 99.20%. Convolutional layers are widely considered a great invention for image processing.
+
+## Application
+
+After training, users can use the trained model to classify images. The following code shows how to run inference on MNIST images through the `paddle.infer` interface.
+
+```python
+from PIL import Image
+import numpy as np
+import os
+
+def load_image(file):
+    im = Image.open(file).convert('L')         # convert to grayscale
+    im = im.resize((28, 28), Image.ANTIALIAS)  # resize to the network input size
+    im = np.array(im).astype(np.float32).flatten()
+    im = im / 255.0 * 2.0 - 1.0                # normalize to [-1, 1]
+    return im
+
+test_data = []
+cur_dir = os.getcwd()
+test_data.append((load_image(cur_dir + '/image/infer_3.png'),))
+
+probs = paddle.infer(
+    output_layer=predict, parameters=parameters, input=test_data)
+lab = np.argsort(-probs)  # probs and lab are the results of one batch of data
+print "Label of image/infer_3.png is: %d" % lab[0][0]
+```
+
+
+## Conclusion
+
+This tutorial describes a few common deep learning models: **softmax regression**, the **Multilayer Perceptron**, and the **Convolutional Neural Network**. Understanding these models is crucial for future learning; the subsequent tutorials derive more sophisticated networks by building on top of them.
+
+When our model evolves from a simple softmax regression to a moderately complex Convolutional Neural Network, the recognition accuracy on the MNIST dataset improves substantially. This is due to the convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve the results of an old one.
+
+Moreover, this tutorial introduces the basic flow of PaddlePaddle model design, which starts with a *data provider*, proceeds to model layer construction, and finishes with training and prediction. Motivated readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.
+
+
+## References
+
+1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324.
+2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary Symbol Recognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2:753279&dswid=-434) (2014).
+3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190.
+4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003.
+5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 
2007. +6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. +7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. +8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. +9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. +10. Bishop, Christopher M. ["Pattern recognition."](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf) Machine Learning 128 (2006): 1-58. + +
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/02.recognize_digits/client/client.py b/02.recognize_digits/client/client.py new file mode 100644 index 0000000000000000000000000000000000000000..bda7cec91677c9fdc9dd2846a02acb7bee1d0b01 --- /dev/null +++ b/02.recognize_digits/client/client.py @@ -0,0 +1,24 @@ +import requests +from PIL import Image +import numpy as np +import os + +# this client is used by Paddle serve: https://github.com/PaddlePaddle/book/tree/develop/serve +# please do not use it directly + + +def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).astype(np.float32).flatten() + im = im / 255.0 + return im + + +cur_dir = os.path.dirname(os.path.realpath(__file__)) +data = load_image(cur_dir + '/../image/infer_3.png') +data = data.tolist() + +r = requests.post("http://0.0.0.0:8000", json={'img': data}) + +print(r.text) diff --git a/recognize_digits/image/cnn.png b/02.recognize_digits/image/cnn.png similarity index 100% rename from recognize_digits/image/cnn.png rename to 02.recognize_digits/image/cnn.png diff --git a/recognize_digits/image/cnn_en.png b/02.recognize_digits/image/cnn_en.png similarity index 100% rename from recognize_digits/image/cnn_en.png rename to 02.recognize_digits/image/cnn_en.png diff --git a/recognize_digits/image/cnn_train_log.png b/02.recognize_digits/image/cnn_train_log.png similarity index 100% rename from recognize_digits/image/cnn_train_log.png rename to 02.recognize_digits/image/cnn_train_log.png diff --git a/recognize_digits/image/cnn_train_log_en.png b/02.recognize_digits/image/cnn_train_log_en.png similarity index 100% rename from recognize_digits/image/cnn_train_log_en.png rename to 02.recognize_digits/image/cnn_train_log_en.png diff --git a/02.recognize_digits/image/conv_layer.png b/02.recognize_digits/image/conv_layer.png new file mode 100644 index 0000000000000000000000000000000000000000..87b1f6b83bce654d3854231b80cd710cc7cd4753 Binary files /dev/null and b/02.recognize_digits/image/conv_layer.png differ diff --git a/02.recognize_digits/image/infer_3.png b/02.recognize_digits/image/infer_3.png new file mode 100644 index 0000000000000000000000000000000000000000..030cd60d3b4af9aecd4941204da4ad15f6e1189f Binary files /dev/null and b/02.recognize_digits/image/infer_3.png differ diff --git a/recognize_digits/image/max_pooling.png b/02.recognize_digits/image/max_pooling.png similarity index 100% rename from recognize_digits/image/max_pooling.png rename to 02.recognize_digits/image/max_pooling.png diff --git a/recognize_digits/image/max_pooling_en.png b/02.recognize_digits/image/max_pooling_en.png similarity index 100% rename from recognize_digits/image/max_pooling_en.png rename to 02.recognize_digits/image/max_pooling_en.png diff --git a/recognize_digits/image/mlp.png b/02.recognize_digits/image/mlp.png similarity index 100% rename from recognize_digits/image/mlp.png rename to 02.recognize_digits/image/mlp.png diff --git a/recognize_digits/image/mlp_en.png b/02.recognize_digits/image/mlp_en.png similarity index 100% rename from recognize_digits/image/mlp_en.png rename to 02.recognize_digits/image/mlp_en.png diff --git a/recognize_digits/image/mlp_train_log.png b/02.recognize_digits/image/mlp_train_log.png similarity index 100% rename from recognize_digits/image/mlp_train_log.png rename to 02.recognize_digits/image/mlp_train_log.png diff --git 
a/recognize_digits/image/mlp_train_log_en.png b/02.recognize_digits/image/mlp_train_log_en.png similarity index 100% rename from recognize_digits/image/mlp_train_log_en.png rename to 02.recognize_digits/image/mlp_train_log_en.png diff --git a/recognize_digits/image/mnist_example_image.png b/02.recognize_digits/image/mnist_example_image.png similarity index 100% rename from recognize_digits/image/mnist_example_image.png rename to 02.recognize_digits/image/mnist_example_image.png diff --git a/recognize_digits/image/softmax_regression.png b/02.recognize_digits/image/softmax_regression.png similarity index 100% rename from recognize_digits/image/softmax_regression.png rename to 02.recognize_digits/image/softmax_regression.png diff --git a/recognize_digits/image/softmax_regression_en.png b/02.recognize_digits/image/softmax_regression_en.png similarity index 100% rename from recognize_digits/image/softmax_regression_en.png rename to 02.recognize_digits/image/softmax_regression_en.png diff --git a/recognize_digits/image/softmax_train_log.png b/02.recognize_digits/image/softmax_train_log.png similarity index 100% rename from recognize_digits/image/softmax_train_log.png rename to 02.recognize_digits/image/softmax_train_log.png diff --git a/recognize_digits/image/softmax_train_log_en.png b/02.recognize_digits/image/softmax_train_log_en.png similarity index 100% rename from recognize_digits/image/softmax_train_log_en.png rename to 02.recognize_digits/image/softmax_train_log_en.png diff --git a/02.recognize_digits/image/train_and_test.png b/02.recognize_digits/image/train_and_test.png new file mode 100644 index 0000000000000000000000000000000000000000..5cb87b450d0398bcfaec0e647c362052069797e7 Binary files /dev/null and b/02.recognize_digits/image/train_and_test.png differ diff --git a/recognize_digits/index.html b/02.recognize_digits/index.cn.html similarity index 66% rename from recognize_digits/index.html rename to 02.recognize_digits/index.cn.html index d34d1a7cbd87df66060692c63c5a8e2339c6e3ba..2a8fed233c64bea117fcc014e507f4cb2a2ccbb9 100644 --- a/recognize_digits/index.html +++ b/02.recognize_digits/index.cn.html @@ -1,3 +1,4 @@ + - - + + + + + +
+
+ + + + + + + diff --git a/02.recognize_digits/train.py b/02.recognize_digits/train.py new file mode 100644 index 0000000000000000000000000000000000000000..af7d000640b850b90012470c01f163253de228ed --- /dev/null +++ b/02.recognize_digits/train.py @@ -0,0 +1,129 @@ +import os +from PIL import Image +import numpy as np +import paddle.v2 as paddle + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + + +def softmax_regression(img): + predict = paddle.layer.fc( + input=img, size=10, act=paddle.activation.Softmax()) + return predict + + +def multilayer_perceptron(img): + # The first fully-connected layer + hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) + # The second fully-connected layer and the according activation function + hidden2 = paddle.layer.fc( + input=hidden1, size=64, act=paddle.activation.Relu()) + # The thrid fully-connected layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc( + input=hidden2, size=10, act=paddle.activation.Softmax()) + return predict + + +def convolutional_neural_network(img): + # first conv layer + conv_pool_1 = paddle.networks.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + num_channel=1, + pool_size=2, + pool_stride=2, + act=paddle.activation.Relu()) + # second conv layer + conv_pool_2 = paddle.networks.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + num_channel=20, + pool_size=2, + pool_stride=2, + act=paddle.activation.Relu()) + # fully-connected layer + predict = paddle.layer.fc( + input=conv_pool_2, size=10, act=paddle.activation.Softmax()) + return predict + + +def main(): + paddle.init(use_gpu=with_gpu, trainer_count=1) + + # define network topology + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(784)) + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(10)) + + # Here we can build the prediction network in different ways. Please + # choose one by uncomment corresponding line. 
+ # predict = softmax_regression(images) + # predict = multilayer_perceptron(images) + predict = convolutional_neural_network(images) + + cost = paddle.layer.classification_cost(input=predict, label=label) + + parameters = paddle.parameters.create(cost) + + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) + + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=optimizer) + + lists = [] + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=paddle.batch( + paddle.dataset.mnist.test(), batch_size=128)) + print "Test with Pass %d, Cost %f, %s\n" % ( + event.pass_id, result.cost, result.metrics) + lists.append((event.pass_id, result.cost, + result.metrics['classification_error_evaluator'])) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128), + event_handler=event_handler, + num_passes=5) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) + print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) + + def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).astype(np.float32).flatten() + im = im / 255.0 * 2.0 - 1.0 + return im + + test_data = [] + cur_dir = os.path.dirname(os.path.realpath(__file__)) + test_data.append((load_image(cur_dir + '/image/infer_3.png'), )) + + probs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data) + lab = np.argsort(-probs) # probs and lab are the results of one batch data + print "Label of image/infer_3.png is: %d" % lab[0][0] + + +if __name__ == '__main__': + main() diff --git a/image_classification/.gitignore b/03.image_classification/.gitignore similarity index 100% rename from image_classification/.gitignore rename to 03.image_classification/.gitignore diff --git a/image_classification/README.md b/03.image_classification/README.cn.md similarity index 82% rename from image_classification/README.md rename to 03.image_classification/README.cn.md index 538760d429d6f250eed2ce578e10001526d11abb..7e210d7452e583154dc7cc94abf818c16d983d25 100644 --- a/image_classification/README.md +++ b/03.image_classification/README.cn.md @@ -1,9 +1,9 @@ -图像分类 -======= -本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +# 图像分类 -## 背景介绍 +本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。 + +## 背景介绍 图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 @@ -51,7 +51,7 @@ 2). 
**特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - + 这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 @@ -67,8 +67,8 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得


-图5. CNN网络示例[20] -

+图5. CNN网络示例[20] +

- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 - 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 @@ -108,7 +108,7 @@ GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普


-图8. GoogleNet[12] +图8. GoogleNet[12]

@@ -136,7 +136,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类 ## 数据准备 -通用图像分类公开的标准数据集常用的有[CIFAR]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。 @@ -173,24 +173,24 @@ paddle.init(use_gpu=False, trainer_count=1) 1. 定义数据输入及其维度 - 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 - - ```python + 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 + + ```python datadim = 3 * 32 * 32 classdim = 10 image = paddle.layer.data( name="image", type=paddle.data_type.dense_vector(datadim)) - ``` + ``` 2. 定义VGG网络核心模块 - ```python - net = vgg_bn_drop(image) - ``` - VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - - ```python + ```python + net = vgg_bn_drop(image) + ``` + VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: + + ```python def vgg_bn_drop(input): def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): return paddle.networks.img_conv_group( @@ -219,40 +219,40 @@ paddle.init(use_gpu=False, trainer_count=1) layer_attr=paddle.attr.Extra(drop_rate=0.5)) fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) return fc2 - ``` - - 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, - - 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - - 2.3. 最后接两层512维的全连接。 + ``` + + 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.networks`中预定义的模块,由若干组 Conv->BN->ReLu->Dropout 和 一组 Pooling 组成。 + + 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 + + 2.3. 最后接两层512维的全连接。 3. 定义分类器 - 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 + 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 - ```python + ```python out = paddle.layer.fc(input=net, size=classdim, act=paddle.activation.Softmax()) - ``` + ``` 4. 定义损失函数和网络输出 - 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - - ```python + 在有监督训练中需要输入图像对应的类别信息,同样通过`paddle.layer.data`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 + + ```python lbl = paddle.layer.data( name="label", type=paddle.data_type.integer_value(classdim)) cost = paddle.layer.classification_cost(input=out, label=lbl) - ``` + ``` ### ResNet ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 ```python -net = resnet_cifar10(data, depth=56) +net = resnet_cifar10(image, depth=56) ``` 先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 @@ -305,9 +305,9 @@ def layer_warp(block_func, ipt, features, count, stride): `resnet_cifar10` 的连接结构主要有以下几个过程。 -1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 +1. 底层输入连接一层 `conv_bn_layer`,即带BN的卷积层。 2. 然后连接3组残差模块即下面配置3组 `layer_warp` ,每组采用图 10 左边残差模块组成。 -3. 最后对网络做均值池化并返回该层。 +3. 
最后对网络做均值池化并返回该层。 注意:除过第一层卷积层和最后一层全连接层之外,要求三组 `layer_warp` 总的含参层数能够被6整除,即 `resnet_cifar10` 的 depth 要满足 $(depth - 2) % 6 == 0$ 。 @@ -356,8 +356,7 @@ momentum_optimizer = paddle.optimizer.Momentum( learning_rate=0.1 / 128.0, learning_rate_decay_a=0.1, learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - batch_size=128) + learning_rate_schedule='discexp') # Create trainer trainer = paddle.trainer.SGD(cost=cost, @@ -375,7 +374,7 @@ $$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ cifar.train10()每次产生一条样本,在完成shuffle和batch之后,作为训练的输入。 ```python -reader=paddle.reader.batch( +reader=paddle.batch( paddle.reader.shuffle( paddle.dataset.cifar.train10(), buf_size=50000), batch_size=128) @@ -390,6 +389,36 @@ feeding={'image': 0, 可以使用`event_handler`回调函数来观察训练过程,或进行测试等, 该回调函数是`trainer.train`函数里设定。 +`event_handler_plot`可以用来利用回调数据来打点画图: + +![png](./image/train_and_test.png) + +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 1 == 0: + cost_ploter.append(train_title, step, event.cost) + cost_ploter.plot() + step += 1 + if isinstance(event, paddle.event.EndPass): + + result = trainer.test( + reader=paddle.batch( + paddle.dataset.cifar.test10(), batch_size=128), + feeding=feeding) + cost_ploter.append(test_title, step, result.cost) +``` + +`event_handler` 用来在训练过程中输出文本日志 + ```python # End batch and end pass event handler def event_handler(event): @@ -401,11 +430,14 @@ def event_handler(event): sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( - reader=paddle.reader.batch( + reader=paddle.batch( paddle.dataset.cifar.test10(), batch_size=128), - reader_dict={'image': 0, - 'label': 1}) + feeding=feeding) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` @@ -415,7 +447,7 @@ def event_handler(event): trainer.train( reader=reader, num_passes=200, - event_handler=event_handler, + event_handler=event_handler_plot, feeding=feeding) ``` @@ -440,6 +472,41 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123} 图12. CIFAR10数据集上VGG模型的分类错误率

+## 应用模型 + +可以使用训练好的模型对图片进行分类,下面程序展示了如何使用`paddle.infer`接口进行推断,可以打开注释,更改加载的模型。 + +```python +from PIL import Image +import numpy as np +import os +def load_image(file): + im = Image.open(file) + im = im.resize((32, 32), Image.ANTIALIAS) + im = np.array(im).astype(np.float32) + # PIL打开图片存储顺序为H(高度),W(宽度),C(通道)。 + # PaddlePaddle要求数据顺序为CHW,所以需要转换顺序。 + im = im.transpose((2, 0, 1)) # CHW + # CIFAR训练图片通道顺序为B(蓝),G(绿),R(红), + # 而PIL打开图片默认通道顺序为RGB,因为需要交换通道。 + im = im[(2, 1, 0),:,:] # BGR + im = im.flatten() + im = im / 255.0 + return im + +test_data = [] +cur_dir = os.getcwd() +test_data.append((load_image(cur_dir + '/image/dog.png'),)) + +# with open('params_pass_50.tar', 'r') as f: +# parameters = paddle.parameters.Parameters.from_tar(f) + +probs = paddle.infer( + output_layer=out, parameters=parameters, input=test_data) +lab = np.argsort(-probs) # probs and lab are the results of one batch data +print "Label of image/dog.png is: %d" % lab[0][0] +``` + ## 总结 @@ -452,7 +519,7 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123} [2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. +[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. [4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. @@ -493,4 +560,4 @@ Test with Pass 0, {'classification_error_evaluator': 0.885200023651123} [22] http://cs231n.github.io/classification/
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。
diff --git a/03.image_classification/README.md b/03.image_classification/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..68f9fcbc4b8f5410072955ba51af2d4442f472d7
--- /dev/null
+++ b/03.image_classification/README.md
@@ -0,0 +1,572 @@
+
+Image Classification
+=======================
+
+The source code for this chapter is at [book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification). First-time users should refer to the PaddlePaddle [Installation Tutorial](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book) for installation instructions.
+
+## Background
+
+Compared to text, images convey information more vividly and intuitively, often with an artistic quality, and they are an important medium for people to express and exchange ideas. In this chapter, we focus on one of the essential problems in image recognition -- image classification.
+
+Image classification is the task of distinguishing images of different categories based on their semantic meaning. It is a core problem in computer vision and the foundation of higher-level computer vision tasks such as object detection, image segmentation, object tracking and action recognition. Image classification has applications in many areas, such as face recognition, intelligent video analysis in security systems, traffic scene recognition in transportation systems, content-based image retrieval and automatic photo indexing in web services, and image classification in medicine.
+
+To classify an image, we first encode the entire image using handcrafted or learned features and then determine the category using a classifier. Thus, feature extraction plays an important role in image classification. Prior to deep learning, the BoW (Bag of Words) model was the most widely used method for classifying images as well as objects. The BoW technique was introduced in Natural Language Processing, where a training sentence is represented as a bag of words. In the context of image classification, the BoW model requires constructing a dictionary. The simplest BoW framework can be designed with three steps: **feature extraction**, **feature encoding** and **classifier design**.
+
+With deep learning, image classification can be framed as a supervised or unsupervised learning problem that learns hierarchical features automatically, without any need for manually crafted features. In recent years, Convolutional Neural Networks (CNNs) have made significant progress in image classification. CNNs use raw image pixels as input, extract low-level and high-level abstract features through convolution operations, and directly output the classification results from the model. This style of end-to-end learning has led not only to increased performance but also to wider adoption in various applications.
+
+In this chapter, we introduce deep-learning-based image classification methods and explain how to train a CNN model using PaddlePaddle.
+
+## Demonstration
+
+An image can be classified by a general-purpose image classifier as well as by a fine-grained one.
+
+
+Figure 1 shows the results of a general image classifier -- the trained model can correctly recognize the main objects in the images.
+
+

+
+Figure 1. General image classification +

+
+
+Figure 2 shows the results of a fine-grained image classifier. This task of flower recognition requires correctly recognizing the flower's category.
+
+

+
+Figure 2. Fine-grained image classification +

+
+
+A good model should recognize objects of different categories correctly. Its results should not vary due to viewpoint variation, illumination conditions, object distortion or occlusion.
+Figure 3 shows some images with various disturbances. A good model should classify these images correctly, as humans do.
+
+

+
+Figure 3. Disturbed images [22] +

+
+## Model Overview
+
+A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [ImageNet](http://image-net.org/). Many image classification algorithms are evaluated and compared on these datasets. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is the dataset of the Large Scale Visual Recognition Challenge (ILSVRC), started in 2010. In this chapter, we introduce some image classification models from the submissions to these competitions.
+
+Before 2012, traditional image classification was accomplished with the three steps described in the background section. A complete model construction usually involved the following stages: low-level feature extraction, feature encoding, spatial constraint or feature clustering, classifier design, and model ensembling.
+
+ 1). **Low-level feature extraction**: This step extracts large amounts of local features according to fixed strides and scales. Popular local features include the Scale-Invariant Feature Transform (SIFT) [1], Histogram of Oriented Gradients (HOG) [2], Local Binary Patterns (LBP) [3], etc. A common practice is to employ multiple feature descriptors in order to avoid missing too much information.
+
+ 2). **Feature encoding**: Low-level features contain a large amount of redundancy and noise. In order to improve the robustness of the features, it is necessary to employ a feature transformation to encode low-level features. This is called feature encoding. Common feature encoding methods include vector quantization [4], sparse coding [5], locality-constrained linear coding [6], Fisher vector encoding [7], etc.
+
+ 3). **Spatial constraint**: Spatial constraint or feature clustering is usually adopted after feature encoding for extracting the maximum or average of each dimension in the spatial domain. Pyramid feature matching--a popular feature clustering method--divides an image uniformly into patches and performs feature clustering in each patch.
+
+ 4). **Classification**: After the above steps, an image can be described by a vector of fixed dimension. Then a classifier can be used to classify the image into categories. Common classifiers include Support Vector Machines (SVM), random forests, etc. Kernel SVM is the most popular classifier and has achieved very good performance in traditional image classification tasks.
+
+This method has been widely used as an image classification algorithm in PASCAL VOC [18]. [NEC Labs](http://www.nec-labs.com/) won the championship in ILSVRC 2010 by employing SIFT and LBP features, two non-linear encoders and an SVM classifier [8].
+
+The CNN model AlexNet, proposed by Alex Krizhevsky et al. [9], made a breakthrough in ILSVRC 2012. It dramatically outperformed traditional methods and won the ILSVRC championship that year. This was also the first time a deep learning method was used for large-scale image classification. Since AlexNet, a series of CNN models have been proposed that have steadily advanced the state of the art on ImageNet, as shown in Figure 4. With deeper and more sophisticated architectures, the top-5 error rate has dropped lower and lower (to around 3.5%). The error rate of human raters on the same ImageNet dataset is about 5.1%, which means the image classification capability of deep learning models has surpassed that of human raters.
+
+

+
+Figure 4. Top-5 error rates on ILSVRC image classification +

+ +### CNN + +Traditional CNNs consist of convolutional and fully-connected layers and use the softmax multi-category classifier with the cross-entropy loss function. Figure 5 shows a typical CNN. We first introduce the common components of a CNN. + +

+
+Figure 5. A CNN example [20] +

+
+- convolutional layer: this layer uses the convolution operation to extract (low-level and high-level) features and to discover local correlation and spatial invariance.
+
+- pooling layer: this layer downsamples feature maps by extracting the local max (max-pooling) or average (avg-pooling) value of each patch in the feature map.
+
+- fully-connected layer: this layer fully connects neurons between two adjacent layers.
+
+- non-linear activation: convolutional and fully-connected layers are usually followed by some non-linear activation layers. Non-linearities enhance the expressive capability of the network. Some examples of non-linear activation functions are Sigmoid, Tanh and ReLU. ReLU is the most commonly used activation function in CNNs.
+
+- Dropout [10]: at each training stage, individual nodes are dropped out of the network with a certain probability. This improves the network's ability to generalize and avoids overfitting.
+
+Parameter updates at each layer during training cause the input distributions of subsequent layers to change, which in turn requires hyper-parameters to be tuned carefully. In 2015, Sergey Ioffe and Christian Szegedy proposed the Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer and thereby keeps the distribution in each layer relatively stable. Not only does the BN algorithm act as a regularizer, it also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN accelerates training convergence, and it has been widely used in later, deeper models.
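+
+As a minimal illustration of what BN computes, here is a NumPy sketch of the normalization step only (it ignores the moving averages that a real implementation uses at inference time):
+
+```python
+import numpy as np
+
+def batch_norm(x, gamma=1.0, beta=0.0, eps=1e-5):
+    # x: (batch_size, num_features); normalize each feature over the batch,
+    # then scale and shift with the learnable parameters gamma and beta
+    mean = x.mean(axis=0)
+    var = x.var(axis=0)
+    return gamma * (x - mean) / np.sqrt(var + eps) + beta
+
+batch = 5.0 * np.random.RandomState(0).randn(128, 4) + 3.0
+out = batch_norm(batch)
+print(out.mean(axis=0))  # approx. 0 for every feature
+print(out.std(axis=0))   # approx. 1 for every feature
+```
+
+In the following sections, we will introduce these network architectures: VGG, GoogleNet and ResNet.
+
+### VGG
+
+The Oxford Visual Geometry Group (VGG) proposed the VGG network in ILSVRC 2014 [11]. This model is deeper and wider than previous neural architectures. It consists of five main groups of convolution operations, with adjacent convolution groups connected via max-pooling layers. Each group contains a series of 3x3 convolutional layers (i.e. kernels). The number of convolution kernels stays the same within each group and increases from 64 in the first group to 512 in the last one. The total number of learnable layers can be 11, 13, 16, or 19 depending on the number of convolutional layers in each group. Figure 6 illustrates a 16-layer VGG. The neural architecture of VGG is relatively simple and has been adopted by many papers, such as the first one that surpassed human-level performance on ImageNet [19].
+
+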

+
+Figure 6. VGG16 model for ImageNet +

+
+### GoogleNet
+
+GoogleNet [12] won the ILSVRC championship in 2014. GoogleNet borrowed some ideas from the Network in Network (NIN) model [13] and is built on Inception blocks. Let us familiarize ourselves with these first.
+
+The two main characteristics of the NIN model are:
+
+1) A single-layer convolutional network is replaced with a Multi-Layer Perceptron Convolution (MLPconv). MLPconv is a tiny multi-layer convolutional network. It enhances non-linearity by adding several 1x1 convolutional layers after linear ones.
+
+2) In traditional CNNs, the last few layers are usually fully-connected, with a large number of parameters. In contrast, NIN replaces all fully-connected layers with convolutional layers whose feature maps are of the same size as the category dimension, followed by global average pooling. This replacement of fully-connected layers significantly reduces the number of parameters.
+
+Figure 7 depicts two Inception blocks. Figure 7(a) is the simplest design, whose output is a concatenation of features from three convolutional layers and one pooling layer. The disadvantage of this design is that the pooling layer does not change the number of channels, which leads to an increase in the number of outputs. After several such blocks, the number of outputs and parameters grows larger and larger, leading to higher computational complexity. To overcome this drawback, the Inception block in Figure 7(b) employs three 1x1 convolutional layers. These reduce the dimensions, i.e. the number of channels, while improving the non-linearity of the network.
+
+

+
+Figure 7. Inception block +
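+
+A back-of-the-envelope parameter count shows why the 1x1 layers in Figure 7(b) help. The channel sizes below are hypothetical, chosen only for illustration:
+
+```python
+# A 5x5 convolution from 192 input channels to 32 output channels:
+c_in, c_out, bottleneck = 192, 32, 16
+
+direct = 5 * 5 * c_in * c_out  # applied directly
+# with a 1x1 convolution down to 16 channels first, then the 5x5 convolution:
+reduced = 1 * 1 * c_in * bottleneck + 5 * 5 * bottleneck * c_out
+
+print(direct)   # 153600 weights
+print(reduced)  # 15872 weights, roughly a 10x reduction
+```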

+ +GoogleNet consists of multiple stacked Inception blocks followed by an avg-pooling layer as in NIN instead of traditional fully connected layers. The difference between GoogleNet and NIN is that GoogleNet adds a fully connected layer after avg-pooling layer to output a vector of category size. Besides these two characteristics, the features from middle layers of a GoogleNet are also very discriminative. Therefore, GoogeleNet inserts two auxiliary classifiers in the model for enhancing gradient and regularization when doing backpropagation. The loss function of the whole network is the weighted sum of these three classifiers. + +Figure 8 illustrates the neural architecture of a GoogleNet which consists of 22 layers: it starts with three regular convolutional layers followed by three groups of sub-networks -- the first group contains two Inception blocks, the second group has five, and the third group has two. It ends with an average pooling and a fully-connected layer. + +

+
+Figure 8. GoogleNet[12] +

+ +The above model is the first version of GoogleNet or GoogelNet-v1. GoogleNet-v2 [14] introduced BN layer; GoogleNet-v3 [16] further split some convolutional layers, which increases non-linearity and network depth; GoogelNet-v4 [17] leads to the design idea of ResNet which will be introduced in the next section. The evolution from v1 to v4 improved the accuracy rate consistently. We will not go into details of the neural architectures of v2 to v4. + +### ResNet + +Residual Network(ResNet)[15] won the 2015 championship on three ImageNet competitions -- image classification, object localization, and object detection. The main challenge in training deeper networks is that accuracy degrades with network depth. The authors of ResNet proposed a residual learning approach to ease the difficulty of training deeper networks. Based on the design ideas of BN, small convolutional kernels, full convolutional network, ResNets reformulate the layers as residual blocks, with each block containing two branches, one directly connecting input to the output, the other performing two to three convolutions and calculating the residual function with reference to the layer's inputs. The outputs of these two branches are then added up. + +Figure 9 illustrates the ResNet architecture. To the left is the basic building block, it consists of two 3x3 convolutional layers of the same channels. To the right is a Bottleneck block. The bottleneck is a 1x1 convolutional layer used to reduce dimension from 256 to 64. The other 1x1 convolutional layer is used to increase dimension from 64 to 256. Thus, the number of input and output channels of the middle 3x3 convolutional layer is 64, which is relatively small. + +

+
+Figure 9. Residual block +
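+
+The core idea of a residual block fits in a few lines. Here is a minimal NumPy sketch (illustrative only; the real blocks use convolutions, BN and ReLU as described above):
+
+```python
+import numpy as np
+
+def residual_block(x, layers):
+    # apply the residual branch, then add the input back via the shortcut
+    out = x
+    for f in layers:
+        out = f(out)
+    return out + x
+
+relu = lambda t: np.maximum(0.0, t)
+print(residual_block(np.array([1.0, -2.0, 3.0]), [relu]))  # [ 2. -2.  6.]
+```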

+ +Figure 10 illustrates ResNets with 50, 101, 152 layers, respectively. All three networks use bottleneck blocks of different numbers of repetitions. ResNet converges very fast and can be trained with hundreds or thousands of layers. + +

+
+Figure 10. ResNet model for ImageNet +

+ + +## Dataset + +Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc. Those used for fine-grained image classification are [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dog](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford-flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among these, the ImageNet dataset is the largest. Most research results are reported on ImageNet as mentioned in the Model Overview section. Since 2010, the ImageNet dataset has gone through some changes. The commonly used ImageNet-2012 dataset contains 1000 categories. There are 1,281,167 training images, ranging from 732 to 1200 images per category, and 50,000 validation images with 50 images per category in average. + +Since ImageNet is too large to be downloaded and trained efficiently, we use [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) in this tutorial. The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. Figure 11 shows all the classes in CIFAR-10 as well as 10 images randomly sampled from each category. + +

+
+Figure 11. CIFAR10 dataset[21] +
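+
+As a quick sanity check (assuming `paddle.v2` is installed; the first call downloads the dataset), a single record can be drawn from the CIFAR-10 reader used later in this chapter. Each record is a tuple of a flattened 3x32x32 feature vector and an integer label:
+
+```python
+import paddle.v2 as paddle
+
+samples = paddle.dataset.cifar.train10()()  # calling the reader returns a generator
+image, label = next(samples)
+print(len(image))  # 3072, i.e. 3 x 32 x 32
+print(label)       # an integer in [0, 9]
+```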

The `paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `movielens`, `wmt14` and others. There's no need to manually download and preprocess CIFAR-10.
+
+After running the command `python train.py`, training will start immediately. The following sections describe the details.
+
+## Model Structure
+
+### Initialize PaddlePaddle
+
+We must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc.).
+
+```python
+import sys
+import paddle.v2 as paddle
+from vgg import vgg_bn_drop
+from resnet import resnet_cifar10
+
+# PaddlePaddle init
+paddle.init(use_gpu=False, trainer_count=1)
+```
+Now we are going to walk you through the implementations of the VGG and ResNet.
+
+### VGG
+
+Let's start with the VGG model. Since the image size and amount of CIFAR10 are relatively small compared to ImageNet, we use a small version of the VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations.
+
+1. Define input data and its dimension
+
+    The input to the network is defined as `paddle.layer.data`, or image pixels in the context of image classification. The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10.
+
+    ```python
+    datadim = 3 * 32 * 32
+    classdim = 10
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(datadim))
+    ```
+
+2. Define VGG main module
+
+    ```python
+    net = vgg_bn_drop(image)
+    ```
+    The input to the VGG main module is from the data layer. `vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail:
+
+    ```python
+    def vgg_bn_drop(input):
+        def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
+            return paddle.networks.img_conv_group(
+                input=ipt,
+                num_channels=num_channels,
+                pool_size=2,
+                pool_stride=2,
+                conv_num_filter=[num_filter] * groups,
+                conv_filter_size=3,
+                conv_act=paddle.activation.Relu(),
+                conv_with_batchnorm=True,
+                conv_batchnorm_drop_rate=dropouts,
+                pool_type=paddle.pooling.Max())
+
+        conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
+        conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+        conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+        conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+        conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+        drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)
+        fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())
+        bn = paddle.layer.batch_norm(
+            input=fc1,
+            act=paddle.activation.Relu(),
+            layer_attr=paddle.attr.Extra(drop_rate=0.5))
+        fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
+        return fc2
+    ```
+
+    2.1. First, it defines a convolution block, `conv_block`. The convolution kernel size is 3x3, and the pooling window size is 2x2 with stride 2. `groups` determines the number of consecutive convolution operations in each VGG block, and `dropouts` specifies their dropout probabilities. The function `img_conv_group`, predefined in `paddle.networks`, consists of a series of `Conv->BN->ReLu->Dropout` operations and one `Pooling`.
+
+    2.2. Five groups of convolutions. The first two groups perform two convolutions each, while the last three groups perform three convolutions each. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for that layer.
+
+    2.3. The last two layers are fully-connected layers of dimension 512.
+
+3. 
Define Classifier
+
+    The above VGG network extracts high-level features and maps them to a vector of the same size as the number of categories. A softmax function, the classifier, is then used to calculate the probability of the image belonging to each category.
+
+    ```python
+    out = paddle.layer.fc(input=net,
+                          size=classdim,
+                          act=paddle.activation.Softmax())
+    ```
+
+4. Define Loss Function and Outputs
+
+    In the context of supervised learning, labels of the training images are defined in `paddle.layer.data` as well. During training, the cross-entropy loss function is used and the loss is the output of the network. During testing, the outputs are the probabilities calculated by the classifier.
+
+    ```python
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(classdim))
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+    ```
+
+### ResNet
+
+The first, third and fourth steps of a ResNet are the same as in a VGG. The second step is the main module of ResNet.
+
+```python
+net = resnet_cifar10(image, depth=56)
+```
+
+Here are some basic functions used in `resnet_cifar10`:
+
+  - `conv_bn_layer` : convolutional layer followed by BN.
+  - `shortcut` : the shortcut branch in a residual block. There are two kinds of shortcuts: a 1x1 convolution, used when the numbers of input and output channels differ, and a direct connection, used otherwise.
+  - `basicblock` : a basic residual module, as shown on the left of Figure 9; it consists of two sequential 3x3 convolutions and a "shortcut" branch.
+  - `bottleneck` : a bottleneck module, as shown on the right of Figure 9; it consists of a branch with two 1x1 convolutions and one 3x3 convolution in between, plus a "shortcut" branch.
+  - `layer_warp` : a group of residual modules consisting of several stacked blocks. In each group, the stride of the first residual block may differ from that of the remaining blocks, in order to reduce the size of the feature maps along the horizontal and vertical directions.
+
+```python
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  active_type=paddle.activation.Relu(),
+                  ch_in=None):
+    tmp = paddle.layer.img_conv(
+        input=input,
+        filter_size=filter_size,
+        num_channels=ch_in,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=tmp, act=active_type)
+
+def shortcut(ipt, n_in, n_out, stride):
+    if n_in != n_out:
+        return conv_bn_layer(ipt, n_out, 1, stride, 0,
+                             paddle.activation.Linear())
+    else:
+        return ipt
+
+def basicblock(ipt, ch_out, stride):
+    ch_in = ch_out * 2
+    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
+    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
+    short = shortcut(ipt, ch_in, ch_out, stride)
+    return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
+
+def layer_warp(block_func, ipt, features, count, stride):
+    tmp = block_func(ipt, features, stride)
+    for i in range(1, count):
+        tmp = block_func(tmp, features, 1)
+    return tmp
+```
+
+The following are the components of `resnet_cifar10`:
+
+1. The lowest level is `conv_bn_layer`, a convolutional layer with BN.
+2. The middle level consists of three `layer_warp` groups, each of which uses the left residual block in Figure 9.
+3. The last level is the average pooling layer.
+
+Note: besides the first convolutional layer and the last fully-connected layer, the total number of layers in the three `layer_warp` groups must be divisible by 6; that is, the depth of `resnet_cifar10` must satisfy $(depth - 2) \% 6 == 0$. 
+
+```python
+def resnet_cifar10(ipt, depth=32):
+    # depth should be one of 20, 32, 44, 56, 110, 1202
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    nStages = {16, 64, 128}
+    conv1 = conv_bn_layer(
+        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = paddle.layer.img_pool(
+        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
+    return pool
+```
+
+## Model Training
+
+### Define Parameters
+
+First, we create the model parameters according to the previous model configuration `cost`.
+
+```python
+# Create parameters
+parameters = paddle.parameters.create(cost)
+```
+
+### Create Trainer
+
+Before creating a training module, it is necessary to set the optimization algorithm.
+Here we specify the `Momentum` optimization algorithm via `paddle.optimizer`.
+
+```python
+# Create optimizer
+momentum_optimizer = paddle.optimizer.Momentum(
+    momentum=0.9,
+    regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
+    learning_rate=0.1 / 128.0,
+    learning_rate_decay_a=0.1,
+    learning_rate_decay_b=50000 * 100,
+    learning_rate_schedule='discexp')
+
+# Create trainer
+trainer = paddle.trainer.SGD(cost=cost,
+                             parameters=parameters,
+                             update_equation=momentum_optimizer)
+```
+
+The learning rate adjustment policy is defined by the variables `learning_rate_decay_a` ($a$), `learning_rate_decay_b` ($b$) and `learning_rate_schedule`. In this example, the discrete exponential method is used for adjusting the learning rate. The formula is as follows:
+$$lr = lr_{0} * a^{\lfloor \frac{n}{b} \rfloor}$$
+where $n$ is the number of processed samples and $lr_{0}$ is the initial learning rate. With the settings above, the learning rate is multiplied by $0.1$ after every $50000 \times 100 = 5000000$ processed samples.
+
+### Training
+
+`cifar.train10()` yields records during each pass; after shuffling, batches of input are generated for training.
+
+```python
+reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=50000),
+    batch_size=128)
+```
+
+`feeding` specifies the correspondence between each yielded record and `paddle.layer.data`. For instance, the first column of the data generated by `cifar.train10()` corresponds to the feature of the `image` layer.
+
+```python
+feeding={'image': 0,
+         'label': 1}
+```
+
+The callback function `event_handler` will be called during training whenever a pre-defined event happens.
+
+`event_handler_plot` is used to plot a figure like the one below:
+
+![png](./image/train_and_test.png)
+
+```python
+from paddle.v2.plot import Ploter
+
+train_title = "Train cost"
+test_title = "Test cost"
+cost_ploter = Ploter(train_title, test_title)
+
+step = 0
+def event_handler_plot(event):
+    global step
+    if isinstance(event, paddle.event.EndIteration):
+        if step % 1 == 0:
+            cost_ploter.append(train_title, step, event.cost)
+            cost_ploter.plot()
+        step += 1
+    if isinstance(event, paddle.event.EndPass):
+        result = trainer.test(
+            reader=paddle.batch(
+                paddle.dataset.cifar.test10(), batch_size=128),
+            feeding=feeding)
+        cost_ploter.append(test_title, step, result.cost)
+```
+
+`event_handler` prints textual logs during training:
+
+```python
+# event handler to track training and testing process
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 100 == 0:
+            print "\nPass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+        else:
+            sys.stdout.write('.')
+            sys.stdout.flush()
+    if isinstance(event, paddle.event.EndPass):
+        # save parameters
+        with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+
+        result = trainer.test(
+            reader=paddle.batch(
+                paddle.dataset.cifar.test10(), batch_size=128),
+            feeding=feeding)
+        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+```
+
+Finally, we can invoke `trainer.train` to start training:
+
+```python
+trainer.train(
+    reader=reader,
+    num_passes=200,
+    event_handler=event_handler_plot,
+    feeding=feeding)
+```
+
+Here is an example log after training for one pass. The classification error reaches 0.6875 on the training set by the end of the pass and 0.8852 on the validation set.
+
+```text
+Pass 0, Batch 0, Cost 2.473182, {'classification_error_evaluator': 0.9140625}
+...................................................................................................
+Pass 0, Batch 100, Cost 1.913076, {'classification_error_evaluator': 0.78125}
+...................................................................................................
+Pass 0, Batch 200, Cost 1.783041, {'classification_error_evaluator': 0.7421875}
+...................................................................................................
+Pass 0, Batch 300, Cost 1.668833, {'classification_error_evaluator': 0.6875}
+..........................................................................................
+Test with Pass 0, {'classification_error_evaluator': 0.885200023651123}
+```
+
+Figure 12 shows the curve of the training error rate, which indicates that the model converges at around Pass 200 with an error rate of 8.54%.
+

+
+Figure 12. The error rate of VGG model on CIFAR10 +

+
+
+
+## Application
+
+After training is completed, users can use the trained model to classify images. The following code shows how to run inference through the `paddle.infer` interface. You can uncomment the indicated lines below to load the parameters saved after a specific pass.
+
+```python
+from PIL import Image
+import numpy as np
+import os
+def load_image(file):
+    im = Image.open(file)
+    im = im.resize((32, 32), Image.ANTIALIAS)
+    im = np.array(im).astype(np.float32)
+    # The storage order of the loaded image is W(width),
+    # H(height), C(channel). PaddlePaddle requires
+    # the CHW order, so transpose them.
+    im = im.transpose((2, 0, 1))  # CHW
+    # In the training phase, the channel order of CIFAR
+    # images is B(Blue), G(green), R(Red). But PIL opens
+    # images in RGB mode, so the channel order must be swapped.
+    im = im[(2, 1, 0), :, :]  # BGR
+    im = im.flatten()
+    im = im / 255.0
+    return im
+test_data = []
+cur_dir = os.getcwd()
+test_data.append((load_image(cur_dir + '/image/dog.png'),))
+
+# users can remove the comments and change the model name
+# with open('params_pass_50.tar', 'r') as f:
+#    parameters = paddle.parameters.Parameters.from_tar(f)
+
+probs = paddle.infer(
+    output_layer=out, parameters=parameters, input=test_data)
+lab = np.argsort(-probs)  # probs and lab are the results of one batch data
+print "Label of image/dog.png is: %d" % lab[0][0]
+```
+
+
+## Conclusion
+
+Traditional image classification methods involve multiple stages of processing built on complex, hand-engineered pipelines. In contrast, CNN models can be trained end to end, with a significant increase in classification accuracy. In this chapter, we introduced three models -- VGG, GoogLeNet, and ResNet -- and provided PaddlePaddle config files for training VGG and ResNet on CIFAR10. We also explained how to perform prediction and feature extraction using the PaddlePaddle API. For other datasets such as ImageNet, the configuration and training procedure are the same, and you are welcome to give it a try.
+
+
+## References
+
+[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004.
+
+[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005.
+
+[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28.
+
+[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003.
+
+[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997.
+
+[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR.
+
+[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4).
+
+[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image classification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR.
+
+[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neural networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS.
+
+[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012.
+
+[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014.
+
+[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015)
+
+[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014.
+
+[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015.
+
+[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016.
+
+[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the inception architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016).
+
+[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016).
+
+[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective](http://link.springer.com/article/10.1007/s11263-014-0733-5). International Journal of Computer Vision, 111(1), 98-136, 2015.
+
+[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015.
+
+[20] http://deeplearning.net/tutorial/lenet.html
+
+[21] https://www.cs.toronto.edu/~kriz/cifar.html
+
+[22] http://cs231n.github.io/classification/
+
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/image_classification/image/cifar.png b/03.image_classification/image/cifar.png similarity index 100% rename from image_classification/image/cifar.png rename to 03.image_classification/image/cifar.png diff --git a/image_classification/image/dog.png b/03.image_classification/image/dog.png similarity index 100% rename from image_classification/image/dog.png rename to 03.image_classification/image/dog.png diff --git a/image_classification/image/dog_cat.png b/03.image_classification/image/dog_cat.png similarity index 100% rename from image_classification/image/dog_cat.png rename to 03.image_classification/image/dog_cat.png diff --git a/image_classification/image/fea_conv0.png b/03.image_classification/image/fea_conv0.png similarity index 100% rename from image_classification/image/fea_conv0.png rename to 03.image_classification/image/fea_conv0.png diff --git a/image_classification/image/flowers.png b/03.image_classification/image/flowers.png similarity index 100% rename from image_classification/image/flowers.png rename to 03.image_classification/image/flowers.png diff --git a/image_classification/image/googlenet.jpeg b/03.image_classification/image/googlenet.jpeg similarity index 100% rename from image_classification/image/googlenet.jpeg rename to 03.image_classification/image/googlenet.jpeg diff --git a/image_classification/image/ilsvrc.png b/03.image_classification/image/ilsvrc.png similarity index 100% rename from image_classification/image/ilsvrc.png rename to 03.image_classification/image/ilsvrc.png diff --git a/image_classification/image/inception.png b/03.image_classification/image/inception.png similarity index 100% rename from image_classification/image/inception.png rename to 03.image_classification/image/inception.png diff --git a/image_classification/image/inception_en.png b/03.image_classification/image/inception_en.png similarity index 100% rename from image_classification/image/inception_en.png rename to 03.image_classification/image/inception_en.png diff --git a/image_classification/image/lenet.png b/03.image_classification/image/lenet.png similarity index 100% rename from image_classification/image/lenet.png rename to 03.image_classification/image/lenet.png diff --git a/image_classification/image/lenet_en.png b/03.image_classification/image/lenet_en.png similarity index 100% rename from image_classification/image/lenet_en.png rename to 03.image_classification/image/lenet_en.png diff --git a/image_classification/image/plot.png b/03.image_classification/image/plot.png similarity index 100% rename from image_classification/image/plot.png rename to 03.image_classification/image/plot.png diff --git a/image_classification/image/plot_en.png b/03.image_classification/image/plot_en.png similarity index 100% rename from image_classification/image/plot_en.png rename to 03.image_classification/image/plot_en.png diff --git a/image_classification/image/resnet.png b/03.image_classification/image/resnet.png similarity index 100% rename from image_classification/image/resnet.png rename to 03.image_classification/image/resnet.png diff --git a/image_classification/image/resnet_block.jpg b/03.image_classification/image/resnet_block.jpg similarity index 100% rename from image_classification/image/resnet_block.jpg rename to 03.image_classification/image/resnet_block.jpg diff --git a/03.image_classification/image/train_and_test.png 
b/03.image_classification/image/train_and_test.png new file mode 100644 index 0000000000000000000000000000000000000000..c6336a9a69b95dc978719ce68896e3e752e67fed Binary files /dev/null and b/03.image_classification/image/train_and_test.png differ diff --git a/image_classification/image/variations.png b/03.image_classification/image/variations.png similarity index 100% rename from image_classification/image/variations.png rename to 03.image_classification/image/variations.png diff --git a/image_classification/image/variations_en.png b/03.image_classification/image/variations_en.png similarity index 100% rename from image_classification/image/variations_en.png rename to 03.image_classification/image/variations_en.png diff --git a/image_classification/image/vgg16.png b/03.image_classification/image/vgg16.png similarity index 100% rename from image_classification/image/vgg16.png rename to 03.image_classification/image/vgg16.png diff --git a/image_classification/index.html b/03.image_classification/index.cn.html similarity index 67% rename from image_classification/index.html rename to 03.image_classification/index.cn.html index 0e48c728c61a2b12aa400f8840c1bc0478bf21df..f2e5155563b241011ed47440663366435a50ee0c 100644 --- a/image_classification/index.html +++ b/03.image_classification/index.cn.html @@ -1,3 +1,4 @@ + - - + + + + + +
+
+ + + + + + + diff --git a/image_classification/resnet.py b/03.image_classification/resnet.py similarity index 80% rename from image_classification/resnet.py rename to 03.image_classification/resnet.py index 19d20540780becf504973a23b50445d4b65dc2ef..c60d19fc59dfea31d8a9b22d974047f60475b092 100644 --- a/image_classification/resnet.py +++ b/03.image_classification/resnet.py @@ -36,26 +36,25 @@ def conv_bn_layer(input, return paddle.layer.batch_norm(input=tmp, act=active_type) -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, +def shortcut(ipt, ch_in, ch_out, stride): + if ch_in != ch_out: + return conv_bn_layer(ipt, ch_out, 1, stride, 0, paddle.activation.Linear()) else: return ipt -def basicblock(ipt, ch_out, stride): - ch_in = ch_out * 2 +def basicblock(ipt, ch_in, ch_out, stride): tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) short = shortcut(ipt, ch_in, ch_out, stride) return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) -def layer_warp(block_func, ipt, features, count, stride): - tmp = block_func(ipt, features, stride) +def layer_warp(block_func, ipt, ch_in, ch_out, count, stride): + tmp = block_func(ipt, ch_in, ch_out, stride) for i in range(1, count): - tmp = block_func(tmp, features, 1) + tmp = block_func(tmp, ch_out, ch_out, 1) return tmp @@ -66,9 +65,9 @@ def resnet_cifar10(ipt, depth=32): nStages = {16, 64, 128} conv1 = conv_bn_layer( ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) + res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) + res2 = layer_warp(basicblock, res1, 16, 32, n, 2) + res3 = layer_warp(basicblock, res2, 32, 64, n, 2) pool = paddle.layer.img_pool( input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg()) return pool diff --git a/image_classification/train.py b/03.image_classification/train.py similarity index 57% rename from image_classification/train.py rename to 03.image_classification/train.py index 743b10a50a389b9024a1236d97e1b31157e24896..faafc7ff5038cd8b40944d7742a4d1612468f80b 100644 --- a/image_classification/train.py +++ b/03.image_classification/train.py @@ -12,20 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License -import sys +import sys, os import paddle.v2 as paddle from vgg import vgg_bn_drop from resnet import resnet_cifar10 +with_gpu = os.getenv('WITH_GPU', '0') != '0' + def main(): datadim = 3 * 32 * 32 classdim = 10 # PaddlePaddle init - paddle.init(use_gpu=False, trainer_count=1) + paddle.init(use_gpu=with_gpu, trainer_count=1) image = paddle.layer.data( name="image", type=paddle.data_type.dense_vector(datadim)) @@ -36,9 +38,8 @@ def main(): # option 2. 
vgg net = vgg_bn_drop(image) - out = paddle.layer.fc(input=net, - size=classdim, - act=paddle.activation.Softmax()) + out = paddle.layer.fc( + input=net, size=classdim, act=paddle.activation.Softmax()) lbl = paddle.layer.data( name="label", type=paddle.data_type.integer_value(classdim)) @@ -54,8 +55,11 @@ def main(): learning_rate=0.1 / 128.0, learning_rate_decay_a=0.1, learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - batch_size=128) + learning_rate_schedule='discexp') + + # Create trainer + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=momentum_optimizer) # End batch and end pass event handler def event_handler(event): @@ -67,6 +71,10 @@ def main(): sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + result = trainer.test( reader=paddle.batch( paddle.dataset.cifar.test10(), batch_size=128), @@ -74,10 +82,11 @@ def main(): 'label': 1}) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - # Create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=momentum_optimizer) + # Save the inference topology to protobuf. + inference_topology = paddle.topology.Topology(layers=out) + with open("inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) + trainer.train( reader=paddle.batch( paddle.reader.shuffle( @@ -88,6 +97,40 @@ def main(): feeding={'image': 0, 'label': 1}) + # inference + from PIL import Image + import numpy as np + import os + + def load_image(file): + im = Image.open(file) + im = im.resize((32, 32), Image.ANTIALIAS) + im = np.array(im).astype(np.float32) + # The storage order of the loaded image is W(widht), + # H(height), C(channel). PaddlePaddle requires + # the CHW order, so transpose them. + im = im.transpose((2, 0, 1)) # CHW + # In the training phase, the channel order of CIFAR + # image is B(Blue), G(green), R(Red). But PIL open + # image in RGB mode. It must swap the channel order. 
+ im = im[(2, 1, 0), :, :] # BGR + im = im.flatten() + im = im / 255.0 + return im + + test_data = [] + cur_dir = os.path.dirname(os.path.realpath(__file__)) + test_data.append((load_image(cur_dir + '/image/dog.png'), )) + + # users can remove the comments and change the model name + # with open('params_pass_50.tar', 'r') as f: + # parameters = paddle.parameters.Parameters.from_tar(f) + + probs = paddle.infer( + output_layer=out, parameters=parameters, input=test_data) + lab = np.argsort(-probs) # probs and lab are the results of one batch data + print "Label of image/dog.png is: %d" % lab[0][0] + if __name__ == '__main__': main() diff --git a/image_classification/vgg.py b/03.image_classification/vgg.py similarity index 100% rename from image_classification/vgg.py rename to 03.image_classification/vgg.py diff --git a/word2vec/.gitignore b/04.word2vec/.gitignore similarity index 100% rename from word2vec/.gitignore rename to 04.word2vec/.gitignore diff --git a/word2vec/README.md b/04.word2vec/README.cn.md similarity index 66% rename from word2vec/README.md rename to 04.word2vec/README.cn.md index 1a942f4ec26cf2763977a57b2b8e9232c95f521e..47422bcb0c61f840b3010e0eb4e04d67e783e0ae 100644 --- a/word2vec/README.md +++ b/04.word2vec/README.cn.md @@ -1,12 +1,13 @@ + # 词向量 -本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/word2vec), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +本教程源代码目录在[book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/175.html)。 ## 背景介绍 本章我们介绍词的向量表征,也称为word embedding。词向量是自然语言处理中常见的一个操作,是搜索引擎、广告系统、推荐系统等互联网服务背后常见的基础技术。 -在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 +在这些互联网服务里,我们经常要比较两个词或者两段文本之间的相关性。为了做这样的比较,我们往往先要把词表示成计算机适合处理的方式。最自然的方式恐怕莫过于向量空间模型(vector space model)。 在这种方式里,每个词被表示成一个实数向量(one-hot vector),其长度为字典大小,每个维度对应一个字典里的每个词,除了这个词对应维度上的值是1,其他元素都是0。 One-hot vector虽然自然,但是用处有限。比如,在互联网广告系统里,如果用户输入的query是“母亲节”,而有一个广告的关键词是“康乃馨”。虽然按照常理,我们知道这两个词之间是有联系的——母亲节通常应该送给母亲一束康乃馨;但是这两个词对应的one-hot vectors之间的距离度量,无论是欧氏距离还是余弦相似度(cosine similarity),由于其向量正交,都认为这两个词毫无相关性。 得出这种与我们相悖的结论的根本原因是:每个词本身的信息量都太小。所以,仅仅给定两个词,不足以让我们准确判别它们是否相关。要想精确计算相关性,我们还需要更多的信息——从大量数据里通过机器学习方法归纳出来的知识。 @@ -31,8 +32,8 @@ $$X = USV^T$$ 本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。

-
- 图1. 词向量的二维投影 +
+ 图1. 词向量的二维投影

另一方面,我们知道两个向量的余弦值在$[-1,1]$的区间内:两个完全相同的向量余弦值为1, 两个相互垂直的向量之间余弦值为0,两个方向完全相反的向量余弦值为-1,即相关性和余弦值大小成正比。因此我们还可以计算两个词向量的余弦相似度: @@ -68,7 +69,7 @@ $$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$ -### N-gram neural model +### N-gram neural model 在计算语言学中,n-gram是一种重要的文本表示方法,表示一个文本中连续的n个项。基于具体的应用场景,每一项可以是一个字母、单词或者音节。 n-gram模型也是统计语言模型中的一种重要方法,用n-gram训练语言模型时,一般用每个n-gram的历史n-1个词语组成的内容来预测第n个词。 @@ -84,41 +85,41 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$ 其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。 -

-
- 图2. N-gram神经网络模型 +

+
+ 图2. N-gram神经网络模型

图2展示了N-gram神经网络模型,从下往上看,该模型分为以下几个部分: - 对于每个样本,模型输入$w_{t-n+1},...w_{t-1}$, 输出句子第t个词为字典中`|V|`个词的概率。 - + 每个输入词$w_{t-n+1},...w_{t-1}$首先通过映射矩阵映射到词向量$C(w_{t-n+1}),...C(w_{t-1})$。 - + - 然后所有词语的词向量连接成一个大向量,并经过一个非线性映射得到历史词语的隐层表示: - - $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ - + + $$g=Utanh(\theta^Tx + b_1) + Wx + b_2$$ + 其中,$x$为所有词语的词向量连接成的大向量,表示文本历史特征;$\theta$、$U$、$b_1$、$b_2$和$W$分别为词向量层到隐层连接的参数。$g$表示未经归一化的所有输出单词概率,$g_i$表示未经归一化的字典中第$i$个单词的输出概率。 - 根据softmax的定义,通过归一化$g_i$, 生成目标词$w_t$的概率为: - + $$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$ - 整个网络的损失值(cost)为多类分类交叉熵,用公式表示为 - $$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ + $$J(\theta) = -\sum_{i=1}^N\sum_{c=1}^{|V|}y_k^{i}log(softmax(g_k^i))$$ 其中$y_k^i$表示第$i$个样本第$k$类的真实标签(0或1),$softmax(g_k^i)$表示第i个样本第k类softmax输出的概率。 - -### Continuous Bag-of-Words model(CBOW) + +### Continuous Bag-of-Words model(CBOW) CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示: -

-
- 图3. CBOW模型 +

+
+ 图3. CBOW模型

具体来说,不考虑上下文的词语输入顺序,CBOW是用上下文词语的词向量的均值来预测当前词。即: @@ -127,13 +128,13 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$ 其中$x_t$为第$t$个词的词向量,分类分数(score)向量 $z=U*context$,最终的分类$y$采用softmax,损失函数采用多类分类交叉熵。 -### Skip-gram model +### Skip-gram model CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。 -

-
- 图4. Skip-gram模型 +

+
+ 图4. Skip-gram模型

如上图所示,Skip-gram模型的具体做法是,将一个词的词向量映射到$2n$个词的词向量($2n$表示当前输入词的前后各$n$个词),然后分别通过softmax得到这$2n$个词的分类损失值之和。 @@ -143,29 +144,29 @@ CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去 ### 数据介绍 -本教程使用Penn Tree Bank (PTB)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下: +本教程使用Penn Treebank (PTB)(经Tomas Mikolov预处理过的版本)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下:

- - - - - - - - - - - - - - - + + + + + + + + + + + + + + +
训练数据验证数据测试数据
ptb.train.txtptb.valid.txtptb.test.txt
42068句3370句3761句
训练数据验证数据测试数据
ptb.train.txtptb.valid.txtptb.test.txt
42068句3370句3761句

- + ### 数据预处理 本章训练的是5-gram模型,表示在PaddlePaddle训练时,每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`,自动做数据的下载与预处理,方便大家使用。 @@ -182,13 +183,14 @@ a dream that one day dream that one day ``` +最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。 ## 编程实现 本配置的模型结构如下图所示: -

-
- 图5. 模型配置中的N-gram神经网络模型 +

+
+ 图5. 模型配置中的N-gram神经网络模型

首先,加载所需要的包: @@ -205,11 +207,33 @@ hiddensize = 256 # 隐层维度 N = 5 # 训练5-Gram ``` +用于保存和加载word_dict和embedding table的函数 +```python +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings +``` + 接着,定义网络结构: - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$,通过$|V|\times D$的矩阵映射到D维词向量(本例中取D=32)。 - -```python + +```python def wordemb(inlayer): wordemb = paddle.layer.table_projection( input=inlayer, @@ -218,61 +242,61 @@ def wordemb(inlayer): name="_proj", initial_std=0.001, learning_rate=1, - l2_rate=0, )) + l2_rate=0, + sparse_update=True)) return wordemb ``` - 定义输入层接受的数据类型以及名字。 ```python -def main(): - paddle.init(use_gpu=False, trainer_count=1) # 初始化PaddlePaddle - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - # 每个输入层都接受整形数据,这些数据的范围是[0, dict_size) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) +paddle.init(use_gpu=False, trainer_count=3) # 初始化PaddlePaddle +word_dict = paddle.dataset.imikolov.build_dict() +dict_size = len(word_dict) +# 每个输入层都接受整形数据,这些数据的范围是[0, dict_size) +firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) +secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) +thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) +fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) +nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + +Efirst = wordemb(firstword) +Esecond = wordemb(secondword) +Ethird = wordemb(thirdword) +Efourth = wordemb(fourthword) ``` - 将这n-1个词向量经过concat_layer连接成一个大向量作为历史文本特征。 ```python - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) +contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) ``` - 将历史文本特征经过一个全连接得到文本隐层特征。 ```python - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. / math.sqrt(embsize * 8), - learning_rate=1)) +hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. 
/ math.sqrt(embsize * 8), + learning_rate=1)) ``` - + - 将文本隐层特征,再经过一个全连接,映射成一个$|V|$维向量,同时通过softmax归一化得到这`|V|`个词的生成概率。 ```python - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) +predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) ``` - 网络的损失函数为多分类交叉熵,可直接调用`classification_cost`函数。 @@ -288,11 +312,11 @@ cost = paddle.layer.classification_cost(input=predictword, label=nextword) - 正则化(regularization): 是防止网络过拟合的一种手段,此处采用L2正则化。 ```python - parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer) +parameters = paddle.parameters.create(cost) +adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) +trainer = paddle.trainer.SGD(cost, parameters, adagrad) ``` 下一步,我们开始训练过程。`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python generator。 @@ -300,113 +324,106 @@ cost = paddle.layer.classification_cost(input=predictword, label=nextword) `paddle.batch`的输入是一个reader,输出是一个batched reader —— 在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minbatch。 ```python - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - result = trainer.test( +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( paddle.batch( paddle.dataset.imikolov.test(word_dict, N), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - trainer.train( - paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), - num_passes=30, - event_handler=event_handler) + print "Pass %d, Testing metrics %s" % (event.pass_id, result.metrics) + with open("model_%d.tar"%event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + +trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=100, + event_handler=event_handler) ``` -训练过程是完全自动的,event_handler里打印的日志类似如下所示: - ```text -............................. -I1222 09:27:16.477841 12590 TrainerInternal.cpp:162] Batch=3000 samples=300000 AvgCost=5.36135 CurrentCost=5.36135 Eval: classification_error_evaluator=0.818653 CurrentEval: class -ification_error_evaluator=0.818653 -............................. -I1222 09:27:22.416700 12590 TrainerInternal.cpp:162] Batch=6000 samples=600000 AvgCost=5.29301 CurrentCost=5.22467 Eval: classification_error_evaluator=0.814542 CurrentEval: class -ification_error_evaluator=0.81043 -............................. 
-I1222 09:27:28.343756 12590 TrainerInternal.cpp:162] Batch=9000 samples=900000 AvgCost=5.22494 CurrentCost=5.08876 Eval: classification_error_evaluator=0.810088 CurrentEval: class -ification_error_evaluator=0.80118 -..I1222 09:27:29.128582 12590 TrainerInternal.cpp:179] Pass=0 Batch=9296 samples=929600 AvgCost=5.21786 Eval: classification_error_evaluator=0.809647 -I1222 09:27:29.627616 12590 Tester.cpp:111] Test samples=73760 cost=4.9594 Eval: classification_error_evaluator=0.79676 -I1222 09:27:29.627713 12590 GradientMachine.cpp:112] Saving parameters to model/pass-00000 +Pass 0, Batch 0, Cost 7.870579, {'classification_error_evaluator': 1.0}, Testing metrics {'classification_error_evaluator': 0.999591588973999} +Pass 0, Batch 100, Cost 6.136420, {'classification_error_evaluator': 0.84375}, Testing metrics {'classification_error_evaluator': 0.8328699469566345} +Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Testing metrics {'classification_error_evaluator': 0.8328542709350586} +... ``` + +训练过程是完全自动的,event_handler里打印的日志类似如上所示: + 经过30个pass,我们将得到平均错误率为classification_error_evaluator=0.735611。 +## 保存词典和embedding + +训练完成之后,我们可以把词典和embedding table单独保存下来,后面可以直接使用 + +```python +# save word dict and embedding table +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) +save_dict_and_embedding(word_dict, embeddings) +``` + ## 应用模型 -训练模型后,我们可以加载模型参数,用训练出来的词向量初始化其他模型,也可以将模型参数从二进制格式转换成文本格式进行后续应用。 +训练模型后,我们可以加载模型参数,用训练出来的词向量初始化其他模型,也可以将模型查看参数用来做后续应用。 -### 初始化其他模型 -训练好的模型参数可以用来初始化其他模型。具体方法如下: -在PaddlePaddle 训练命令行中,用`--init_model_path` 来定义初始化模型的位置,用`--load_missing_parameter_strategy`指定除了词向量以外的新模型其他参数的初始化策略。注意,新模型需要和原模型共享被初始化参数的参数名。 - ### 查看词向量 -PaddlePaddle训练出来的参数为二进制格式,存储在对应训练pass的文件夹下。这里我们提供了文件`format_convert.py`用来互转PaddlePaddle训练结果的二进制文件和文本格式特征文件。 -```bash -python format_convert.py --b2t -i INPUT -o OUTPUT -d DIM -``` -其中,INPUT是输入的(二进制)词向量模型名称,OUTPUT是输出的文本模型名称,DIM是词向量参数维度。 +PaddlePaddle训练出来的参数可以直接使用`parameters.get()`获取出来。例如查看单词`apple`的词向量,即为 + -用法如: +```python +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) -```bash -python format_convert.py --b2t -i model/pass-00029/_proj -o model/pass-00029/_proj.txt -d 32 +print embeddings[word_dict['apple']] ``` -转换后得到的文本文件如下: ```text -0,4,62496 --0.7444070,-0.1846171,-1.5771370,0.7070392,2.1963732,-0.0091410, ...... --0.0721337,-0.2429973,-0.0606297,0.1882059,-0.2072131,-0.7661019, ...... -...... +[-0.38961065 -0.02392169 -0.00093231 0.36301503 0.13538605 0.16076435 +-0.0678709 0.1090285 0.42014077 -0.24119169 -0.31847557 0.20410083 +0.04910378 0.19021918 -0.0122014 -0.04099389 -0.16924137 0.1911236 +-0.10917275 0.13068172 -0.23079982 0.42699069 -0.27679482 -0.01472992 +0.2069038 0.09005053 -0.3282454 0.12717034 -0.24218646 0.25304323 +0.19072419 -0.24286366] ``` -其中,第一行是PaddlePaddle 输出文件的格式说明,包含3个属性:
-1) PaddlePaddle的版本号,本例中为0;
-2) 浮点数占用的字节数,本例中为4;
-3) 总计的参数个数, 本例中为62496(即1953*32);
-第二行及之后的每一行都按顺序表示字典里一个词的特征,用逗号分隔。 - + ### 修改词向量 -我们可以对词向量进行修改,并转换成PaddlePaddle参数二进制格式,方法: +获得到的embedding为一个标准的numpy矩阵。我们可以对这个numpy矩阵进行修改,然后赋值回去。 -```bash -python format_convert.py --t2b -i INPUT -o OUTPUT -``` -其中,INPUT是输入的输入的文本词向量模型名称,OUTPUT是输出的二进制词向量模型名称 - -输入的文本格式如下(注意,不包含上面二进制转文本后第一行的格式说明): +```python +def modify_embedding(emb): + # Add your modification here. + pass -```text --0.7444070,-0.1846171,-1.5771370,0.7070392,2.1963732,-0.0091410, ...... --0.0721337,-0.2429973,-0.0606297,0.1882059,-0.2072131,-0.7661019, ...... -...... +modify_embedding(embeddings) +parameters.set("_proj", embeddings) ``` - - ### 计算词语之间的余弦距离 两个向量之间的距离可以用余弦值来表示,余弦值在$[-1,1]$的区间内,向量间余弦值越大,其距离越近。这里我们在`calculate_dis.py`中实现不同词语的距离度量。 用法如下: -```bash -python calculate_dis.py VOCABULARY EMBEDDINGLAYER` -``` -其中,`VOCABULARY`是字典,`EMBEDDINGLAYER`是词向量模型,示例如下: +```python +from scipy import spatial + +emb_1 = embeddings[word_dict['world']] +emb_2 = embeddings[word_dict['would']] -```bash -python calculate_dis.py data/vocabulary.txt model/pass-00029/_proj.txt +print spatial.distance.cosine(emb_1, emb_2) ``` - - + +```text +0.99375076448 +``` + ## 总结 本章中,我们介绍了词向量、语言模型和词向量的关系、以及如何通过训练神经网络模型获得词向量。在信息检索中,我们可以根据向量间的余弦夹角,来判断query和文档关键词这二者间的相关性。在句法分析和语义分析中,训练好的词向量可以用来初始化模型,以得到更好的效果。在文档分类中,有了词向量之后,可以用聚类的方法将文档中同义词进行分组。希望大家在本章后能够自行运用词向量进行相关领域的研究。 @@ -419,4 +436,4 @@ python calculate_dis.py data/vocabulary.txt model/pass-00029/_proj.txt 5. https://en.wikipedia.org/wiki/Singular_value_decomposition
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/04.word2vec/README.md b/04.word2vec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9b405587f4dc05af49d3a1c0c7bbecd16ab6f8b6 --- /dev/null +++ b/04.word2vec/README.md @@ -0,0 +1,450 @@ +# Word2Vec + +This is intended as a reference tutorial. The source code of this tutorial is located at [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec). + +For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book). + +## Background Introduction + +This section introduces the concept of **word embeddings**, which are vector representations of words. Word embeddings is a popular technique used in natural language processing to support applications such as search engines, advertising systems, and recommendation systems. + +### One-Hot Vectors + +Building these applications requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words to make them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as an **one-hot vector**. + +For each word, its vector representation has the corresponding entry in the vector as 1, and all other entries as 0. The lengths of one-hot vectors match the size of the dictionary. Each entry of a vector corresponds to the presence (or absence) of a word in the dictionary. + +One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: Suppose a customer enters the query "Mother's Day", while an ad bids for the keyword "carnations". Because the one-hot vectors of these two words are perpendicular, the metric distance (either Euclidean or cosine similarity) between them would indicate little relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity in each vector. That is, comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which could be learned from large amounts of data through machine learning methods. + +Like many machine learning models, word embeddings can represent knowledge in various ways. Another model may project an one-hot vector to an embedding vector of lower dimension e.g. $embedding(mother's day) = [0.3, 4.2, -1.5, ...], embedding(carnations) = [0.2, 5.6, -2.3, ...]$. Mapping one-hot vectors onto an embedded vector space has the potential to bring the embedding vectors of similar words (either semantically or usage-wise) closer to each other, so that the cosine similarity between the corresponding vectors for words like "Mother's Day" and "carnations" are no longer zero. + +A word embedding model could be a probabilistic model, a co-occurrence matrix model, or a neural network. Before people started using neural networks to generate word embedding, the traditional method was to calculate a co-occurrence matrix $X$ of words. Here, $X$ is a $|V| \times |V|$ matrix, where $X_{ij}$ represents the co-occurrence times of the $i$th and $j$th words in the vocabulary `V` within all corpus, and $|V|$ is the size of the vocabulary. By performing matrix decomposition on $X$ e.g. 
Singular Value Decomposition \[[5](#references)\]
+
+$$X = USV^T$$
+
+the resulting $U$ can be seen as the word embeddings of all the words.
+
+However, this method suffers from several drawbacks:
+1) Since many pairs of words do not co-occur, the co-occurrence matrix is sparse. To achieve good performance in matrix factorization, further treatment of word frequencies is needed;
+2) The matrix is large, frequently on the order of $10^6 \times 10^6$;
+3) We need to manually filter out stop words (like "although", "a", ...), otherwise these frequent words will affect the performance of matrix factorization.
+
+The neural-network-based model does not require storing huge hash tables of statistics over the whole corpus. It obtains word embeddings by learning from semantic information, and hence avoids the aforementioned problems of the traditional method. In this chapter, we will introduce the details of neural network word embedding models and how to train such models in PaddlePaddle.
+
+## Results Demonstration
+
+In this section, we use the $t$-SNE\[[4](#references)\] data visualization algorithm to draw the word embedding vectors after projecting them onto a two-dimensional space (see figure below). From the figure we can see that semantically relevant words -- *a*, *the*, and *these* or *big* and *huge* -- are close to each other in the projected space, while irrelevant words -- *say* and *business* or *decision* and *japan* -- are far from each other.
+

+
+ Figure 1. Two dimension projection of word embeddings +

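+
+A projection like the one in Figure 1 can be reproduced along the following lines. This is only a sketch: it assumes scikit-learn and matplotlib are installed (neither is used elsewhere in this book), and that `embeddings` and `word_dict` have been obtained as described in the Model Application section below.
+
+```python
+import numpy
+from sklearn.manifold import TSNE      # assumed dependency
+import matplotlib.pyplot as plt        # assumed dependency
+
+words = ['a', 'the', 'these', 'big', 'huge', 'say', 'business', 'decision', 'japan']
+vectors = numpy.array([embeddings[word_dict[w]] for w in words])
+# Project the 32-dimensional embeddings onto two dimensions with t-SNE.
+xy = TSNE(n_components=2, perplexity=5).fit_transform(vectors)
+for (x, y), w in zip(xy, words):
+    plt.scatter(x, y)
+    plt.annotate(w, xy=(x, y))
+plt.show()
+```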
+
+### Cosine Similarity
+
+On the other hand, we know that the cosine similarity between two vectors falls in $[-1,1]$. Specifically, the cosine similarity is 1 when the vectors are identical, 0 when the vectors are perpendicular, and -1 when they are of opposite directions. That is, the cosine similarity between two vectors scales with their relevance. So we can calculate the cosine similarity of two word embedding vectors to represent their relevance:
+
+```
+please input two words: big huge
+similarity: 0.899180685161
+
+please input two words: from company
+similarity: -0.0997506977351
+```
+
+The above results could be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instructions, see section [Model Application](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#model-application).
+
+
+## Model Overview
+
+In this section, we will introduce three word embedding models: the N-gram model, CBOW, and Skip-gram, which all output the probability distribution of a word given its immediate context.
+
+For the N-gram model, we will first introduce the concept of a language model, and implement it using PaddlePaddle in section [Training](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#model-application).
+
+The latter two models, which became popular recently, are neural word embedding models developed by Tomas Mikolov at Google \[[3](#references)\]. Despite their apparent simplicity, these models train very well.
+
+### Language Model
+
+Before diving into word embedding models, we will first introduce the concept of a **language model**. Language models build the joint probability function $P(w_1, ..., w_T)$ of a sentence, where $w_i$ is the i-th word in the sentence. The goal is to give higher probabilities to meaningful sentences, and lower probabilities to meaningless constructions.
+
+In general, models that generate the probability of a sequence can be applied to many fields, like machine translation, speech recognition, information retrieval, part-of-speech tagging, and handwriting recognition. Take information retrieval, for example. If you were to search for "how long is a football bame" (where "bame" is a misspelling of "game"), the search engine would ask if you had meant "how long is a football game" instead. This is because the probability of "how long is a football bame" is very low according to the language model; in addition, among all of the words easily confused with "bame", "game" would build the most probable sentence.
+
+#### Target Probability
+For a language model's target probability $P(w_1, ..., w_T)$, if the words in the sentence were independent, the joint probability of the whole sentence would be the product of each word's probability:
+
+$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$
+
+However, the probability of each word in a sentence typically depends on the words before it, so canonical language models are constructed using conditional probabilities in their target probability:
+
+$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
+
+
+### N-gram neural model
+
+In computational linguistics, the n-gram is an important method to represent text. An n-gram is a contiguous sequence of n consecutive items in a text. Based on the desired application scenario, each item could be a letter, a syllable, or a word. The N-gram model is also an important method in statistical language modeling. When training language models with n-grams, the first n-1 words of an n-gram are used to predict the *n*th word.
+
+Yoshua Bengio and other scientists describe how to train a word embedding model using a neural network in the famous paper *A Neural Probabilistic Language Model* \[[1](#references)\] published in 2003. The Neural Network Language Model (NNLM) described in the paper learns the language model and the word embeddings simultaneously through a linear transformation and a non-linear hidden connection. That is, after training on a large corpus, the model learns the word embeddings; then, it computes the probability of the whole sentence using the embeddings. This type of language model can overcome the **curse of dimensionality**, i.e. the sparsity problem that arises because most word sequences encountered at test time never occur verbatim in the training data. Note that the term *neural network language model* is ill-defined, so we will not use the name NNLM but only refer to it as the *N-gram neural model* in this section.
+
+We have previously described the language model using conditional probability, where the probability of the *t*-th word in a sentence depends on all $t-1$ words before it. Furthermore, since more distant words have less influence on a word, and every word within an n-gram is only affected by its previous n-1 words, we have:
+
+$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$
+
+Given some real corpus in which all sentences are meaningful, the n-gram model should maximize the following objective function:
+
+$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
+
+where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional log-probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents the parameter regularization term.
+
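+
+As a minimal sketch in plain Python (no PaddlePaddle involved, and the helper name is ours), the (history, target) pairs that a 5-gram model is trained on can be enumerated as follows:
+
+```python
+def ngram_instances(words, n=5):
+    # Slide an n-word window over the sentence: the first n-1 words
+    # form the history and the last word is the prediction target.
+    for t in range(n - 1, len(words)):
+        yield words[t - n + 1:t], words[t]
+
+for history, target in ngram_instances("i have a dream that one day".split()):
+    print history, '->', target
+```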

+
+ Figure 2. N-gram neural network model +

+
+
+Figure 2 shows the N-gram neural network model. From the bottom up, the model has the following components:
+
+ - For each sample, the model takes $w_{t-n+1},...w_{t-1}$ as input, and outputs the probability of each of the `|V|` words in the dictionary being the t-th word.
+
+   Every input word $w_{t-n+1},...w_{t-1}$ first gets transformed into a word embedding $C(w_{t-n+1}),...C(w_{t-1})$ through a transformation matrix.
+
+ - All the word embeddings are concatenated into a single vector, which is mapped (nonlinearly) into a hidden representation of the history:
+
+    $$g=U \tanh(\theta^Tx + b_1) + Wx + b_2$$
+
+   where $x$ is the large vector concatenated from all the word embeddings representing the context; $\theta$, $U$, $b_1$, $b_2$ and $W$ are parameters connecting the word embedding layer to the hidden layer. $g$ represents the unnormalized probabilities of the output words; $g_i$ is the unnormalized probability of the output word being the i-th word in the dictionary.
+
+ - Based on the definition of softmax, by normalizing $g_i$, the probability that the output word is $w_t$ is:
+
+    $$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_i^{|V|} e^{g_i}}$$
+
+ - The cost of the entire network is the multi-class cross-entropy, described by the following loss function:
+
+    $$J(\theta) = -\sum_{i=1}^N\sum_{k=1}^{|V|}y_k^{i}\log\big(softmax(g_k^i)\big)$$
+
+   where $y_k^i$ is the true label ($0$ or $1$) of the $k$-th class for the $i$-th sample, and $softmax(g_k^i)$ is the softmax probability of the $k$-th class for the $i$-th sample.
+
+### Continuous Bag-of-Words model (CBOW)
+
+The CBOW model predicts the current word based on the N words both before and after it. When $N=2$, the model is as shown in the figure below:
+

+
+ Figure 3. CBOW model +

+
+Specifically, ignoring the order of the words in the sequence, CBOW uses the average of the word embeddings of the context to predict the current word:
+
+$$\text{context} = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
+
+where $x_t$ is the word embedding of the t-th word. The classification score vector is $z=U*\text{context}$, the final classification $y$ uses softmax, and the loss function is the multi-class cross-entropy.
+
+### Skip-gram model
+
+The advantage of CBOW is that it smooths over the word embeddings of the context and reduces noise, so it is very effective on small datasets. Skip-gram uses a word to predict its context, obtaining multiple training contexts for each given word, so it can be used on larger datasets.
+
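+
+As a minimal plain-Python sketch (the helper name is ours, not part of the book's code), the (input, target) pairs a skip-gram model with window size $n=2$ is trained on can be enumerated as follows:
+
+```python
+def skipgram_pairs(words, n=2):
+    # Each word predicts every word within n positions of itself.
+    for t, w in enumerate(words):
+        for c in range(max(0, t - n), min(len(words), t + n + 1)):
+            if c != t:
+                yield w, words[c]
+
+for inp, target in skipgram_pairs("i have a dream".split()):
+    print inp, '->', target
+```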

+
+ Figure 4. Skip-gram model +

+
+As illustrated in the figure above, the skip-gram model maps the word embedding of the given word onto $2n$ word embeddings (those of the $n$ words before and the $n$ words after the given word), and then combines the classification losses of all those $2n$ words by softmax.
+
+## Dataset
+
+We will use the Penn Treebank (PTB) dataset (Tomas Mikolov's pre-processed version). PTB is a small dataset, used in the Recurrent Neural Network Language Modeling Toolkit \[[2](#references)\]. Its statistics are as follows:
+

+<table>
+    <tr>
+        <td>training set</td>
+        <td>validation set</td>
+        <td>test set</td>
+    </tr>
+    <tr>
+        <td>ptb.train.txt</td>
+        <td>ptb.valid.txt</td>
+        <td>ptb.test.txt</td>
+    </tr>
+    <tr>
+        <td>42068 lines</td>
+        <td>3370 lines</td>
+        <td>3761 lines</td>
+    </tr>
+</table>

+
+### Python Dataset Module
+
+We encapsulated the PTB dataset in our Python module `paddle.dataset.imikolov`. This module can
+
+1. download the dataset to `~/.cache/paddle/dataset/imikolov`, if it is not already there, and
+2. [preprocess](#preprocessing) the dataset.
+
+### Preprocessing
+
+We will be training a 5-gram model. Given five words in a window, we will predict the fifth word from the first four words.
+
+The beginning and end of a sentence have special meaning, so we prepend a begin token `<s>` to the sentence and append an end token `<e>` to it. By sliding the five-word window over the sentence, data instances are generated.
+
+For example, the sentence "I have a dream that one day" generates five data instances:
+
+```text
+<s> I have a dream
+I have a dream that
+have a dream that one
+a dream that one day
+dream that one day <e>
+```
+
+At last, each data instance will be converted into an integer sequence according to its words' indices in the dictionary.
+
+## Training
+
+The neural network that we will be using is illustrated in the graph below:
+

+
+ Figure 5. N-gram neural network model in model configuration +

+
+`word2vec/train.py` demonstrates training word2vec using PaddlePaddle:
+
+- Import packages.
+
+```python
+import math
+import numpy
+import paddle.v2 as paddle
+```
+
+- Configure parameters.
+
+```python
+embsize = 32 # word vector dimension
+hiddensize = 256 # hidden layer dimension
+N = 5 # train 5-gram
+```
+
+
+- Functions used to save and load the word dict and embedding table.
+
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
+
+- Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector through a matrix of dimension $|V|\times D$ (D=32 in this example).
+
+```python
+def wordemb(inlayer):
+    wordemb = paddle.layer.table_projection(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+```
+
+- Define the names and types of the inputs to the data layers.
+
+```python
+paddle.init(use_gpu=False, trainer_count=3)
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+# Every layer takes an integer value in the range [0, dict_size)
+firstword = paddle.layer.data(
+    name="firstw", type=paddle.data_type.integer_value(dict_size))
+secondword = paddle.layer.data(
+    name="secondw", type=paddle.data_type.integer_value(dict_size))
+thirdword = paddle.layer.data(
+    name="thirdw", type=paddle.data_type.integer_value(dict_size))
+fourthword = paddle.layer.data(
+    name="fourthw", type=paddle.data_type.integer_value(dict_size))
+nextword = paddle.layer.data(
+    name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+Efirst = wordemb(firstword)
+Esecond = wordemb(secondword)
+Ethird = wordemb(thirdword)
+Efourth = wordemb(fourthword)
+```
+
+- Concatenate the n-1 word embedding vectors into a single feature vector.
+
+```python
+contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+```
+
+- The feature vector goes through a fully connected layer, which outputs a hidden feature vector.
+
+```python
+hidden1 = paddle.layer.fc(input=contextemb,
+                          size=hiddensize,
+                          act=paddle.activation.Sigmoid(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                          bias_attr=paddle.attr.Param(learning_rate=2),
+                          param_attr=paddle.attr.Param(
+                              initial_std=1. / math.sqrt(embsize * 8),
+                              learning_rate=1))
+```
+
+- The hidden feature vector goes through another fully connected layer and is turned into a $|V|$-dimensional vector. At the same time, softmax is applied to get the probability of each word being generated.
+
+```python
+predictword = paddle.layer.fc(input=hidden1,
+                              size=dict_size,
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              act=paddle.activation.Softmax())
+```
+
+- We will use the cross-entropy cost function.
+
+```python
+cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+```
+
+- Create parameters, optimizer, and trainer.
+
+```python
+parameters = paddle.parameters.create(cost)
+adagrad = paddle.optimizer.AdaGrad(
+    learning_rate=3e-3,
+    regularization=paddle.optimizer.L2Regularization(8e-4))
+trainer = paddle.trainer.SGD(cost, parameters, adagrad)
+```
+
+Next, we will begin the training process. `paddle.dataset.imikolov.train()` and `paddle.dataset.imikolov.test()` are our training and test sets. Both functions return a **reader**: in PaddlePaddle, a reader is a Python function which returns a Python iterator that outputs a single data instance at a time.
+
+`paddle.batch` takes a reader as input and outputs a **batched reader**: in PaddlePaddle, a reader outputs a single data instance at a time, while a batched reader outputs a minibatch of data instances.
+
+```python
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 100 == 0:
+            print "Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+
+    if isinstance(event, paddle.event.EndPass):
+        result = trainer.test(
+            paddle.batch(
+                paddle.dataset.imikolov.test(word_dict, N), 32))
+        print "Pass %d, Testing metrics %s" % (event.pass_id, result.metrics)
+        with open("model_%d.tar" % event.pass_id, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+
+trainer.train(
+    paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+    num_passes=100,
+    event_handler=event_handler)
+```
+
+`trainer.train` will start training, and the output of `event_handler` will be similar to the following:
+
+```text
+Pass 0, Batch 0, Cost 7.870579, {'classification_error_evaluator': 1.0}, Testing metrics {'classification_error_evaluator': 0.999591588973999}
+Pass 0, Batch 100, Cost 6.136420, {'classification_error_evaluator': 0.84375}, Testing metrics {'classification_error_evaluator': 0.8328699469566345}
+Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Testing metrics {'classification_error_evaluator': 0.8328542709350586}
+...
+```
+
+After 30 passes, we get an average error rate of around 0.735611.
+
+## Save word dict and embedding table
+
+After training, we can save the word dict and embedding table for future use.
+
+```python
+# save word dict and embedding table
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
+
+
+## Model Application
+
+After the model is trained, we can load the saved model parameters and use them to initialize other models. We can also use the parameters in various applications.
+
+### Viewing Word Vector
+
+Parameters trained by PaddlePaddle can be viewed with `parameters.get()`. For example, we can check the word vector for the word `apple`.
+
+```python
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+
+print embeddings[word_dict['apple']]
+```
+
+```text
+[-0.38961065 -0.02392169 -0.00093231 0.36301503 0.13538605 0.16076435
+-0.0678709 0.1090285 0.42014077 -0.24119169 -0.31847557 0.20410083
+0.04910378 0.19021918 -0.0122014 -0.04099389 -0.16924137 0.1911236
+-0.10917275 0.13068172 -0.23079982 0.42699069 -0.27679482 -0.01472992
+0.2069038 0.09005053 -0.3282454 0.12717034 -0.24218646 0.25304323
+0.19072419 -0.24286366]
+```
+
+### Modifying Word Vector
+
+The word vectors we obtain (`embeddings`) form a numpy array. We can modify this array and set it back into `parameters`.
+
+
+```python
+def modify_embedding(emb):
+    # Add your modification here.
 pass + +modify_embedding(embeddings) +parameters.set("_proj", embeddings) +``` + +### Calculating Cosine Similarity + +Cosine similarity is one way of quantifying the similarity between two vectors; it always lies in the range $[-1, 1]$, and the larger the value, the more similar the two vectors are. Note that `scipy.spatial.distance.cosine` below returns the cosine *distance*, which is one minus the cosine similarity: + + +```python +from scipy import spatial + +emb_1 = embeddings[word_dict['world']] +emb_2 = embeddings[word_dict['would']] + +print spatial.distance.cosine(emb_1, emb_2) +``` + +```text +0.99375076448 +``` + +## Conclusion + +This chapter introduces word embeddings, the relationship between language models and word embeddings, and how to train neural networks to learn word embeddings. + +In information retrieval, the relevance between a query and the document keywords can be computed through the cosine similarity of their word embeddings. In syntactic and semantic analysis, a previously trained word embedding can initialize models for better performance. In document classification, clustering word embeddings can group synonyms in the documents. We hope that readers can use word embedding models in their work after reading this chapter. + + +## References +1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. Journal of Machine Learning Research, 2003, 3(Feb): 1137-1155. +2. Mikolov T, Kombrink S, Deoras A, et al. [RNNLM - recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201. +3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013. +4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605. +5. https://en.wikipedia.org/wiki/Singular_value_decomposition + +

+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/word2vec/calculate_dis.py b/04.word2vec/calculate_dis.py similarity index 100% rename from word2vec/calculate_dis.py rename to 04.word2vec/calculate_dis.py diff --git a/word2vec/format_convert.py b/04.word2vec/format_convert.py similarity index 95% rename from word2vec/format_convert.py rename to 04.word2vec/format_convert.py index f12ad81c0aa0d532d6f337d41479228f5b04ebc9..ddbff62a942d0249bbca49aabd07ef3276b2d15c 100755 --- a/word2vec/format_convert.py +++ b/04.word2vec/format_convert.py @@ -30,25 +30,25 @@ import struct def binary2text(input, output, paraDim): """ - Convert a binary parameter file of embedding model to be a text file. + Convert a binary parameter file of embedding model to be a text file. input: the name of input binary parameter file, the format is: 1) the first 16 bytes is filehead: version(4 bytes): version of paddle, default = 0 floatSize(4 bytes): sizeof(float) = 4 paraCount(8 bytes): total number of parameter - 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes + 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes output: the name of output text parameter file, for example: 0,4,32156096 -0.7845433,1.1937413,-0.1704215,... 0.0000909,0.0009465,-0.0008813,... ... the format is: - 1) the first line is filehead: + 1) the first line is filehead: version=0, floatSize=4, paraCount=32156096 2) other lines print the paramters a) each line prints paraDim paramters splitted by ',' b) there is paraCount/paraDim lines (embedding words) - paraDim: dimension of parameters + paraDim: dimension of parameters """ fi = open(input, "rb") fo = open(output, "w") @@ -78,7 +78,7 @@ def binary2text(input, output, paraDim): def get_para_count(input): """ - Compute the total number of embedding parameters in input text file. + Compute the total number of embedding parameters in input text file. input: the name of input text file """ numRows = 1 @@ -96,14 +96,14 @@ def text2binary(input, output, paddle_head=True): Convert a text parameter file of embedding model to be a binary file. input: the name of input text parameter file, for example: -0.7845433,1.1937413,-0.1704215,... - 0.0000909,0.0009465,-0.0008813,... + 0.0000909,0.0009465,-0.0008813,... ... 
the format is: 1) it doesn't have filehead - 2) each line stores the same dimension of parameters, + 2) each line stores the same dimension of parameters, the separator is commas ',' output: the name of output binary parameter file, the format is: - 1) the first 16 bytes is filehead: + 1) the first 16 bytes is filehead: version(4 bytes), floatSize(4 bytes), paraCount(8 bytes) 2) the next (paraCount * 4) bytes is parameters, each has 4 bytes """ @@ -127,7 +127,7 @@ def text2binary(input, output, paddle_head=True): def main(): """ - Main entry for running format_convert.py + Main entry for running format_convert.py """ usage = "usage: \n" \ "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \ diff --git a/word2vec/image/2d_similarity.png b/04.word2vec/image/2d_similarity.png similarity index 100% rename from word2vec/image/2d_similarity.png rename to 04.word2vec/image/2d_similarity.png diff --git a/word2vec/image/cbow.png b/04.word2vec/image/cbow.png similarity index 100% rename from word2vec/image/cbow.png rename to 04.word2vec/image/cbow.png diff --git a/word2vec/image/cbow_en.png b/04.word2vec/image/cbow_en.png similarity index 100% rename from word2vec/image/cbow_en.png rename to 04.word2vec/image/cbow_en.png diff --git a/word2vec/image/ngram.en.png b/04.word2vec/image/ngram.en.png similarity index 100% rename from word2vec/image/ngram.en.png rename to 04.word2vec/image/ngram.en.png diff --git a/word2vec/image/ngram.png b/04.word2vec/image/ngram.png similarity index 100% rename from word2vec/image/ngram.png rename to 04.word2vec/image/ngram.png diff --git a/word2vec/image/nnlm.png b/04.word2vec/image/nnlm.png similarity index 100% rename from word2vec/image/nnlm.png rename to 04.word2vec/image/nnlm.png diff --git a/word2vec/image/nnlm_en.png b/04.word2vec/image/nnlm_en.png similarity index 100% rename from word2vec/image/nnlm_en.png rename to 04.word2vec/image/nnlm_en.png diff --git a/word2vec/image/sentence_emb.png b/04.word2vec/image/sentence_emb.png similarity index 100% rename from word2vec/image/sentence_emb.png rename to 04.word2vec/image/sentence_emb.png diff --git a/word2vec/image/skipgram.png b/04.word2vec/image/skipgram.png similarity index 100% rename from word2vec/image/skipgram.png rename to 04.word2vec/image/skipgram.png diff --git a/word2vec/image/skipgram_en.png b/04.word2vec/image/skipgram_en.png similarity index 100% rename from word2vec/image/skipgram_en.png rename to 04.word2vec/image/skipgram_en.png diff --git a/word2vec/index.html b/04.word2vec/index.cn.html similarity index 67% rename from word2vec/index.html rename to 04.word2vec/index.cn.html index dbc9c53bef2f5614dcad34a4f22596da5f47b0e9..5a1aaf1229e255d6044a3eb9778c200602202530 100644 --- a/word2vec/index.html +++ b/04.word2vec/index.cn.html @@ -1,3 +1,4 @@ + - - + + + + + +
+
+ + + + + + + diff --git a/04.word2vec/train.py b/04.word2vec/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4f67deb81dfa642ee113c9bc79c52e0fa7e54b --- /dev/null +++ b/04.word2vec/train.py @@ -0,0 +1,111 @@ +import math +import os + +import numpy +import paddle.v2 as paddle + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + +embsize = 32 +hiddensize = 256 +N = 5 + + +def wordemb(inlayer): + wordemb = paddle.layer.table_projection( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)) + return wordemb + + +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings + + +def main(): + paddle.init(use_gpu=with_gpu, trainer_count=3) + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + # Every layer takes integer value of range [0, dict_size) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword = paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc( + input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. 
/ math.sqrt(embsize * 8), learning_rate=1)) + predictword = paddle.layer.fc( + input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + parameters = paddle.parameters.create(cost) + adagrad = paddle.optimizer.AdaGrad( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, parameters, adagrad) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + paddle.batch(paddle.dataset.imikolov.test(word_dict, N), 32)) + print "Pass %d, Testing metrics %s" % (event.pass_id, + result.metrics) + with open("model_%d.tar" % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=100, + event_handler=event_handler) + + # save word dict and embedding table + embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) + save_dict_and_embedding(word_dict, embeddings) + + +if __name__ == '__main__': + main() diff --git a/05.recommender_system/.gitignore b/05.recommender_system/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5 --- /dev/null +++ b/05.recommender_system/.gitignore @@ -0,0 +1,2 @@ +.idea +.ipynb_checkpoints diff --git a/05.recommender_system/README.cn.md b/05.recommender_system/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..365c8a795916ebb59720be94791f9c789df64980 --- /dev/null +++ b/05.recommender_system/README.cn.md @@ -0,0 +1,465 @@ +# 个性化推荐 + +本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/176.html)。 + +## 背景介绍 + +在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 + +个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 + +传统的推荐系统方法主要有: + +- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 +- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 +- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 + +其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户 (User-Based)的推荐[[3](#参考文献)] 、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 + +深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 + +## 效果展示 + 
+我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[0,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 + +``` +Input movie_id: 1962 +Input user_id: 1 +Prediction Score is 4.25 +``` + +## 模型概览 + +本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 + +### YouTube的深度神经网络推荐系统 + +YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: + +

+
+图1. YouTube 推荐系统结构 +

+ +#### 候选生成网络(Candidate Generation Network) + +候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个Youtube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 + +首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 + +

+
+图2. 候选生成网络结构 +

+ +对于一个用户$U$,预测此刻用户要观看的视频$\omega$为视频$i$的概率公式为: + +$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ + +其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。 + +考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。 + +#### 排序网络(Ranking Network) +排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到底排序后将分数较高的一些视频返回给用户。 + +### 融合推荐模型 +本节会使卷积神经网络(Convolutional Neural Networks)来学习电影名称的表示。下面会依次介绍文本卷积神经网络以及融合推荐模型。 + +#### 文本卷积神经网络(CNN) + +卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。 + +卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解: + +

+
+图3. 卷积神经网络文本分类模型 +

+ +假设待处理句子的长度为$n$,其中第$i$个词的词向量(word embedding)为$x_i\in\mathbb{R}^k$,$k$为维度大小。 + +首先,进行词向量的拼接操作:将每$h$个词拼接起来形成一个大小为$h$的词窗口,记为$x_{i:i+h-1}$,它表示词序列$x_{i},x_{i+1},\ldots,x_{i+h-1}$的拼接,其中,$i$表示词窗口中第一个词在整个句子中的位置,取值范围从$1$到$n-h+1$,$x_{i:i+h-1}\in\mathbb{R}^{hk}$。 + +其次,进行卷积操作:把卷积核(kernel)$w\in\mathbb{R}^{hk}$应用于包含$h$个词的窗口$x_{i:i+h-1}$,得到特征$c_i=f(w\cdot x_{i:i+h-1}+b)$,其中$b\in\mathbb{R}$为偏置项(bias),$f$为非线性激活函数,如$sigmoid$。将卷积核应用于句子中所有的词窗口${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$,产生一个特征图(feature map): + +$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ + +接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征$\hat c$,它是特征图中所有元素的最大值: + +$$\hat c=max(c)$$ + +#### 模型概览 + +在融合推荐模型的电影推荐系统中: + +1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: + + - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 + + - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 + +2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 + +3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 + +4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 + +

+ +
+图4. 融合推荐模型 +

+ +## 数据准备 + +### 数据介绍与下载 + +我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 + +Paddle在API中提供了自动加载数据的模块。数据模块为 `paddle.dataset.movielens` + + +```python +import paddle.v2 as paddle +paddle.init(use_gpu=False) +``` + + +```python +# Run this block to show dataset's documentation +# help(paddle.dataset.movielens) +``` + +在原始数据中包含电影的特征数据,用户的特征数据,和用户对电影的评分。 + +例如,其中某一个电影特征为: + + +```python +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + + + + +这表示,电影的id是1,标题是《Toy Story》,该电影被分为到三个类别中。这三个类别是动画,儿童,喜剧。 + + +```python +user_info = paddle.dataset.movielens.user_info() +print user_info.values()[0] +``` + + + + +这表示,该用户ID是1,女性,年龄比18岁还年轻。职业ID是10。 + + +其中,年龄使用下列分布 +* 1: "Under 18" +* 18: "18-24" +* 25: "25-34" +* 35: "35-44" +* 45: "45-49" +* 50: "50-55" +* 56: "56+" + +职业是从下面几种选项里面选则得出: +* 0: "other" or not specified +* 1: "academic/educator" +* 2: "artist" +* 3: "clerical/admin" +* 4: "college/grad student" +* 5: "customer service" +* 6: "doctor/health care" +* 7: "executive/managerial" +* 8: "farmer" +* 9: "homemaker" +* 10: "K-12 student" +* 11: "lawyer" +* 12: "programmer" +* 13: "retired" +* 14: "sales/marketing" +* 15: "scientist" +* 16: "self-employed" +* 17: "technician/engineer" +* 18: "tradesman/craftsman" +* 19: "unemployed" +* 20: "writer" + +而对于每一条训练/测试数据,均为 <用户特征> + <电影特征> + 评分。 + +例如,我们获得第一条训练数据: + + +```python +train_set_creator = paddle.dataset.movielens.train() +train_sample = next(train_set_creator()) +uid = train_sample[0] +mov_id = train_sample[len(user_info[uid].value())] +print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) +``` + + User rates Movie with Score [5.0] + + +即用户1对电影1193的评价为5分。 + +## 模型配置说明 + +下面我们开始根据输入数据的形式配置模型。 + + +```python +uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) +usr_emb = paddle.layer.embedding(input=uid, size=32) +usr_fc = paddle.layer.fc(input=usr_emb, size=32) + +usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) +usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) +usr_gender_fc = paddle.layer.fc(input=usr_gender_emb, size=16) + +usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) +usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) +usr_age_fc = paddle.layer.fc(input=usr_age_emb, size=16) + +usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_job_id() + 1)) +usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) +usr_job_fc = paddle.layer.fc(input=usr_job_emb, size=16) +``` + +如上述代码所示,对于每个用户,我们输入4维特征。其中包括`user_id`,`gender_id`,`age_id`,`job_id`。这几维特征均是简单的整数值。为了后续神经网络处理这些特征方便,我们借鉴NLP中的语言模型,将这几维离散的整数值,变换成embedding取出。分别形成`usr_emb`, `usr_gender_emb`, `usr_age_emb`, `usr_job_emb`。 + + +```python +usr_combined_features = paddle.layer.fc( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], + size=200, + act=paddle.activation.Tanh()) +``` + +然后,我们对于所有的用户特征,均输入到一个全连接层(fc)中。将所有特征融合为一个200维度的特征。 + +进而,我们对每一个电影特征做类似的变换,网络配置为: + + +```python +mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) +mov_emb = paddle.layer.embedding(input=mov_id, size=32) 
+mov_fc = paddle.layer.fc(input=mov_emb, size=32) + +mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) +mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + +movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() +mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) +mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) +mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + +mov_combined_features = paddle.layer.fc( + input=[mov_fc, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) +``` + +电影ID和电影类型分别映射到其对应的特征隐层。对于电影标题名称(title),一个ID序列表示的词语序列,在输入卷积层后,将得到每个时间窗口的特征(序列特征),然后通过在时间维度降采样得到固定维度的特征,整个过程在sequence_conv_pool实现。 + +最后再将电影的特征融合进`mov_combined_features`中。 + + +```python +inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5) +``` + +进而,我们使用余弦相似度计算用户特征与电影特征的相似性。并将这个相似性拟合(回归)到用户评分上。 + + +```python +cost = paddle.layer.square_error_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) +``` + +至此,我们的优化目标就是这个网络配置中的`cost`了。 + +## 训练模型 + +### 定义参数 +神经网络的模型,我们可以简单的理解为网络拓朴结构+参数。之前一节,我们定义出了优化目标`cost`。这个`cost`即为网络模型的拓扑结构。我们开始训练模型,需要先定义出参数。定义方法为: + + +```python +parameters = paddle.parameters.create(cost) +``` + + [INFO 2017-03-06 17:12:13,284 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] + [INFO 2017-03-06 17:12:13,287 networks.py:1478] The output order is [__square_error_cost_0__] + + +`parameters`是模型的所有参数集合。他是一个python的dict。我们可以查看到这个网络中的所有参数名称。因为之前定义模型的时候,我们没有指定参数名称,这里参数名称是自动生成的。当然,我们也可以指定每一个参数名称,方便日后维护。 + + +```python +print parameters.keys() +``` + + [u'___fc_layer_2__.wbias', u'___fc_layer_2__.w2', u'___embedding_layer_3__.w0', u'___embedding_layer_5__.w0', u'___embedding_layer_2__.w0', u'___embedding_layer_1__.w0', u'___fc_layer_1__.wbias', u'___fc_layer_0__.wbias', u'___fc_layer_1__.w0', u'___fc_layer_0__.w2', u'___fc_layer_0__.w3', u'___fc_layer_0__.w0', u'___fc_layer_0__.w1', u'___fc_layer_2__.w1', u'___fc_layer_2__.w0', u'___embedding_layer_4__.w0', u'___sequence_conv_pool_0___conv_fc.w0', u'___embedding_layer_0__.w0', u'___sequence_conv_pool_0___conv_fc.wbias'] + + +### 构造训练(trainer) + +下面,我们根据网络拓扑结构和模型参数来构造出一个本地训练(trainer)。在构造本地训练的时候,我们还需要指定这个训练的优化方法。这里我们使用Adam来作为优化算法。 + + +```python +trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) +``` + + [INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] + [INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__square_error_cost_0__] + + +### 训练 + +下面我们开始训练过程。 + +我们直接使用Paddle提供的数据集读取程序。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和预测数据集。并且通过`feeding`来指定每一个数据和data_layer的对应关系。 + +例如,这里的feeding表示的是,对于数据层 `user_id`,使用了reader中每一条数据的第0个元素。`gender_id`数据层使用了第1个元素。以此类推。 + +```python +feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 +} +``` + +训练过程是完全自动的。我们可以使用event_handler与event_handler_plot来观察训练过程,或进行测试等。这里我们在event_handler_plot里面绘制了训练误差曲线和测试误差曲线。并且保存了模型。 + +```python +def 
event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) +``` + +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 + +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 10 == 0: # every 10 batches, record a train cost + cost_ploter.append(train_title, step, event.cost) + + if step % 1000 == 0: # every 1000 batches, record a test cost + result = trainer.test( + reader=paddle.batch( + paddle.dataset.movielens.test(), batch_size=256), + feeding=feeding) + cost_ploter.append(test_title, step, result.cost) + + if step % 100 == 0: # every 100 batches, update cost plot + cost_ploter.plot() + + step += 1 +``` + +```python +trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256), + event_handler=event_handler_plot, + feeding=feeding, + num_passes=2) +``` + + +![png](./image/output_32_0.png) + +## 应用模型 + +在训练了几轮以后,您可以对模型进行推断。我们可以使用任意一个用户ID和电影ID,来预测该用户对该电影的评分。示例程序为: + + +```python +import copy +user_id = 234 +movie_id = 345 + +user = user_info[user_id] +movie = movie_info[movie_id] + +feature = user.value() + movie.value() + +infer_dict = copy.copy(feeding) +del infer_dict['score'] + +prediction = paddle.infer(inference, parameters=parameters, input=[feature], feeding=infer_dict) +score = (prediction[0][0] + 5.0) / 2 +print "[Predict] User %d Rating Movie %d With Score %.2f"%(user_id, movie_id, score) +``` + + [INFO 2017-03-06 17:17:08,132 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title] + [INFO 2017-03-06 17:17:08,134 networks.py:1478] The output order is [__cos_sim_0__] + + + [Predict] User 234 Rating Movie 345 With Score 4.16 + + +## 总结 + +本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 + +## 参考文献 + +1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. +2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. +3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. +4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. +5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA +6. Yuan, Jianbo, et al. 
["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). +7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. + + +
+知识共享许可协议
 本教程由 PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/05.recommender_system/README.md b/05.recommender_system/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cea2c33451cfb52996b0a2aadcc9ee9b80bed548 --- /dev/null +++ b/05.recommender_system/README.md @@ -0,0 +1,392 @@ +# Personalized Recommendation + +The source code for this tutorial is available [here](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system). For instructions on how to run it, please refer to [this guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book). + + +## Background + +The recommender system is a core component of e-commerce, online video, and online reading services. There are several different approaches for recommender systems to learn from user behavior and product properties and to understand users' interests. + +- User behavior-based approach. A well-known method of this approach is collaborative filtering, which assumes that if two users made similar purchases, they share common interests and are likely to go on making the same decisions. Some variants of collaborative filtering are user-based[[3](#references)], item-based[[4](#references)], social network based[[5](#references)], and model-based. + +- Content-based approach[[1](#references)]. This approach represents product properties and user interests as feature vectors in the same space, so that it can measure how interested a user is in a product by the distance between the two feature vectors. + +- Hybrid approach[[2](#references)]. This one combines the above two approaches so that they compensate for each other's weaknesses, such as the data sparsity problem[[6](#references)]. + +This tutorial explains a deep learning based hybrid approach and its implementation in PaddlePaddle. We are going to train a model using a dataset that includes user information, movie information, and ratings. Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs. + + +## Model Overview + +To learn more about deep learning based recommendation, let us start by going over the YouTube recommender system[[7](#references)] before introducing our hybrid model. + + +### YouTube's Deep Learning Recommendation Model + +YouTube is a video-sharing Web site with one of the largest user bases in the world. Its recommender system serves more than a billion users. This system is composed of two major parts: candidate generation and ranking. The former selects a few hundred candidates from millions of videos, and the latter ranks them and outputs the top 10; a small sketch of this two-stage idea follows Figure 1. + +


+
+Figure 1. YouTube recommender system overview. +

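 + +The following sketch illustrates, with plain NumPy, how such a two-stage pipeline narrows a large corpus down to a handful of results. Everything here is a hypothetical stand-in -- the corpus size, the `top_k` helper, and the reuse of one shared vector space for both stages are simplifying assumptions for illustration, not YouTube's or PaddlePaddle's actual code: + +```python +import numpy + + +def top_k(scores, k): + # Indices of the k highest-scoring items. + return numpy.argpartition(-scores, k)[:k] + + +# Hypothetical corpus: feature vectors of 100,000 videos and one user vector. +rng = numpy.random.RandomState(0) +video_vecs = rng.rand(100000, 32) +user_vec = rng.rand(32) + +# Stage 1, candidate generation: nearest-neighbor search in dot-product +# space keeps a few hundred videos out of the whole corpus. +candidates = top_k(video_vecs.dot(user_vec), 300) + +# Stage 2, ranking: re-score only the candidates (a real system would use +# a richer model and richer features here) and output the top 10. +rank_scores = video_vecs[candidates].dot(user_vec) +top10 = candidates[top_k(rank_scores, 10)] +print top10 +``` 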
+ +#### Candidate Generation Network + +YouTube models candidate generation as a multi-class classification problem with a huge number of classes equal to the number of videos. The architecture of the model is as follows: + +

+
+Figure 2. Deep candidate generation model. +

 + +The first stage of this model maps watching history and search queries into fixed-length representative features. Then, an MLP (multi-layer perceptron, as described in the [Recognize Digits](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.md) tutorial) takes the concatenation of all representative vectors. The output of the MLP represents the user's *intrinsic interests*. At training time, it is used together with a softmax output layer to minimize the classification error. At serving time, it is used to compute the relevance of the user to all videos. + +For a user $U$, the predicted watching probability of video $i$ is + +$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ + +where $u$ is the representative vector of user $U$, $V$ is the corpus of all videos, and $v_i$ is the representative vector of the $i$-th video. $u$ and $v_i$ are vectors of the same length, so we can compute their dot product using a fully connected layer. + +This model could have a performance issue, as the softmax output covers millions of classification labels. To optimize performance, at training time the authors down-sample negative samples, so the actual number of classes is reduced to thousands. At serving time, the authors ignore the normalization of the softmax outputs, because the results are only used for ranking. + +#### Ranking Network + +The architecture of the ranking network is similar to that of the candidate generation network. Similar to ranking models widely used in online advertising, it uses rich features like video ID, last watching time, etc. The output layer of the ranking network is a weighted logistic regression, which rates all candidate videos. + +### Hybrid Model + +In this section, let us introduce our movie recommendation system. In particular, we feed movie titles into a text convolution network to get a fixed-length representative feature vector. Accordingly, we will first introduce the convolutional neural network for texts and then the hybrid recommendation model. + +#### Convolutional Neural Networks for Texts (CNN) + +**Convolutional Neural Networks** are frequently applied to data with a grid-like topology, such as two-dimensional images and one-dimensional texts. A CNN can extract multiple local features, combine them, and produce high-level abstractions, which correspond to semantic understanding. Empirically, CNN is shown to be efficient for image and text modeling. + +A CNN mainly consists of convolution and pooling operations, with versatile combinations in various applications. Here, we briefly describe the CNN shown in Figure 3. + + +


+
+Figure 3. CNN for text modeling. +

 + +Let $n$ be the length of the sentence to process, and let the $i$-th word have embedding $x_i\in\mathbb{R}^k$, where $k$ is the embedding dimensionality. + +First, we concatenate the words by piecing together every $h$ words, each as a window of length $h$. This window is denoted as $x_{i:i+h-1}$, consisting of $x_{i},x_{i+1},\ldots,x_{i+h-1}$, where $x_i$ is the first word in the window and $i$ takes values ranging from $1$ to $n-h+1$: $x_{i:i+h-1}\in\mathbb{R}^{hk}$. + +Next, we apply the convolution operation: we apply the kernel $w\in\mathbb{R}^{hk}$ in each window, extracting features $c_i=f(w\cdot x_{i:i+h-1}+b)$, where $b\in\mathbb{R}$ is the bias and $f$ is a non-linear activation function such as $sigmoid$. Convolving by the kernel at every window ${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$ produces a feature map of the following form: + +$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ + +Next, we apply *max pooling* over time to obtain a representation $\hat c$ of the whole sentence, which is the maximum element across the feature map: + +$$\hat c=max(c)$$ + +#### Model Structure of the Hybrid Model + +In our network, the input includes features of users and movies. The user feature includes four properties: user ID, gender, occupation, and age. Movie features include their IDs, genres, and titles. + +We use fully-connected layers to map user features into representative feature vectors and concatenate them. The processing of movie features is similar, except for movie titles -- we feed titles into a text convolution network as described in the above section to get a fixed-length representative feature vector. + +Given the feature vectors of users and movies, we compute the relevance using cosine similarity and minimize the squared error at training time; a small numerical sketch of this scoring follows Figure 4. + +


+
+Figure 4. A hybrid recommendation model. +

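 + +To make the scoring concrete, here is a minimal NumPy sketch of the cosine-similarity score and the squared-error training loss described above; the 200-dimensional vectors and the ground-truth rating are made-up toy values rather than outputs of the real network: + +```python +import numpy + + +def cosine_similarity(u, v): + # cos(u, v) = u.v / (||u|| * ||v||), which lies in [-1, 1]. + return u.dot(v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v)) + + +# Hypothetical 200-dimensional user and movie feature vectors. +rng = numpy.random.RandomState(42) +user_feat = rng.rand(200) +mov_feat = rng.rand(200) + +# Scale the similarity by 5 to match the 1~5 rating range, as the scale=5 +# argument of paddle.layer.cos_sim does in the model configuration below. +predicted_rating = 5 * cosine_similarity(user_feat, mov_feat) + +# At training time we minimize the squared error against the true rating. +true_rating = 4.0 +loss = (predicted_rating - true_rating) ** 2 +print predicted_rating, loss +``` 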
 + +## Dataset + +We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m.zip) dataset to train our model. This dataset includes one million ratings of roughly 4,000 movies from 6,000 users. Each rating is an integer in the range 1~5. Thanks to GroupLens Research for collecting, processing and publishing the dataset. + +The `paddle.v2.dataset` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `movielens`, `wmt14`, etc. There is no need for us to manually download and preprocess the `MovieLens` dataset. + +The raw `MovieLens` dataset contains movie ratings and relevant features of both movies and users. +For instance, one movie's features could be: + +```python +import paddle.v2 as paddle +movie_info = paddle.dataset.movielens.movie_info() +print movie_info.values()[0] +``` + +```text + +``` + +One user's features could be: + +```python +user_info = paddle.dataset.movielens.user_info() +print user_info.values()[0] +``` + +```text + +``` + +In this dataset, the age distribution is as follows: + +```text +1: "Under 18" +18: "18-24" +25: "25-34" +35: "35-44" +45: "45-49" +50: "50-55" +56: "56+" +``` + +A user's occupation is selected from the following options: + +```text +0: "other" or not specified +1: "academic/educator" +2: "artist" +3: "clerical/admin" +4: "college/grad student" +5: "customer service" +6: "doctor/health care" +7: "executive/managerial" +8: "farmer" +9: "homemaker" +10: "K-12 student" +11: "lawyer" +12: "programmer" +13: "retired" +14: "sales/marketing" +15: "scientist" +16: "self-employed" +17: "technician/engineer" +18: "tradesman/craftsman" +19: "unemployed" +20: "writer" +``` + +Each record consists of three main components: user features, movie features, and a movie rating. +As a simple example, consider the following: + +```python +train_set_creator = paddle.dataset.movielens.train() +train_sample = next(train_set_creator()) +uid = train_sample[0] +mov_id = train_sample[len(user_info[uid].value())] +print "User %s rates Movie %s with Score %s"%(user_info[uid], movie_info[mov_id], train_sample[-1]) +``` + +```text +User rates Movie with Score [5.0] +``` + +The output shows that user 1 gave movie `1193` a rating of 5. + +After issuing the command `python train.py`, training will start immediately. The details will be unpacked in the following sections to show how it works. + +## Model Architecture + +### Initialize PaddlePaddle + +First, we must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc). 
 + +```python +import paddle.v2 as paddle +paddle.init(use_gpu=False) +``` + +### Model Configuration + +```python +uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) +usr_emb = paddle.layer.embedding(input=uid, size=32) +usr_fc = paddle.layer.fc(input=usr_emb, size=32) + +usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) +usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) +usr_gender_fc = paddle.layer.fc(input=usr_gender_emb, size=16) + +usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) +usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) +usr_age_fc = paddle.layer.fc(input=usr_age_emb, size=16) + +usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_job_id() + 1)) +usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) +usr_job_fc = paddle.layer.fc(input=usr_job_emb, size=16) +``` + +As shown in the code above, the input for each user consists of four integer features: `user_id`, `gender_id`, `age_id` and `job_id`. To make these discrete values convenient for the network to process, we follow the practice of language models in NLP and map each of them to an embedding: `usr_emb`, `usr_gender_emb`, `usr_age_emb` and `usr_job_emb`. + +```python +usr_combined_features = paddle.layer.fc( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], + size=200, + act=paddle.activation.Tanh()) +``` + +Then we feed all the user feature vectors into a fully-connected layer, which fuses them into a single 200-dimensional feature vector. + +Furthermore, we do a similar transformation for each movie feature. The model configuration is: + +```python +mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) +mov_emb = paddle.layer.embedding(input=mov_id, size=32) +mov_fc = paddle.layer.fc(input=mov_emb, size=32) + +mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) +mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + +movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() +mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) +mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) +mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + +mov_combined_features = paddle.layer.fc( + input=[mov_fc, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) +``` + +The movie title, a sequence of words represented by an integer word-index sequence, is fed into a `sequence_conv_pool` layer, which applies convolution and pooling on the time dimension. Because the pooling is done over the time dimension, the output is a fixed-length vector regardless of the length of the input sequence. + +Finally, we can use cosine similarity to calculate the similarity between the user features and the movie features. 
 + +```python +inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5) +cost = paddle.layer.square_error_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) +``` + +## Model Training + +### Define Parameters + +First, we define the model parameters according to the previous model configuration `cost`. + +```python +# Create parameters +parameters = paddle.parameters.create(cost) +``` + +### Create Trainer + +Before creating a training module, we also need to choose an optimization algorithm. Here we specify the Adam optimization algorithm via `paddle.optimizer`. + +```python +trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) +``` + +```text +[INFO 2017-03-06 17:12:13,378 networks.py:1472] The input order is [user_id, gender_id, age_id, job_id, movie_id, category_id, movie_title, score] +[INFO 2017-03-06 17:12:13,379 networks.py:1478] The output order is [__square_error_cost_0__] +``` + +### Training + +`paddle.dataset.movielens.train` yields records during each pass; after shuffling, batched input is generated for training. + +```python +reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256) +``` + +`feeding` specifies the correspondence between each field of the yielded records and the `paddle.layer.data` layers. For instance, the first column of the data generated by `movielens.train` corresponds to the `user_id` feature. + +```python +feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 +} +``` + +The callback functions `event_handler` and `event_handler_plot` are called during training whenever a pre-defined event happens. + +```python +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) +``` + +```python +from paddle.v2.plot import Ploter + +train_title = "Train cost" +test_title = "Test cost" +cost_ploter = Ploter(train_title, test_title) + +step = 0 + +def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + if step % 10 == 0: # every 10 batches, record a train cost + cost_ploter.append(train_title, step, event.cost) + + if step % 1000 == 0: # every 1000 batches, record a test cost + result = trainer.test( + reader=paddle.batch( + paddle.dataset.movielens.test(), batch_size=256), + feeding=feeding) + cost_ploter.append(test_title, step, result.cost) + + if step % 100 == 0: # every 100 batches, update cost plot + cost_ploter.plot() + + step += 1 +``` + +Finally, we can invoke `trainer.train` to start training: + +```python +trainer.train( + reader=reader, + event_handler=event_handler_plot, + feeding=feeding, + num_passes=2) +``` + +## Conclusion + +This tutorial goes over traditional approaches to recommender systems as well as a deep learning based approach. We also show how to train and use the model with PaddlePaddle. Deep learning has already proven useful in computer vision and NLP; we look forward to its new successes in recommender systems. + +## References + +1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. +2. Robin Burke, [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Vol. 4321, Springer-Verlag, Berlin, Germany, May 2007, 978-3-540-72078-2. +3. P. Resnick, N. Iacovou, et al. "[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)", Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. +4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th International Conference on World Wide Web*. ACM, 2001. +5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. +6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). +7. Covington P, Adams J, Sargin E. [Deep neural networks for YouTube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. + +

+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/05.recommender_system/image/Deep_candidate_generation_model_architecture.en.png similarity index 100% rename from recommender_system/image/Deep_candidate_generation_model_architecture.en.png rename to 05.recommender_system/image/Deep_candidate_generation_model_architecture.en.png diff --git a/recommender_system/image/Deep_candidate_generation_model_architecture.png b/05.recommender_system/image/Deep_candidate_generation_model_architecture.png similarity index 100% rename from recommender_system/image/Deep_candidate_generation_model_architecture.png rename to 05.recommender_system/image/Deep_candidate_generation_model_architecture.png diff --git a/recommender_system/image/YouTube_Overview.en.png b/05.recommender_system/image/YouTube_Overview.en.png similarity index 100% rename from recommender_system/image/YouTube_Overview.en.png rename to 05.recommender_system/image/YouTube_Overview.en.png diff --git a/recommender_system/image/YouTube_Overview.png b/05.recommender_system/image/YouTube_Overview.png similarity index 100% rename from recommender_system/image/YouTube_Overview.png rename to 05.recommender_system/image/YouTube_Overview.png diff --git a/05.recommender_system/image/output_32_0.png b/05.recommender_system/image/output_32_0.png new file mode 100644 index 0000000000000000000000000000000000000000..7fd97b9cc3a0b9105b41591af4e8f8e4646bd681 Binary files /dev/null and b/05.recommender_system/image/output_32_0.png differ diff --git a/recommender_system/image/rec_regression_network.png b/05.recommender_system/image/rec_regression_network.png similarity index 100% rename from recommender_system/image/rec_regression_network.png rename to 05.recommender_system/image/rec_regression_network.png diff --git a/05.recommender_system/image/rec_regression_network_en.png b/05.recommender_system/image/rec_regression_network_en.png new file mode 100755 index 0000000000000000000000000000000000000000..6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84 Binary files /dev/null and b/05.recommender_system/image/rec_regression_network_en.png differ diff --git a/05.recommender_system/image/text_cnn.png b/05.recommender_system/image/text_cnn.png new file mode 100644 index 0000000000000000000000000000000000000000..61e63d9147cbc2901706ef80776d706e5368c3c5 Binary files /dev/null and b/05.recommender_system/image/text_cnn.png differ diff --git a/05.recommender_system/image/text_cnn_en.png b/05.recommender_system/image/text_cnn_en.png new file mode 100644 index 0000000000000000000000000000000000000000..fbcae2be81141be955076e877b94b0ea5d7e4d4a Binary files /dev/null and b/05.recommender_system/image/text_cnn_en.png differ diff --git a/05.recommender_system/index.cn.html b/05.recommender_system/index.cn.html new file mode 100644 index 0000000000000000000000000000000000000000..97f13fdee3b1674184f5dbee771ca3301a24c5de --- /dev/null +++ b/05.recommender_system/index.cn.html @@ -0,0 +1,529 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/05.recommender_system/index.html b/05.recommender_system/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ff0a8835aa1aec447e2a8fa6fce78aa8e5a49878 --- /dev/null +++ b/05.recommender_system/index.html @@ -0,0 +1,456 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/05.recommender_system/train.py b/05.recommender_system/train.py new file mode 100644 index 0000000000000000000000000000000000000000..e1f3853f5ed0f0d2b1f66494bd98e33479bf6601 --- /dev/null +++ b/05.recommender_system/train.py @@ -0,0 +1,135 @@ +import paddle.v2 as paddle +import cPickle +import copy +import os + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + + +def get_usr_combined_features(): + uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) + usr_emb = paddle.layer.embedding(input=uid, size=32) + usr_fc = paddle.layer.fc(input=usr_emb, size=32) + + usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) + usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) + usr_gender_fc = paddle.layer.fc(input=usr_gender_emb, size=16) + + usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) + usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) + usr_age_fc = paddle.layer.fc(input=usr_age_emb, size=16) + + usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_job_id() + 1)) + usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) + usr_job_fc = paddle.layer.fc(input=usr_job_emb, size=16) + + usr_combined_features = paddle.layer.fc( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], + size=200, + act=paddle.activation.Tanh()) + return usr_combined_features + + +def get_mov_combined_features(): + movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() + mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) + mov_emb = paddle.layer.embedding(input=mov_id, size=32) + mov_fc = paddle.layer.fc(input=mov_emb, size=32) + + mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) + mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + + mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) + mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) + mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + + mov_combined_features = paddle.layer.fc( + input=[mov_fc, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) + return mov_combined_features + + +def main(): + paddle.init(use_gpu=with_gpu) + usr_combined_features = get_usr_combined_features() + mov_combined_features = get_mov_combined_features() + inference = paddle.layer.cos_sim( + a=usr_combined_features, b=mov_combined_features, size=1, scale=5) + cost = paddle.layer.square_error_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) + + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD( + cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam(learning_rate=1e-4)) + feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + } + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost 
%.2f" % ( + event.pass_id, event.batch_id, event.cost) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256), + event_handler=event_handler, + feeding=feeding, + num_passes=1) + + user_id = 234 + movie_id = 345 + + user = paddle.dataset.movielens.user_info()[user_id] + movie = paddle.dataset.movielens.movie_info()[movie_id] + + feature = user.value() + movie.value() + + infer_dict = copy.copy(feeding) + del infer_dict['score'] + + prediction = paddle.infer( + output_layer=inference, + parameters=parameters, + input=[feature], + feeding=infer_dict) + print(prediction + 5) / 2 + + +if __name__ == '__main__': + main() diff --git a/understand_sentiment/.gitignore b/06.understand_sentiment/.gitignore similarity index 100% rename from understand_sentiment/.gitignore rename to 06.understand_sentiment/.gitignore diff --git a/understand_sentiment/README.md b/06.understand_sentiment/README.cn.md similarity index 74% rename from understand_sentiment/README.md rename to 06.understand_sentiment/README.cn.md index 2cfb0f2ad5c269d134eea621ea59a6bc6809cb61..0bd95e4781d7e46f21e2c4437758cfdee8fd8094 100644 --- a/understand_sentiment/README.md +++ b/06.understand_sentiment/README.cn.md @@ -1,8 +1,9 @@ # 情感分析 -本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +本教程源代码目录在[book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/177.html)。 ## 背景介绍 + 在自然语言处理中,情感分析一般是指判断一段文本所表达的情绪状态。其中,一段文本可以是一个句子,一个段落或一个文档。情绪状态可以是两类,如(正面,负面),(高兴,悲伤);也可以是三类,如(积极,消极,中性)等等。情感分析的应用场景十分广泛,如把用户在购物网站(亚马逊、天猫、淘宝等)、旅游网站、电影评论网站上发表的评论分成正面评论和负面评论;或为了分析用户对于某一产品的整体使用感受,抓取产品的用户评论并进行情感分析等等。表格1展示了对电影评论进行情感分析的例子: | 电影评论 | 类别 | @@ -19,48 +20,37 @@ 对于一段文本,BOW表示会忽略其词顺序、语法和句法,将这段文本仅仅看做是一个词集合,因此BOW方法并不能充分表示文本的语义信息。例如,句子“这部电影糟糕透了”和“一个乏味,空洞,没有内涵的作品”在情感分析中具有很高的语义相似度,但是它们的BOW表示的相似度为0。又如,句子“一个空洞,没有内涵的作品”和“一个不空洞而且有内涵的作品”的BOW相似度很高,但实际上它们的意思很不一样。 本章我们所要介绍的深度学习模型克服了BOW表示的上述缺陷,它在考虑词顺序的基础上把文本映射到低维度的语义空间,并且以端对端(end to end)的方式进行文本表示及分类,其性能相对于传统方法有显著的提升\[[1](#参考文献)\]。 + ## 模型概览 本章所使用的文本表示模型为卷积神经网络(Convolutional Neural Networks)和循环神经网络(Recurrent Neural Networks)及其扩展。下面依次介绍这几个模型。 -### 文本卷积神经网络(CNN) -卷积神经网络经常用来处理具有类似网格拓扑结构(grid-like topology)的数据。例如,图像可以视为二维网格的像素点,自然语言可以视为一维的词序列。卷积神经网络可以提取多种局部特征,并对其进行组合抽象得到更高级的特征表示。实验表明,卷积神经网络能高效地对图像及文本问题进行建模处理。 - -卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以一种简单的文本分类卷积神经网络为例进行讲解\[[1](#参考文献)\],如图1所示: -

-
-图1. 卷积神经网络文本分类模型 -

-假设待处理句子的长度为$n$,其中第$i$个词的词向量(word embedding)为$x_i\in\mathbb{R}^k$,$k$为维度大小。 - -首先,进行词向量的拼接操作:将每$h$个词拼接起来形成一个大小为$h$的词窗口,记为$x_{i:i+h-1}$,它表示词序列$x_{i},x_{i+1},\ldots,x_{i+h-1}$的拼接,其中,$i$表示词窗口中第一个词在整个句子中的位置,取值范围从$1$到$n-h+1$,$x_{i:i+h-1}\in\mathbb{R}^{hk}$。 - -其次,进行卷积操作:把卷积核(kernel)$w\in\mathbb{R}^{hk}$应用于包含$h$个词的窗口$x_{i:i+h-1}$,得到特征$c_i=f(w\cdot x_{i:i+h-1}+b)$,其中$b\in\mathbb{R}$为偏置项(bias),$f$为非线性激活函数,如$sigmoid$。将卷积核应用于句子中所有的词窗口${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$,产生一个特征图(feature map): - -$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$ -接下来,对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征$\hat c$,它是特征图中所有元素的最大值: +### 文本卷积神经网络简介(CNN) -$$\hat c=max(c)$$ +我们在[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过应用于文本数据的卷积神经网络模型的计算过程,这里进行一个简单的回顾。 -在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵(上文中的单个卷积核参数$w$相当于矩阵的某一行),这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子(图1作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作)。 - -最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。 +对卷积神经网络来说,首先使用卷积处理输入的词向量序列,产生一个特征图(feature map),对特征图采用时间维度上的最大池化(max pooling over time)操作得到此卷积核对应的整句话的特征,最后,将所有卷积核得到的特征拼接起来即为文本的定长向量表示,对于文本分类问题,将其连接至softmax即构建出完整的模型。在实际应用中,我们会使用多个卷积核来处理句子,窗口大小相同的卷积核堆叠起来形成一个矩阵,这样可以更高效的完成运算。另外,我们也可使用窗口大小不同的卷积核来处理句子,[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节的图3作为示意画了四个卷积核,不同颜色表示不同大小的卷积核操作。 对于一般的短文本分类问题,上文所述的简单的文本卷积网络即可达到很高的正确率\[[1](#参考文献)\]。若想得到更抽象更高级的文本特征表示,可以构建深层文本卷积神经网络\[[2](#参考文献),[3](#参考文献)\]。 + ### 循环神经网络(RNN) + 循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。 +


-图2. 循环神经网络按时间展开的示意图 +图1. 循环神经网络按时间展开的示意图

-循环神经网络按时间展开后如图2所示:在第$t$时刻,网络读入第$t$个输入$x_t$(向量表示)及前一时刻隐层的状态值$h_{t-1}$(向量表示,$h_0$一般初始化为$0$向量),计算得出本时刻隐层的状态值$h_t$,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为$f$,则其公式可表示为: + +循环神经网络按时间展开后如图1所示:在第$t$时刻,网络读入第$t$个输入$x_t$(向量表示)及前一时刻隐层的状态值$h_{t-1}$(向量表示,$h_0$一般初始化为$0$向量),计算得出本时刻隐层的状态值$h_t$,重复这一步骤直至读完所有输入。如果将循环神经网络所表示的函数记为$f$,则其公式可表示为: $$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$ 其中$W_{xh}$是输入到隐层的矩阵参数,$W_{hh}$是隐层到隐层的矩阵参数,$b_h$为隐层的偏置向量(bias)参数,$\sigma$为$sigmoid$函数。 - + 在处理自然语言时,一般会先将词(one-hot表示)映射为其词向量(word embedding)表示,然后再作为循环神经网络每一时刻的输入$x_t$。此外,可以根据实际需要的不同在循环神经网络的隐层上连接其它层。如,可以把一个循环神经网络的隐层输出连接至下一个循环神经网络的输入构建深层(deep or stacked)循环神经网络,或者提取最后一个时刻的隐层状态作为句子表示进而使用分类模型等等。 ### 长短期记忆网络(LSTM) + 对于较长的序列数据,循环神经网络的训练过程中容易出现梯度消失或爆炸现象\[[6](#参考文献)\]。为了解决这一问题,Hochreiter S, Schmidhuber J. (1997)提出了LSTM(long short term memory\[[5](#参考文献)\])。 相比于简单的循环神经网络,LSTM增加了记忆单元$c$、输入门$i$、遗忘门$f$及输出门$o$。这些门及记忆单元组合起来大大提升了循环神经网络处理长序列数据的能力。若将基于LSTM的循环神经网络表示的函数记为$F$,则其公式为: @@ -75,27 +65,34 @@ c_t & = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c)\\\\ o_t & = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o)\\\\ h_t & = o_t\odot tanh(c_t)\\\\ \end{align} -其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图3所示: +其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示: +


-图3. 时刻$t$的LSTM [7] +图2. 时刻$t$的LSTM [7]

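+
+为了帮助理解上面的公式,这里给出一个按公式逐项计算单个时刻LSTM的NumPy示意。需要说明:这只是一个基于上述公式的最小示意,函数名与权重的组织方式均为本文的假设,并非PaddlePaddle的接口;后文实验中使用的`paddle.layer.lstmemory`封装了这一计算。
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def lstm_step(x_t, h_prev, c_prev, W, b):
+    # 每个 W[k] 作用于拼接后的 [x_t, h_{t-1}];W['ci']、W['cf']、W['co']
+    # 为连接记忆单元的 peephole 参数,按逐元素方式相乘
+    z = np.concatenate([x_t, h_prev])
+    i = sigmoid(W['i'].dot(z) + W['ci'] * c_prev + b['i'])  # 输入门
+    f = sigmoid(W['f'].dot(z) + W['cf'] * c_prev + b['f'])  # 遗忘门
+    c = f * c_prev + i * np.tanh(W['c'].dot(z) + b['c'])    # 记忆单元
+    o = sigmoid(W['o'].dot(z) + W['co'] * c + b['o'])       # 输出门
+    return o * np.tanh(c), c                                # 返回 h_t 与 c_t
+```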
+ LSTM通过给简单的循环神经网络增加记忆及控制门的方式,增强了其处理远距离依赖问题的能力。类似原理的改进还有Gated Recurrent Unit (GRU)\[[8](#参考文献)\],其设计更为简洁一些。**这些改进虽然各有不同,但是它们的宏观描述却与简单的循环神经网络一样(如图1所示),即隐状态依据当前输入及前一时刻的隐状态来改变,不断地循环这一过程直至输入处理完毕:** $$ h_t=Recurrent(x_t,h_{t-1})$$ 其中,$Recurrent$可以表示简单的循环神经网络、GRU或LSTM。 + ### 栈式双向LSTM(Stacked Bidirectional LSTM) + 对于正常顺序的循环神经网络,$h_t$包含了$t$时刻之前的输入信息,也就是上文信息。同样,为了得到下文信息,我们可以使用反方向(将输入逆序处理)的循环神经网络。结合构建深层循环神经网络的方法(深层神经网络往往能得到更抽象和高级的特征表示),我们可以通过构建更加强有力的基于LSTM的栈式双向循环神经网络\[[9](#参考文献)\],来对时序数据进行建模。 -如图4所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 +如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。 +


-图4. 栈式双向LSTM用于文本分类 +图3. 栈式双向LSTM用于文本分类

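+
+上文多次用到“时间维度上的最大池化”:对隐状态序列的每一维特征在所有时刻上取最大值,从而把变长的序列压缩为定长向量。下面用NumPy做一个最小示意(矩阵形状为本文假设,实际实验中由`paddle.pooling.Max`完成):
+
+```python
+import numpy as np
+
+# 假设顶层LSTM在 T 个时刻的输出堆叠成 T x d 的矩阵(T 随句子长度变化)
+T, d = 7, 4
+hidden_seq = np.random.randn(T, d)
+
+# 沿时间维度逐维取最大值:无论 T 多大,都得到长度固定为 d 的句子向量
+sentence_vec = hidden_seq.max(axis=0)
+print(sentence_vec.shape)  # (4,)
+```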
## 示例程序 + ### 数据集介绍 + 我们以[IMDB情感分析数据集](http://ai.stanford.edu/%7Eamaas/data/sentiment/)为例进行介绍。IMDB数据集的训练集和测试集分别包含25000个已标注过的电影评论。其中,负面评论的得分小于等于4,正面评论的得分大于等于7,满分10分。 ```text aclImdb @@ -108,20 +105,22 @@ aclImdb ``` Paddle在`dataset/imdb.py`中提实现了imdb数据集的自动下载和读取,并提供了读取字典、训练数据、测试数据等API。 -``` +```python import sys -import paddle.trainer_config_helpers.attrs as attrs -from paddle.trainer_config_helpers.poolings import MaxPooling import paddle.v2 as paddle ``` ## 配置模型 -在该示例中,我们实现了两种文本分类算法,分别基于上文所述的[文本卷积神经网络](#文本卷积神经网络(CNN))和[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。 + +在该示例中,我们实现了两种文本分类算法,分别基于[推荐系统](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system)一节介绍过的文本卷积神经网络,以及[栈式双向LSTM](#栈式双向LSTM(Stacked Bidirectional LSTM))。 + ### 文本卷积神经网络 -``` + +```python def convolution_net(input_dim, class_dim=2, emb_dim=128, - hid_dim=128): + hid_dim=128, + is_predict=False): data = paddle.layer.data("word", paddle.data_type.integer_value_sequence(input_dim)) emb = paddle.layer.embedding(input=data, size=emb_dim) @@ -132,18 +131,24 @@ def convolution_net(input_dim, output = paddle.layer.fc(input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax()) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost + if not is_predict: + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + else: + return output ``` 网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。 + ### 栈式双向LSTM -``` + +```python def stacked_lstm_net(input_dim, class_dim=2, emb_dim=128, hid_dim=512, - stacked_num=3): + stacked_num=3, + is_predict=False): """ A Wrapper for sentiment classification task. This network uses bi-directional recurrent network, @@ -159,11 +164,10 @@ def stacked_lstm_net(input_dim, """ assert stacked_num % 2 == 1 - layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) - fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) - lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + fc_para_attr = paddle.attr.Param(learning_rate=1e-3) + lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) 
relu = paddle.activation.Relu() linear = paddle.activation.Linear() @@ -176,7 +180,7 @@ def stacked_lstm_net(input_dim, act=linear, bias_attr=bias_attr) lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + input=fc1, act=relu, bias_attr=bias_attr) inputs = [fc1, lstm1] for i in range(2, stacked_num + 1): @@ -189,40 +193,46 @@ def stacked_lstm_net(input_dim, input=fc, reverse=(i % 2) == 0, act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) + bias_attr=bias_attr) inputs = [fc, lstm] - fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling()) - lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling()) + fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max()) + lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=paddle.pooling.Max()) output = paddle.layer.fc(input=[fc_last, lstm_last], size=class_dim, act=paddle.activation.Softmax(), bias_attr=bias_attr, param_attr=para_attr) - lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) - cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost + if not is_predict: + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + else: + return output ``` 网络的输入`stacked_num`表示的是LSTM的层数,需要是奇数,确保最高层LSTM正向。Paddle里面是通过一个fc和一个lstmemory来实现基于LSTM的循环神经网络。 + ## 训练模型 -``` + +```python if __name__ == '__main__': # init paddle.init(use_gpu=False) ``` 启动paddle程序,use_gpu=False表示用CPU训练,如果系统支持GPU也可以修改成True使用GPU训练。 + ### 训练数据 + 使用Paddle提供的数据集`dataset.imdb`中的API来读取训练数据。 -``` +```python print 'load dictionary...' word_dict = paddle.dataset.imdb.word_dict() dict_dim = len(word_dict) class_dim = 2 ``` 加载数据字典,这里通过`word_dict()`API可以直接构造字典。`class_dim`是指样本类别数,该示例中样本只有正负两类。 -``` +```python train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), @@ -232,26 +242,32 @@ if __name__ == '__main__': batch_size=100) ``` 这里,`dataset.imdb.train()`和`dataset.imdb.test()`分别是`dataset.imdb`中的训练数据和测试数据API。`train_reader`在训练时使用,意义是将读取的训练数据进行shuffle后,组成一个batch数据。同理,`test_reader`是在测试的时候使用,将读取的测试数据组成一个batch。 +```python + feeding={'word': 0, 'label': 1} ``` - reader_dict={'word': 0, 'label': 1} -``` -`reader_dict`用来指定`train_reader`和`test_reader`返回的数据与模型配置中data_layer的对应关系。这里表示reader返回的第0列数据对应`word`层,第1列数据对应`label`层。 +`feeding`用来指定`train_reader`和`test_reader`返回的数据与模型配置中data_layer的对应关系。这里表示reader返回的第0列数据对应`word`层,第1列数据对应`label`层。 + ### 构造模型 -``` + +```python # Please choose the way to build the network # by uncommenting the corresponding line. 
cost = convolution_net(dict_dim, class_dim=class_dim) # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) ``` 该示例中默认使用`convolution_net`网络,如果使用`stacked_lstm_net`网络,注释相应的行即可。其中cost是网络的优化目标,同时cost包含了整个网络的拓扑信息。 + ### 网络参数 -``` + +```python # create parameters parameters = paddle.parameters.create(cost) ``` 根据网络的拓扑构造网络参数。这里parameters是整个网络的参数集。 + ### 优化算法 -``` + +```python # create optimizer adam_optimizer = paddle.optimizer.Adam( learning_rate=2e-3, @@ -259,9 +275,11 @@ if __name__ == '__main__': model_average=paddle.optimizer.ModelAverage(average_window=0.5)) ``` Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。 + ### 训练 -可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。 -``` + +可以通过`paddle.trainer.SGD`构造一个sgd trainer,并调用`trainer.train`来训练模型。另外,通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。 +```python # End batch and end pass event handler def event_handler(event): if isinstance(event, paddle.event.EndIteration): @@ -272,11 +290,28 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。 sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, reader_dict=reader_dict) + with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=test_reader, feeding=feeding) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` -可以通过给train函数传递一个`event_handler`来获取每个batch和每个pass结束的状态。比如构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。 +比如,构造如下一个`event_handler`可以在每100个batch结束后输出cost和error;在每个pass结束后调用`trainer.test`计算一遍测试集并获得当前模型在测试集上的error。 +```python + from paddle.v2.plot import Ploter + + train_title = "Train cost" + cost_ploter = Ploter(train_title) + step = 0 + def event_handler_plot(event): + global step + if isinstance(event, paddle.event.EndIteration): + cost_ploter.append(train_title, step, event.cost) + cost_ploter.plot() + step += 1 ``` +或者构造一个`event_handler_plot`画出cost曲线。 +```python # create trainer trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, @@ -285,11 +320,11 @@ Paddle中提供了一系列优化算法的API,这里使用Adam优化算法。 trainer.train( reader=train_reader, event_handler=event_handler, - reader_dict=reader_dict, + feeding=feeding, num_passes=2) ``` 程序运行之后的输出如下。 -``` +```text Pass 0, Batch 0, Cost 0.693721, {'classification_error_evaluator': 0.5546875} ................................................................................................... Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625} @@ -297,8 +332,42 @@ Pass 0, Batch 100, Cost 0.294321, {'classification_error_evaluator': 0.1015625} Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252} ``` +## 应用模型 + +可以使用训练好的模型对电影评论进行分类,下面程序展示了如何使用`paddle.infer`接口进行推断。 +```python + import numpy as np + + # Movie Reviews, from imdb test + reviews = [ + 'Read the book, forget the movie!', + 'This is a great movie.' 
+ ] + reviews = [c.split() for c in reviews] + + UNK = word_dict[''] + input = [] + for c in reviews: + input.append([[word_dict.get(words, UNK) for words in c]]) + + # 0 stands for positive sample, 1 stands for negative sample + label = {0:'pos', 1:'neg'} + # Use the network used by trainer + out = convolution_net(dict_dim, class_dim=class_dim, is_predict=True) + # out = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3, is_predict=True) + probs = paddle.infer(output_layer=out, parameters=parameters, input=input) + + labs = np.argsort(-probs) + for idx, lab in enumerate(labs): + print idx, "predicting probability is", probs[idx], "label is", label[lab[0]] +``` + + ## 总结 + 本章我们以情感分析为例,介绍了使用深度学习的方法进行端对端的短文本分类,并且使用PaddlePaddle完成了全部相关实验。同时,我们简要介绍了两种文本处理模型:卷积神经网络和循环神经网络。在后续的章节中我们会看到这两种基本的深度学习模型在其它任务上的应用。 + + ## 参考文献 1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. 2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. @@ -311,4 +380,4 @@ Test with Pass 0, {'classification_error_evaluator': 0.11432000249624252} 9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/06.understand_sentiment/README.md b/06.understand_sentiment/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d976a0c2a9d1c06008b0cf076f5c6bbf1b1d13f3 --- /dev/null +++ b/06.understand_sentiment/README.md @@ -0,0 +1,347 @@ +# Sentiment Analysis + +The source codes of this section is located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment). First-time users may refer to PaddlePaddle for [Installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book). + +## Background + +In natural language processing, sentiment analysis refers to determining the emotion expressed in a piece of text. The text can be a sentence, a paragraph, or a document. Emotion categorization can be binary -- positive/negative or happy/sad -- or in three classes -- positive/neutral/negative. Sentiment analysis is applicable in a wide range of services, such as e-commerce sites like Amazon and Taobao, hospitality services like Airbnb and hotels.com, and movie rating sites like Rotten Tomatoes and IMDB. It can be used to gauge from the reviews how the customers feel about the product. Table 1 illustrates an example of sentiment analysis in movie reviews: + +| Movie Review | Category | +| -------- | ----- | +| Best movie of Xiaogang Feng in recent years!| Positive | +| Pretty bad. Feels like a tv-series from a local TV-channel | Negative | +| Politically correct version of Taken ... and boring as Heck| Negative| +|delightful, mesmerizing, and completely unexpected. The plot is nicely designed.|Positive| + +

Table 1. Sentiment Analysis in Movie Reviews

+ +In natural language processing, sentiment analysis can be categorized as a **Text Classification problem**, i.e., to categorize a piece of text into a specific class. It involves two related tasks: text representation and classification. Before the emergence of deep learning techniques, the mainstream approach combined text representation methods such as BOW (*bag of words*) and topic modeling with classifiers such as SVM (*support vector machine*) and LR (*logistic regression*). + +The BOW model does not capture all the information in a piece of text, as it ignores syntax and grammar and just treats the text as a set of words. For example, “this movie is extremely bad” and “boring, dull, and empty work” describe very similar semantic meaning, yet their BOW representations have very little similarity. Furthermore, “the movie is bad” and “the movie is not bad” have high similarity with BOW features, but they express completely opposite semantics. + +This chapter introduces a deep learning model that handles these issues of BOW. Our model embeds texts into a low-dimensional space and takes word order into consideration. It is an end-to-end framework, and it achieves a large performance improvement over traditional methods \[[1](#references)\]. + +## Model Overview + +The models used in this chapter are **Convolutional Neural Networks** (**CNNs**) and **Recurrent Neural Networks** (**RNNs**) with some specific extensions. + + +### Revisiting Convolutional Neural Networks for Texts (CNN) + +The convolutional neural network for texts is introduced in chapter [recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system); here is a brief overview. + +CNN mainly consists of convolution and pooling operations, with versatile combinations in various applications. We first apply the convolution operation: we apply the kernel in each window, extracting features. Convolving by the kernel at every window produces a feature map. Next, we apply *max pooling* over time to represent the whole sentence, which is the maximum element across the feature map. In real applications, we will apply multiple CNN kernels on the sentences. This can be implemented efficiently by concatenating kernels of the same window size into a matrix. Also, we can use CNN kernels with different kernel sizes. Finally, concatenating the resulting features produces a fixed-length representation, which can be combined with a softmax to form the model for the sentiment analysis problem. + +For short texts, the aforementioned CNN model can achieve very high accuracy \[[1](#references)\]. If we want to extract more abstract representations, we may apply a deeper CNN model \[[2](#references),[3](#references)\]. + +### Recurrent Neural Network (RNN) + +RNN is an effective model for sequential data. In terms of computability, the RNN is Turing-complete \[[4](#references)\]. Since natural language is a classical example of sequential data, the RNN, especially its variant LSTM\[[5](#references)\], achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS-tagging, image captioning, dialog, machine translation, and so forth. +

+
+Figure 1. An illustration of an unfolded RNN in time. +

+ +As shown in Figure 1, we unfold an RNN: at the $t$-th time step, the network takes two inputs: the $t$-th input vector $\vec{x_t}$ and the latent state from the last time-step $\vec{h_{t-1}}$. From those, it computes the latent state of the current step $\vec{h_t}$. This process is repeated until all inputs are consumed. Denoting the RNN as function $f$, it can be formulated as follows: + +$$\vec{h_t}=f(\vec{x_t},\vec{h_{t-1}})=\sigma(W_{xh}\vec{x_t}+W_{hh}\vec{h_{t-1}}+\vec{b_h})$$ + +where $W_{xh}$ is the weight matrix to feed into the latent layer; $W_{hh}$ is the latent-to-latent matrix; $b_h$ is the latent bias, and $\sigma$ refers to the $sigmoid$ function. + +In NLP, words are often represented as one-hot vectors and then mapped to an embedding. The embedded feature goes through an RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of RNN, such as a deep or stacked RNN. Finally, the last latent state may be used as a feature for sentence classification. + +### Long-Short Term Memory (LSTM) + +Training an RNN on long sequential data sometimes leads to the gradient vanishing or exploding\[[6](#references)\]. To solve this problem, Hochreiter S, Schmidhuber J. (1997) proposed **Long Short Term Memory** (LSTM)\[[5](#references)\]. + +Compared to the structure of a simple RNN, an LSTM includes memory cell $c$, input gate $i$, forget gate $f$ and output gate $o$. These gates and memory cells dramatically improve the network's ability to handle long sequences. We can formulate the **LSTM-RNN**, denoted as a function $F$, as follows: + +$$ h_t=F(x_t,h_{t-1})$$ + +$F$ consists of the following formulations\[[7](#references)\]: +\begin{align} +i_t & = \sigma(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)\\\\ +f_t & = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f)\\\\ +c_t & = f_t\odot c_{t-1}+i_t\odot \tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c)\\\\ +o_t & = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o)\\\\ +h_t & = o_t\odot \tanh(c_t)\\\\ +\end{align} + +In the equation, $i_t, f_t, c_t, o_t$ stand for input gate, forget gate, memory cell and output gate, respectively. $W$ and $b$ are model parameters, $\tanh$ is a hyperbolic tangent, and $\odot$ denotes an element-wise product operation. The input gate controls the magnitude of the new input into the memory cell $c$; the forget gate controls the memory propagated from the last time step; the output gate controls the magnitude of the output. The three gates are computed similarly with different parameters, and they influence memory cell $c$ separately, as shown in Figure 2: + 

+
+Figure 2. LSTM at time step $t$ [7]. +

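+
+To make the formulation above concrete, here is a minimal NumPy sketch of a single LSTM step. Everything here (the function name, how the weights are packed) is an illustrative assumption rather than PaddlePaddle's API; `paddle.layer.lstmemory`, used later in this chapter, encapsulates this computation.
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def lstm_step(x_t, h_prev, c_prev, W, b):
+    # Each W[k] acts on the concatenated [x_t, h_{t-1}]; W['ci'], W['cf'],
+    # and W['co'] are the element-wise peephole connections to the cell.
+    z = np.concatenate([x_t, h_prev])
+    i = sigmoid(W['i'].dot(z) + W['ci'] * c_prev + b['i'])  # input gate
+    f = sigmoid(W['f'].dot(z) + W['cf'] * c_prev + b['f'])  # forget gate
+    c = f * c_prev + i * np.tanh(W['c'].dot(z) + b['c'])    # memory cell
+    o = sigmoid(W['o'].dot(z) + W['co'] * c + b['o'])       # output gate
+    return o * np.tanh(c), c                                # h_t and c_t
+
+# Run the recurrence h_t = F(x_t, h_{t-1}) over a toy sequence.
+d_in, d_h = 4, 3
+rng = np.random.RandomState(0)
+W = {k: rng.randn(d_h, d_in + d_h) * 0.1 for k in 'ifco'}
+W.update({k: rng.randn(d_h) * 0.1 for k in ('ci', 'cf', 'co')})
+b = {k: np.zeros(d_h) for k in 'ifco'}
+h = c = np.zeros(d_h)
+for t in range(5):
+    h, c = lstm_step(rng.randn(d_in), h, c, W, b)
+```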
+ With the help of the memory cell and gates, LSTM enhances the network's ability to capture long-term dependencies. Similar structures have also been proposed, such as the Gated Recurrent Unit (GRU)\[[8](#references)\], with a simpler design. **The structures are still similar to a simple RNN, though with some modifications (as shown in Figure 1): the latent state depends on the input as well as the latent state of the last time step, and the process goes on recurrently until all inputs are consumed:** + +$$ h_t=Recurrent(x_t,h_{t-1})$$ +where $Recurrent$ is a simple RNN, GRU or LSTM. + +### Stacked Bidirectional LSTM + +For a forward-direction LSTM, $h_t$ only contains information from the preceding context at time steps $1 \ldots t-1$. We can also apply an RNN in the reverse direction to take the succeeding context $t+1 \ldots n$ into consideration. Combined with the construction of deep RNNs (a deeper RNN can capture more abstract, higher-level semantics), we can design a deep stacked bidirectional LSTM to model sequential data\[[9](#references)\]. + +As shown in Figure 3 (a 3-layer example), odd-numbered layers are forward LSTMs and even-numbered layers are reverse LSTMs. Higher layers of LSTM take the outputs of lower layers as input, and the top-layer LSTM produces a fixed-length vector by max-pooling over time (this representation considers contexts from previous and successive words for higher-level abstractions). Finally, we connect this representation to a softmax layer for classification. +

+
+Figure 3. Stacked Bidirectional LSTM for NLP modeling. +

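+
+Before turning to the data, here is a schematic NumPy sketch of this alternating arrangement. It is only a sketch under simplifying assumptions: a plain RNN step stands in for the LSTM, and the connections from all lower layers are reduced to the immediately preceding layer; the real configuration is the PaddlePaddle code later in this chapter.
+
+```python
+import numpy as np
+
+def rnn_layer(seq, Wx, Wh, reverse=False):
+    # A plain RNN stands in for the LSTM; only the wiring pattern matters.
+    h, out = np.zeros(Wh.shape[0]), []
+    for x in (seq[::-1] if reverse else seq):  # reverse layers read backwards
+        h = np.tanh(Wx.dot(x) + Wh.dot(h))
+        out.append(h)
+    return out[::-1] if reverse else out       # restore the time order
+
+T, d = 6, 8
+rng = np.random.RandomState(0)
+seq = [rng.randn(d) for _ in range(T)]
+for layer in range(1, 4):                      # 3 layers: forward, reverse, forward
+    seq = rnn_layer(seq, rng.randn(d, d) * 0.1, rng.randn(d, d) * 0.1,
+                    reverse=(layer % 2 == 0))
+
+# Max-pooling over time on the top layer yields the fixed-length sentence vector.
+sentence_vec = np.max(np.stack(seq), axis=0)
+```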
+ +## Dataset + +We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for sentiment analysis in this tutorial, which consists of 50,000 movie reviews split evenly into a 25k train set and a 25k test set. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10. + +`paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens`, and `wmt14`, etc. There's no need for us to manually download and preprocess IMDB. + +After issuing a command `python train.py`, training will start immediately. The details will be unpacked by the following sessions to see how it works. + + +## Model Structure + +### Initialize PaddlePaddle + +We must import and initialize PaddlePaddle (enable/disable GPU, set the number of trainers, etc). + +```python +import sys +import paddle.v2 as paddle + +# PaddlePaddle init +paddle.init(use_gpu=False, trainer_count=1) +``` + +As alluded to in section [Model Overview](#model-overview), here we provide the implementations of both Text CNN and Stacked-bidirectional LSTM models. + +### Text Convolution Neural Network (Text CNN) + +We create a neural network `convolution_net` as the following snippet code. + +Note: `paddle.networks.sequence_conv_pool` includes both convolution and pooling layer operations. + +```python +def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + conv_3 = paddle.networks.sequence_conv_pool( + input=emb, context_len=3, hidden_size=hid_dim) + conv_4 = paddle.networks.sequence_conv_pool( + input=emb, context_len=4, hidden_size=hid_dim) + output = paddle.layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=paddle.activation.Softmax()) + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost, output +``` + +1. Define input data and its dimension + + Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `convolution_net`, the input to the network is defined in `paddle.layer.data`. + +1. Define Classifier + + The above Text CNN network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. + +1. Define Loss Function + + In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. + +#### Stacked bidirectional LSTM + +We create a neural network `stacked_lstm_net` as below. + +```python +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3): + """ + A Wrapper for sentiment classification task. + This network uses a bi-directional recurrent network, + consisting of three LSTM layers. This configuration is + motivated from the following paper, but uses few layers. + http://www.aclweb.org/anthology/P15-1109 + input_dim: here is word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. 
+ stacked_num: number of stacked lstm-hidden layer. + """ + assert stacked_num % 2 == 1 + + fc_para_attr = paddle.attr.Param(learning_rate=1e-3) + lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) + relu = paddle.activation.Relu() + linear = paddle.activation.Linear() + + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + + fc1 = paddle.layer.fc(input=emb, + size=hid_dim, + act=linear, + bias_attr=bias_attr) + lstm1 = paddle.layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = paddle.layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = paddle.layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr) + inputs = [fc, lstm] + + fc_last = paddle.layer.pooling( + input=inputs[0], pooling_type=paddle.pooling.Max()) + lstm_last = paddle.layer.pooling( + input=inputs[1], pooling_type=paddle.pooling.Max()) + output = paddle.layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=paddle.activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost, output +``` + +1. Define input data and its dimension + + Parameter `input_dim` denotes the dictionary size, and `class_dim` is the number of categories. In `stacked_lstm_net`, the input to the network is defined in `paddle.layer.data`. + +1. Define Classifier + + The above stacked bidirectional LSTM network extracts high-level features and maps them to a vector of the same size as the categories. `paddle.activation.Softmax` function or classifier is then used for calculating the probability of the sentence belonging to each category. + +1. Define Loss Function + + In the context of supervised learning, labels of the training set are defined in `paddle.layer.data`, too. During training, cross-entropy is used as loss function in `paddle.layer.classification_cost` and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. + + +To reiterate, we can either invoke `convolution_net` or `stacked_lstm_net`. + +```python +word_dict = paddle.dataset.imdb.word_dict() +dict_dim = len(word_dict) +class_dim = 2 + +# option 1 +[cost, output] = convolution_net(dict_dim, class_dim=class_dim) +# option 2 +# [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) +``` + +## Model Training + +### Define Parameters + +First, we create the model parameters according to the previous model configuration `cost`. + +```python +# create parameters +parameters = paddle.parameters.create(cost) +``` + +### Create Trainer + +Before jumping into creating a training module, algorithm setting is also necessary. +Here we specified `Adam` optimization algorithm via `paddle.optimizer`. 
+ +```python +# create optimizer +adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + +# create trainer +trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) +``` + +### Training + +`paddle.dataset.imdb.train()` will yield records during each pass, after shuffling, a batch input is generated for training. + +```python +train_reader = paddle.batch( + paddle.reader.shuffle( + lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=100) + +test_reader = paddle.batch( + lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) +``` + +`feeding` is devoted to specifying the correspondence between each yield record and `paddle.layer.data`. For instance, the first column of data generated by `paddle.dataset.imdb.train()` corresponds to `word` feature. + +```python +feeding = {'word': 0, 'label': 1} +``` + +Callback function `event_handler` will be invoked to track training progress when a pre-defined event happens. + +```python +def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=test_reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) +``` + +Finally, we can invoke `trainer.train` to start training: + +```python +trainer.train( + reader=train_reader, + event_handler=event_handler, + feeding=feeding, + num_passes=10) +``` + + +## Conclusion + +In this chapter, we use sentiment analysis as an example to introduce applying deep learning models on end-to-end short text classification, as well as how to use PaddlePaddle to implement the model. Meanwhile, we briefly introduce two models for text processing: CNN and RNN. In following chapters, we will see how these models can be applied in other tasks. + +## References + +1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014. +2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modeling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014. +3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016. +4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449. +5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780. +6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166. +7. Graves A. 
[Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013. +8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014. +9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/understand_sentiment/image/lstm.png b/06.understand_sentiment/image/lstm.png similarity index 100% rename from understand_sentiment/image/lstm.png rename to 06.understand_sentiment/image/lstm.png diff --git a/understand_sentiment/image/lstm_en.png b/06.understand_sentiment/image/lstm_en.png similarity index 100% rename from understand_sentiment/image/lstm_en.png rename to 06.understand_sentiment/image/lstm_en.png diff --git a/understand_sentiment/image/rnn.png b/06.understand_sentiment/image/rnn.png similarity index 100% rename from understand_sentiment/image/rnn.png rename to 06.understand_sentiment/image/rnn.png diff --git a/understand_sentiment/image/stacked_lstm.jpg b/06.understand_sentiment/image/stacked_lstm.jpg similarity index 100% rename from understand_sentiment/image/stacked_lstm.jpg rename to 06.understand_sentiment/image/stacked_lstm.jpg diff --git a/understand_sentiment/image/stacked_lstm_en.png b/06.understand_sentiment/image/stacked_lstm_en.png similarity index 100% rename from understand_sentiment/image/stacked_lstm_en.png rename to 06.understand_sentiment/image/stacked_lstm_en.png diff --git a/06.understand_sentiment/index.cn.html b/06.understand_sentiment/index.cn.html new file mode 100644 index 0000000000000000000000000000000000000000..5c5343caf24df2c194224f09bce2c91b636bc7e6 --- /dev/null +++ b/06.understand_sentiment/index.cn.html @@ -0,0 +1,447 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/06.understand_sentiment/index.html b/06.understand_sentiment/index.html new file mode 100644 index 0000000000000000000000000000000000000000..1f23d4abcd20a3cf9cad08671381e2d278511b2c --- /dev/null +++ b/06.understand_sentiment/index.html @@ -0,0 +1,411 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/understand_sentiment/train.py b/06.understand_sentiment/train.py similarity index 62% rename from understand_sentiment/train.py rename to 06.understand_sentiment/train.py index 8641535ed20d9a89ff6e8d183106ccfdf0732232..58f61700c682b9c8210aba4ea9700cd1ddd76976 100644 --- a/understand_sentiment/train.py +++ b/06.understand_sentiment/train.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import paddle.trainer_config_helpers.attrs as attrs -from paddle.trainer_config_helpers.poolings import MaxPooling +import sys, os import paddle.v2 as paddle +with_gpu = os.getenv('WITH_GPU', '0') != '0' + def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): data = paddle.layer.data("word", @@ -26,12 +26,11 @@ def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): input=emb, context_len=3, hidden_size=hid_dim) conv_4 = paddle.networks.sequence_conv_pool( input=emb, context_len=4, hidden_size=hid_dim) - output = paddle.layer.fc(input=[conv_3, conv_4], - size=class_dim, - act=paddle.activation.Softmax()) + output = paddle.layer.fc( + input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax()) lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost + return cost, output def stacked_lstm_net(input_dim, @@ -54,11 +53,10 @@ def stacked_lstm_net(input_dim, """ assert stacked_num % 2 == 1 - layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5) - fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3) - lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.) + fc_para_attr = paddle.attr.Param(learning_rate=1e-3) + lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) para_attr = [fc_para_attr, lstm_para_attr] - bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.) + bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) 
relu = paddle.activation.Relu() linear = paddle.activation.Linear() @@ -66,44 +64,41 @@ def stacked_lstm_net(input_dim, paddle.data_type.integer_value_sequence(input_dim)) emb = paddle.layer.embedding(input=data, size=emb_dim) - fc1 = paddle.layer.fc(input=emb, - size=hid_dim, - act=linear, - bias_attr=bias_attr) - lstm1 = paddle.layer.lstmemory( - input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + fc1 = paddle.layer.fc( + input=emb, size=hid_dim, act=linear, bias_attr=bias_attr) + lstm1 = paddle.layer.lstmemory(input=fc1, act=relu, bias_attr=bias_attr) inputs = [fc1, lstm1] for i in range(2, stacked_num + 1): - fc = paddle.layer.fc(input=inputs, - size=hid_dim, - act=linear, - param_attr=para_attr, - bias_attr=bias_attr) + fc = paddle.layer.fc( + input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) lstm = paddle.layer.lstmemory( - input=fc, - reverse=(i % 2) == 0, - act=relu, - bias_attr=bias_attr, - layer_attr=layer_attr) + input=fc, reverse=(i % 2) == 0, act=relu, bias_attr=bias_attr) inputs = [fc, lstm] - fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling()) - lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling()) - output = paddle.layer.fc(input=[fc_last, lstm_last], - size=class_dim, - act=paddle.activation.Softmax(), - bias_attr=bias_attr, - param_attr=para_attr) + fc_last = paddle.layer.pooling( + input=inputs[0], pooling_type=paddle.pooling.Max()) + lstm_last = paddle.layer.pooling( + input=inputs[1], pooling_type=paddle.pooling.Max()) + output = paddle.layer.fc( + input=[fc_last, lstm_last], + size=class_dim, + act=paddle.activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) cost = paddle.layer.classification_cost(input=output, label=lbl) - return cost + return cost, output if __name__ == '__main__': # init - paddle.init(use_gpu=False) + paddle.init(use_gpu=with_gpu) #data print 'load dictionary...' @@ -112,18 +107,18 @@ if __name__ == '__main__': class_dim = 2 train_reader = paddle.batch( paddle.reader.shuffle( - lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), + paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=100) test_reader = paddle.batch( - lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) + paddle.dataset.imdb.test(word_dict), batch_size=100) - reader_dict = {'word': 0, 'label': 1} + feeding = {'word': 0, 'label': 1} # network config # Please choose the way to build the network # by uncommenting the corresponding line. 
- cost = convolution_net(dict_dim, class_dim=class_dim) - # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) + [cost, output] = convolution_net(dict_dim, class_dim=class_dim) + # [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) # create parameters parameters = paddle.parameters.create(cost) @@ -134,6 +129,10 @@ if __name__ == '__main__': regularization=paddle.optimizer.L2Regularization(rate=8e-4), model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + # create trainer + trainer = paddle.trainer.SGD( + cost=cost, parameters=parameters, update_equation=adam_optimizer) + # End batch and end pass event handler def event_handler(event): if isinstance(event, paddle.event.EndIteration): @@ -144,16 +143,19 @@ if __name__ == '__main__': sys.stdout.write('.') sys.stdout.flush() if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader, reader_dict=reader_dict) + with open('./params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=test_reader, feeding=feeding) print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - # create trainer - trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=adam_optimizer) + # Save the inference topology to protobuf. + inference_topology = paddle.topology.Topology(layers=output) + with open("./inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) trainer.train( reader=train_reader, event_handler=event_handler, - reader_dict=reader_dict, - num_passes=2) + feeding=feeding, + num_passes=20) diff --git a/label_semantic_roles/.gitignore b/07.label_semantic_roles/.gitignore similarity index 100% rename from label_semantic_roles/.gitignore rename to 07.label_semantic_roles/.gitignore diff --git a/label_semantic_roles/README.md b/07.label_semantic_roles/README.cn.md similarity index 72% rename from label_semantic_roles/README.md rename to 07.label_semantic_roles/README.cn.md index 0bead4228652b1b781a9bfe5beccc3c483928ad4..9555e42fef85ea49bf64c9e7cb47d8867538b57a 100644 --- a/label_semantic_roles/README.md +++ b/07.label_semantic_roles/README.cn.md @@ -1,6 +1,6 @@ # 语义角色标注 -本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 +本教程源代码目录在[book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/178.html)。 ## 背景介绍 @@ -40,7 +40,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb ## 模型概览 -循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 +循环神经网络(Recurrent Neural Network)是一种对序列建模的重要模型,在自然语言处理任务中有着广泛地应用。不同于前馈神经网络(Feed-forward Neural Network),RNN能够处理输入之间前后关联的问题。LSTM是RNN的一种重要变种,常用来学习长序列中蕴含的长程依赖关系,我们在[情感分析](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment)一篇中已经介绍过,这一篇中我们依然利用LSTM来解决SRL问题。 ### 栈式循环神经网络(Stacked Recurrent Neural Network) @@ -52,7 +52,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb 图3是最终得到的栈式循环神经网络结构示意图。 -

+


图3. 基于LSTM的栈式循环神经网络结构示意图

@@ -63,12 +63,12 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb 为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。 -

+


图4. 基于LSTM的双向循环神经网络结构示意图

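+
+下面用NumPy对这种“正向、反向交替”的堆叠方式做一个可运行的最小示意:用简单RNN代替LSTM,并省略了层间的线性变换与直接连接,函数名与形状均为本文假设;真实配置见后文`paddle.layer.lstmemory`(其`reverse`参数控制方向)的代码。
+
+```python
+import numpy as np
+
+def rnn_layer(seq, Wx, Wh, reverse=False):
+    # 用简单RNN代替LSTM,只演示“逐层交替换向”的连接方式
+    h, out = np.zeros(Wh.shape[0]), []
+    for x in (seq[::-1] if reverse else seq):  # 反向层将输入逆序处理
+        h = np.tanh(Wx.dot(x) + Wh.dot(h))
+        out.append(h)
+    return out[::-1] if reverse else out       # 恢复原始时间顺序
+
+T, d, depth = 5, 8, 4
+rng = np.random.RandomState(0)
+seq = [rng.randn(d) for _ in range(T)]
+for i in range(depth):                          # 正向、反向、正向……交替堆叠
+    seq = rnn_layer(seq, rng.randn(d, d) * 0.1, rng.randn(d, d) * 0.1,
+                    reverse=(i % 2 == 1))
+# 从第2层起,seq 中每个时刻的状态同时融合了上文与下文信息,
+# 可以逐词送入后续的CRF层做序列标注
+```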
-需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.md)任务中,介绍另一种双向循环神经网络。 +需要说明的是,这种双向RNN结构和Bengio等人在机器翻译任务中使用的双向RNN结构\[[3](#参考文献), [4](#参考文献)\] 并不相同,我们会在后续[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)任务中,介绍另一种双向循环神经网络。 ### 条件随机场 (Conditional Random Field) @@ -78,7 +78,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型 序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。 -

+


图5. 序列标注任务中使用的线性链条件随机场

@@ -93,7 +93,7 @@ $$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ $\omega$是特征函数对应的权值,是CRF模型要学习的参数。训练时,对于给定的输入序列和对应的标记序列集合$D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$ ,通过正则化的极大似然估计,求解如下优化目标: -$$L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ +$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ 这个优化目标可以通过反向传播算法和整个神经网络一起求解。解码时,对于给定的输入序列$X$,通过解码算法(通常有:维特比算法、Beam Search)求令出条件概率$\bar{P}(Y|X)$最大的输出序列 $\bar{Y}$。 @@ -122,7 +122,7 @@ $$L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \fra 3. 第2步的4个词向量序列作为双向LSTM模型的输入;LSTM模型学习输入序列的特征表示,得到新的特性表示序列; 4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注; -
+

图6. SRL任务上的深层双向LSTM模型
@@ -161,7 +161,7 @@ conll05st-release/ 预处理完成之后一条训练样本包含9个特征,分别是:句子序列、谓词、谓词上下文(占 5 列)、谓词上下区域标志、标注序列。下表是一条训练样本的示例。 -| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | +| 句子序列 | 谓词 | 谓词上下文(窗口 = 5) | 谓词上下文区域标记 | 标注序列 | |---|---|---|---|---| | A | set | n't been set . × | 0 | B-A1 | | record | set | n't been set . × | 0 | I-A1 | @@ -182,143 +182,152 @@ conll05st-release/ | predicate_dict | 谓词的词典,共计3162个词 | | emb | 一个训练好的词表,32维 | -我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 +我们在英文维基百科上训练语言模型得到了一份词向量用来初始化SRL模型。在SRL模型训练过程中,词向量不再被更新。关于语言模型和词向量可以参考[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) 这篇教程。我们训练语言模型的语料共有995,000,000个token,词典大小控制为4900,000词。CoNLL 2005训练语料中有5%的词不在这4900,000个词中,我们将它们全部看作未登录词,用``表示。 获取词典,打印词典大小: ```python +import math +import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.evaluator as evaluator + +paddle.init(use_gpu=False, trainer_count=1) word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) label_dict_len = len(label_dict) pred_len = len(verb_dict) -print len(word_dict_len) -print len(label_dict_len) -print len(pred_len) +print word_dict_len +print label_dict_len +print pred_len ``` ## 模型配置说明 -1. 定义输入数据维度及模型超参数。 - - ```python - mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 - word_dim = 32 # 词向量维度 - mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 - hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 - depth = 8 # 栈式LSTM的深度 - - # 一条样本总共9个特征,下面定义了9个data层,每个层类型为integer_value_sequence,表示整数ID的序列类型. - def d_type(size): - return paddle.data_type.integer_value_sequence(size) - - # 句子序列 - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - # 谓词 - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - # 谓词上下文5个特征 - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - - # 谓词上下区域标志 - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - # 标注序列 - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - ``` - - 这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 - -2. 将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。 - - ```python - - # 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True - # is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新 - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) - # 设置超参数 - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - std_default = paddle.attr.Param(initial_std=default_std) - std_0 = paddle.attr.Param(initial_std=0.) 
- - predicate_embedding = paddle.layer.embedding( - size=word_dim, - input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) - mark_embedding = paddle.layer.embedding( - size=mark_dim, input=mark, param_attr=std_0) - - word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] - emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input - ] - emb_layers.append(predicate_embedding) - emb_layers.append(mark_embedding) - ``` - -3. 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 - - ```python - hidden_0 = paddle.layer.mixed( +- 定义输入数据维度及模型超参数。 + +```python +mark_dict_len = 2 # 谓上下文区域标志的维度,是一个0-1 2值特征,因此维度为2 +word_dim = 32 # 词向量维度 +mark_dim = 5 # 谓词上下文区域通过词表被映射为一个实向量,这个是相邻的维度 +hidden_dim = 512 # LSTM隐层向量的维度 : 512 / 4 +depth = 8 # 栈式LSTM的深度 + +# 一条样本总共9个特征,下面定义了9个data层,每个层类型为integer_value_sequence,表示整数ID的序列类型. +def d_type(size): + return paddle.data_type.integer_value_sequence(size) + +# 句子序列 +word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) +# 谓词 +predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + +# 谓词上下文5个特征 +ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) +ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) +ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) +ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) +ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + +# 谓词上下区域标志 +mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + +# 标注序列 +target = paddle.layer.data(name='target', type=d_type(label_dict_len)) +``` + +这里需要特别说明的是hidden_dim = 512指定了LSTM隐层向量的维度为128维,关于这一点请参考PaddlePaddle官方文档中[lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)的说明。 + +- 将句子序列、谓词、谓词上下文、谓词上下文区域标记通过词表,转换为实向量表示的词向量序列。 + +```python + +# 在本教程中,我们加载了预训练的词向量,这里设置了:is_static=True +# is_static 为 True 时保证了在训练 SRL 模型过程中,词表不再更新 +emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) +# 设置超参数 +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +std_default = paddle.attr.Param(initial_std=default_std) +std_0 = paddle.attr.Param(initial_std=0.) 
+ +predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) +mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + +word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] +emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input +] +emb_layers.append(predicate_embedding) +emb_layers.append(mark_embedding) +``` + +- 8个LSTM单元以“正向/反向”的顺序对所有输入序列进行学习。 + +```python +hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + +mix_hidden_lr = 1e-3 +lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) +hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + +lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + +#stack L-LSTM and R-LSTM with direct edges +input_tmp = [hidden_0, lstm_0] + +for i in range(1, depth): + mix_hidden = paddle.layer.mixed( size=hidden_dim, bias_attr=std_default, input=[ paddle.layer.full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) ]) - mix_hidden_lr = 1e-3 - lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) - hidden_para_attr = paddle.attr.Param( - initial_std=default_std, learning_rate=mix_hidden_lr) - - lstm_0 = paddle.layer.lstmemory( - input=hidden_0, + lstm = paddle.layer.lstmemory( + input=mix_hidden, act=paddle.activation.Relu(), gate_act=paddle.activation.Sigmoid(), state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), bias_attr=std_0, param_attr=lstm_para_attr) - #stack L-LSTM and R-LSTM with direct edges - input_tmp = [hidden_0, lstm_0] - - for i in range(1, depth): - mix_hidden = paddle.layer.mixed( - size=hidden_dim, - bias_attr=std_default, - input=[ - paddle.layer.full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - paddle.layer.full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = paddle.layer.lstmemory( - input=mix_hidden, - act=paddle.activation.Relu(), - gate_act=paddle.activation.Sigmoid(), - state_act=paddle.activation.Sigmoid(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - ``` - -4. 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射,经过一个全连接层映射到标记字典的维度,得到最终的特征向量表示。 - - ```python - feature_out = paddle.layer.mixed( + input_tmp = [mix_hidden, lstm] +``` + +- 在PaddlePaddle中,CRF的状态特征和转移特征分别由一个全连接层和一个PaddlePaddle中的CRF层分别学习。在这个例子中,我们用线性激活的paddle.layer.mixed 来学习CRF的状态特征(也可以使用paddle.layer.fc),而 paddle.layer.crf只学习转移特征。paddle.layer.crf层是一个 cost 层,处于整个网络的末端,输出给定输入序列下,标记序列的log probability作为代价。训练阶段,该层需要输入正确的标记序列作为学习目标。 + +```python + +# 取最后一个栈式LSTM的输出和这个LSTM单元的输入到隐层映射, +# 经过一个全连接层映射到标记字典的维度,来学习 CRF 的状态特征 + +feature_out = paddle.layer.mixed( size=label_dict_len, bias_attr=std_default, input=[ @@ -327,31 +336,28 @@ print len(pred_len) paddle.layer.full_matrix_projection( input=input_tmp[1], param_attr=lstm_para_attr) ], ) - ``` - -5. 
网络的末端定义CRF层计算损失(cost),指定参数名字为 `crfw`,该层需要输入正确的数据标签(target)。 - - ```python - crf_cost = paddle.layer.crf( - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - ``` - -6. CRF译码层和CRF层参数名字相同,即共享权重。如果输入了正确的数据标签(target),会统计错误标签的个数,可以用来评估模型。如果没有输入正确的数据标签,该层可以推到出最优解,可以用来预测模型。 - - ```python - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - ``` + +# 学习 CRF 的转移特征 +crf_cost = paddle.layer.crf( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) +``` + +- CRF解码和CRF层参数名字相同,即:加载了`paddle.layer.crf`层学习到的参数。在训练阶段,为`paddle.layer.crf_decoding` 输入了正确的标记序列(target),这一层会输出是否正确标记,`evaluator.sum` 用来计算序列上的标记错误率,可以用来评估模型。解码阶段,没有输入正确的数据标签,该层通过寻找概率最高的标记序列,解码出标记结果。 + +```python +crf_dec = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) +evaluator.sum(input=crf_dec) +``` ## 训练模型 @@ -361,11 +367,11 @@ print len(pred_len) ```python # create parameters -parameters = paddle.parameters.create([crf_cost, crf_dec]) +parameters = paddle.parameters.create(crf_cost) ``` 可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 - + ```python print parameters.keys() ``` @@ -376,8 +382,8 @@ print parameters.keys() # 这里加载PaddlePaddle上版保存的二进制模型 def load_parameter(file_name, h, w): with open(file_name, 'rb') as f: - f.read(16) - return np.fromfile(f, dtype=np.float32).reshape(h, w) + f.read(16) + return np.fromfile(f, dtype=np.float32).reshape(h, w) parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) ``` @@ -389,14 +395,15 @@ parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) # create optimizer optimizer = paddle.optimizer.Momentum( momentum=0, - learning_rate=2e-2, + learning_rate=1e-3, regularization=paddle.optimizer.L2Regularization(rate=8e-4), model_average=paddle.optimizer.ModelAverage( average_window=0.5, max_average_window=10000), ) trainer = paddle.trainer.SGD(cost=crf_cost, parameters=parameters, - update_equation=optimizer) + update_equation=optimizer, + extra_layers=crf_dec) ``` ### 训练 @@ -406,7 +413,7 @@ trainer = paddle.trainer.SGD(cost=crf_cost, ```python reader = paddle.batch( paddle.reader.shuffle( - conll05.test(), buf_size=8192), batch_size=20) + conll05.test(), buf_size=8192), batch_size=2) ``` 通过`feeding`来指定每一个数据和data_layer的对应关系。 例如 下面`feeding`表示: `conll05.test()`产生数据的第0列对应`word_data`层的特征。 @@ -431,9 +438,20 @@ feeding = { ```python def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + if event.batch_id and event.batch_id % 10 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 400 == 0: + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) ``` 通过`trainer.train`函数训练: @@ -442,10 
+460,45 @@ def event_handler(event): trainer.train( reader=reader, event_handler=event_handler, - num_passes=10000, + num_passes=1, feeding=feeding) ``` +### 应用模型 + +训练完成之后,需要依据某个我们关心的性能指标选择最优的模型进行预测,可以简单的选择测试集上标记错误最少的那个模型。预测时使用 `paddle.layer.crf_decoding`,和训练不同的是,该层没有正确的标签层作为输入。如下所示: + +```python +predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) +``` + +这里选用测试集的一条数据作为示例。 + +```python +test_creator = paddle.dataset.conll05.test() +test_data = [] +for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break +``` + +推断接口`paddle.infer`返回标签的索引,并查询词典`labels_reverse`,打印出标记的结果。 + +```python +labs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data, field='id') +assert len(labs) == len(test_data[0][0]) +labels_reverse={} +for (k,v) in label_dict.items(): + labels_reverse[v]=k +pre_lab = [labels_reverse[i] for i in labs] +print pre_lab +``` + ## 总结 语义角色标注是许多自然语言理解任务的重要中间步骤。这篇教程中我们以语义角色标注任务为例,介绍如何利用PaddlePaddle进行序列标注任务。教程中所介绍的模型来自我们发表的论文\[[10](#参考文献)\]。由于 CoNLL 2005 SRL任务的训练数据目前并非完全开放,教程中只使用测试数据作为示例。在这个过程中,我们希望减少对其它自然语言处理工具的依赖,利用神经网络数据驱动、端到端学习的能力,得到一个和传统方法可比、甚至更好的模型。在论文中我们证实了这种可能性。关于模型更多的信息和讨论可以在论文中找到。 @@ -463,4 +516,4 @@ trainer.train( 10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
-知识共享许可协议
本教程由 PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 +知识共享许可协议
本教程由 PaddlePaddle 创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。
diff --git a/07.label_semantic_roles/README.md b/07.label_semantic_roles/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d3cf39c57b30c2c9a0770f5ee9cba84692668055
--- /dev/null
+++ b/07.label_semantic_roles/README.md
@@ -0,0 +1,538 @@
+# Semantic Role Labeling
+
+The source code of this chapter is located at [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
+
+For instructions on getting started with PaddlePaddle, see the [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+
+## Background
+
+Natural language analysis techniques consist of lexical, syntactic, and semantic analysis. **Semantic Role Labeling (SRL)** is an instance of **Shallow Semantic Analysis**.
+
+In a sentence, a **predicate** states a property or a characterization of a *subject*, such as what it does and what it is like. The predicate represents the core of an event, whereas the words accompanying the predicate are **arguments**. A **semantic role** refers to the abstract role an argument of a predicate takes on in the event, including *agent*, *patient*, *theme*, *experiencer*, *beneficiary*, *instrument*, *location*, *goal*, and *source*.
+
+In the following example of a Chinese sentence, "to encounter" is the predicate (*pred*); "Ming" is the *agent*; "Hong" is the *patient*; "yesterday" and "evening" are the *time*; finally, "the park" is the *location*.
+
+$$\mbox{[小明 Ming]}_{\mbox{Agent}}\mbox{[昨天 yesterday]}_{\mbox{Time}}\mbox{[晚上 evening]}_\mbox{Time}\mbox{在[公园 a park]}_{\mbox{Location}}\mbox{[遇到 to encounter]}_{\mbox{Predicate}}\mbox{了[小红 Hong]}_{\mbox{Patient}}\mbox{。}$$
+
+Instead of analyzing the semantic information directly, **Semantic Role Labeling** (**SRL**) identifies the relationship between the predicate and the other constituents surrounding it; the predicate-argument structures are labeled as specific semantic roles. SRL is an important intermediate step in a wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deep QA*. Research usually assumes the predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
+
+Conventional SRL systems mostly build on top of syntactic analysis, usually consisting of five steps:
+
+1. Construct a syntax tree, as shown in Fig. 1.
+2. Identify the candidate arguments of the given predicate on the tree.
+3. Prune the most unlikely candidate arguments.
+4. Identify the real arguments, often by a binary classifier.
+5. Multi-classify the results from step 4 to label the semantic roles. Steps 2 and 3 usually introduce hand-designed features based on the syntactic analysis (step 1).
+
+
+Fig 1. Syntax tree
+
+
+However, a complete syntactic analysis requires identifying the relationship among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, which makes SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis*, a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *shallow syntactic analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunks). To avoid the difficulty of constructing a syntax tree with high accuracy, some work\[[1](#reference)\] proposed semantic chunking-based SRL methods, reducing SRL to a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using a **BIO representation**. For the syntactic chunks forming a role A, the first chunk receives the tag B-A (Begin) and the remaining ones receive the tag I-A (Inside); chunks belonging to no role receive the tag O.
+
+The BIO representation of the above example is shown in Fig. 2.
+
+Fig 2. BIO representation
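+
+To make the BIO scheme concrete, here is a small, self-contained sketch. It is not part of the tutorial's own code; the sentence and the role spans are made up to mirror the training sample shown later in the Data Preparation section.
+
+```python
+# Illustrative only: convert labeled argument spans into BIO tags.
+# Spans are (start, end, role) with an exclusive end index.
+sentence = ["A", "record", "date", "has", "n't", "been", "set", "."]
+spans = [(0, 3, "A1"), (4, 5, "AM-NEG"), (6, 7, "V")]
+
+tags = ["O"] * len(sentence)
+for start, end, role in spans:
+    tags[start] = "B-" + role
+    for i in range(start + 1, end):
+        tags[i] = "I-" + role
+
+for word, tag in zip(sentence, tags):
+    print("%s\t%s" % (word, tag))  # e.g. "A  B-A1", "record  I-A1", ...
+```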
+
+This tagging example illustrates the simplicity of sequence tagging, since
+
+1. It relies only on shallow syntactic analysis, which lowers the precision requirement on the syntactic analysis;
+2. Pruning the candidate arguments is no longer necessary;
+3. Arguments are identified and tagged at the same time. Simplifying the workflow reduces the risk of accumulating errors; oftentimes, methods that unify multiple steps boost performance.
+
+In this tutorial, our SRL system is built as an end-to-end system via a neural network. The system takes only text sequences as input, without using any syntactic parsing results or complex hand-designed features. The public dataset [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) is used for the following task: given a sentence with predicates marked, identify the corresponding arguments and their semantic roles through sequence tagging.
+
+## Model
+
+**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs, LSTMs aim to model long-term dependencies in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
+
+### Stacked Recurrent Neural Network
+
+*Deep Neural Networks* can extract hierarchical representations. The higher layers can form relatively abstract/complex representations, based on primitive features discovered through the lower layers. Unfolding LSTMs through time results in a deep feed-forward neural network, because any computational path between the input at time $k < t$ and the output at time $t$ crosses several nonlinear layers. On the other hand, due to parameter sharing over time, LSTMs are also *shallow*; that is, the computation carried out at each time-step is just a linear transformation. Deep LSTM networks are typically constructed by stacking multiple LSTM layers on top of each other and taking the output from the lower LSTM layer at time $t$ as the input of the upper LSTM layer at time $t$. Deep, hierarchical neural networks can be efficient at representing some functions and modeling varying-length dependencies\[[2](#reference)\].
+
+
+However, in a deep LSTM network, any gradient propagated back in depth needs to traverse a large number of nonlinear steps. As a result, while LSTMs of 4 layers can be trained properly, those with 4-8 layers perform much worse. Conventional LSTMs prevent back-propagated errors from vanishing or exploding by introducing shortcut connections to skip the intermediate nonlinear layers. Therefore, deep LSTMs can consider shortcut connections in depth as well.
+
+
+A single LSTM cell has three operations:
+
+1. input-to-hidden: map input $x$ to the input of the forget gates, input gates, memory cells and output gates by linear transformation (i.e., matrix mapping);
+2. hidden-to-hidden: calculate the forget gates, input gates and output gates, and update the memory cell; this is the main part of LSTMs;
+3. hidden-to-output: this part typically involves an activation operation on hidden states.
+
+Based on the stacked LSTMs, we add shortcut connections: take the input-to-hidden from the previous layer as a new input and learn another linear transformation.
+
+Fig. 3 illustrates the final stacked recurrent neural networks.

+Fig 3. Stacked Recurrent Neural Networks

+
+### Bidirectional Recurrent Neural Network
+
+While LSTMs can summarize the history, they cannot see the future. Because most NLP (natural language processing) tasks provide the entirety of a sentence, sequential learning can benefit from having the future encoded as well as the history.
+
+To address this, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in the reversed direction with regard to its immediate lower LSTM layer, i.e., deep LSTM layers take turns processing input sequences left-to-right and right-to-left. Therefore, starting from the second layer, LSTM layers at time-step $t$ can see both the history and the future. Fig. 4 illustrates the bidirectional recurrent neural networks.
+

+Fig 4. Bidirectional LSTMs

+
+Note that this bidirectional RNN is different from the one proposed by Bengio et al. for machine translation tasks \[[3](#reference), [4](#reference)\]. We will introduce another bidirectional RNN in the following chapter on [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md).
+
+### Conditional Random Field (CRF)
+
+Typically, a neural network's lower layers learn representations while its very top layer accomplishes the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes representations provided by the last LSTM layer as input.
+
+
+The CRF is an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. In essence, CRFs learn the conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ are sequences of input and $Y = (y_1, y_2, ... , y_n)$ are label sequences; to decode, simply search through $Y$ for a sequence that maximizes the conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$.
+
+Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graphical model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig. 5.
+

+Fig 5. Linear Chain Conditional Random Field used in SRL tasks
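+
+Decoding, i.e. solving $Y^* = \mbox{arg max}_{Y} P(Y | X)$ over a linear chain, is typically done with the Viterbi algorithm. The tutorial itself uses `paddle.layer.crf_decoding` for this; purely as an illustration, here is a minimal numpy sketch over made-up state and transition scores:
+
+```python
+import numpy as np
+
+# Toy scores: state[i, y] scores tag y at position i;
+# trans[p, y] scores moving from tag p to tag y.
+state = np.array([[2.0, 0.5], [0.3, 1.5], [1.0, 1.2]])
+trans = np.array([[0.8, 0.2], [0.1, 0.9]])
+
+n, k = state.shape
+score = state[0].copy()
+back = np.zeros((n, k), dtype=int)
+for i in range(1, n):
+    cand = score[:, None] + trans + state[i]  # (prev tag) x (next tag)
+    back[i] = cand.argmax(axis=0)
+    score = cand.max(axis=0)
+
+path = [int(score.argmax())]       # best final tag
+for i in range(n - 1, 0, -1):      # follow back-pointers
+    path.append(int(back[i][path[-1]]))
+path.reverse()
+print(path)  # the highest-scoring tag sequence, here [0, 1, 1]
+```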

+ +By the fundamental theorem of random fields \[[5](#reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form: + +$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ + + +where, $Z(X)$ is normalization constant, ${t_j}$ represents the feature functions defined on edges called the *transition feature*, which denotes the transition probabilities from $y_{i-1}$ to $y_i$ given input sequence $X$. ${s_k}$ represents the feature function defined on nodes, called the state feature, denoting the probability of $y_i$ given input sequence $X$. In addition, $\lambda_j$ and $\mu_k$ are weights corresponding to $t_j$ and $s_k$. Alternatively, $t$ and $s$ can be written in the same form that depends on $y_{i - 1}$, $y_i$, $X$, and $i$. Taking its summation over all nodes $i$, we have: $f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$, which defines the *feature function* $f$. Thus, $P(Y|X)$ can be written as: + +$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ + +where $\omega$ are the weights to the feature function that the CRF learns. While training, given input sequences and label sequences $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, by maximum likelihood estimation (**MLE**), we construct the following objective function: + + +$$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ + + +This objective function can be solved via back-propagation in an end-to-end manner. While decoding, given input sequences $X$, search for sequence $\bar{Y}$ to maximize the conditional probability $\bar{P}(Y|X)$ via decoding methods (such as *Viterbi*, or [Beam Search Algorithm](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md#beam-search-algorithm)). + +### Deep Bidirectional LSTM (DB-LSTM) SRL model + +Given predicates and a sentence, SRL tasks aim to identify arguments of the given predicate and their semantic roles. If a sequence has $n$ predicates, we will process this sequence $n$ times. Here is the breakdown of a straight-forward model: + +1. Construct inputs; + - input 1: predicate, input 2: sentence + - expand input 1 into a sequence of the same length with input 2's sentence, using one-hot representation; +2. Convert the one-hot sequences from step 1 to vector sequences via a word embedding's lookup table; +3. Learn the representation of input sequences by taking vector sequences from step 2 as inputs; +4. Take the representation from step 3 as input, label sequence as a supervisory signal, and realize sequence tagging tasks. + +Here, we propose some improvements by introducing two simple but effective features: + +- predicate context (**ctx-p**): A single predicate word may not describe all the predicate information, especially when the same words appear multiple times in a sentence. With the expanded context, the ambiguity can be largely eliminated. Thus, we extract $n$ words before and after predicate to construct a window chunk. + +- region mark ($m_r$): The binary marker on a word, $m_r$, takes the value of $1$ when the word is in the predicate context region, and $0$ if not. + +After these modifications, the model is as follows, as illustrated in Figure 6: + +1. Construct inputs + - Input 1: word sequence. Input 2: predicate. 
Input 3: predicate context, the $n$ words before and after the predicate. Input 4: region mark sequence, where an entry is 1 if the word is located in the predicate context region, and 0 otherwise.
+   - expand inputs 2~3 into sequences of the same length as input 1
+2. Convert inputs 1~4 into vector sequences via word embedding lookup tables; inputs 1 and 3 share the same lookup table, while inputs 2 and 4 have separate lookup tables.
+3. Take the four vector sequences from step 2 as inputs to bidirectional LSTMs; train the LSTMs to update representations.
+4. Take the representation from step 3 as input to the CRF, with the label sequence as the supervisory signal, to complete the sequence tagging task.
+
+
+Fig 6. DB-LSTM for SRL tasks
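+
+To illustrate the two features introduced above, the following sketch builds the predicate context window (ctx-p) and the region mark sequence for one sentence. The helper is made up for this example, with $n=2$; the real preprocessing lives in `paddle.v2.dataset.conll05`.
+
+```python
+# Illustrative only: predicate context window and region marks.
+def make_features(words, pred_idx, n=2, pad="×"):
+    padded = [pad] * n + words + [pad] * n
+    ctx = padded[pred_idx:pred_idx + 2 * n + 1]  # n words on each side
+    marks = [1 if pred_idx - n <= i <= pred_idx + n else 0
+             for i in range(len(words))]
+    return ctx, marks
+
+words = ["A", "record", "date", "has", "n't", "been", "set", "."]
+ctx, marks = make_features(words, words.index("set"))
+print(ctx)    # ["n't", 'been', 'set', '.', '×']
+print(marks)  # [0, 0, 0, 0, 1, 1, 1, 1]
+```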
+ +## Data Preparation + +In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. Note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, for a usable neural network SRL system, please consider paying for the full corpus. + +The original data includes a variety of information such as POS tagging, naming entity recognition, syntax tree, etc. In this tutorial, we only use the data under `test.wsj/words/` (text sequence) and `test.wsj/props/` (label results). The data directory used in this tutorial is as follows: + +```text +conll05st-release/ +└── test.wsj + ├── props # label results + └── words # text sequence +``` + +The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](#references)\]. The labeling of the PropBank is different from the labeling methods mentioned before, but shares with it the same underlying principle. For descriptions of the labeling, please refer to the paper \[[9](#references)\]. + +The raw data needs to be preprocessed into formats that PaddlePaddle can handle. The preprocessing consists of the following steps: + +1. Merge the text sequence and the tag sequence into the same record; +2. If a sentence contains $n$ predicates, the sentence will be processed $n$ times into $n$ separate training samples, each sample with a different predicate; +3. Extract the predicate context and construct the predicate context region marker; +4. Construct the markings in BIO format; +5. Obtain the integer index corresponding to the word according to the dictionary. + +```python +# import paddle.v2.dataset.conll05 as conll05 +# conll05.corpus_reader does step 1 and 2 as mentioned above. +# conll05.reader_creator does step 3 to 5. +# conll05.test gets preprocessed training instances. +``` + +After preprocessing, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, label sequence. The following table is an example of a training sample. + +| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence| +|---|---|---|---|---| +| A | set | n't been set . × | 0 | B-A1 | +| record | set | n't been set . × | 0 | I-A1 | +| date | set | n't been set . × | 0 | I-A1 | +| has | set | n't been set . × | 0 | O | +| n't | set | n't been set . × | 1 | B-AM-NEG | +| been | set | n't been set . × | 1 | O | +| set | set | n't been set . × | 1 | B-V | +| . | set | n't been set . × | 1 | O | + +In addition to the data, we provide following resources: + +| filename | explanation | +|---|---| +| word_dict | dictionary of input sentences, total 44068 words | +| label_dict | dictionary of labels, total 106 labels | +| predicate_dict | predicate dictionary, total 3162 predicates | +| emb | a pre-trained word vector lookup table, 32-dimensional | + +We trained a language model on the English Wikipedia to get a word vector lookup table used to initialize the SRL model. While training the SRL model, the word vector lookup table is no longer updated. 
To learn more about the language model and the word vector lookup table, please refer to the tutorial [word vector](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.md). There are 995,000,000 tokens in the training corpus, and the dictionary size is 4,900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not among these 4,900,000 words, and we treat them all as unknown words, represented by `<unk>`.
+
+Here we fetch the dictionaries, and print their sizes:
+
+```python
+import math
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.evaluator as evaluator
+
+paddle.init(use_gpu=False, trainer_count=1)
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+print word_dict_len
+print label_dict_len
+print pred_len
+```
+
+## Model Configuration
+
+- Define input data dimensions and model hyperparameters.
+
+```python
+mark_dict_len = 2    # value range of region mark. Region mark is either 0 or 1, so range is 2
+word_dim = 32        # word vector dimension
+mark_dim = 5         # dimension of the region mark embedding
+hidden_dim = 512     # the dimension of the LSTM hidden layer vector is 128 (512/4)
+depth = 8            # depth of the stacked LSTM
+
+# There are 9 features per sample, so we will define 9 data layers.
+# The type of each layer is integer_value_sequence.
+def d_type(value_range):
+    return paddle.data_type.integer_value_sequence(value_range)
+
+# word sequence
+word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
+# predicate
+predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
+
+# 5 features for predicate context
+ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
+ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
+ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
+ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
+ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
+
+# region marker sequence
+mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
+
+# label sequence
+target = paddle.layer.data(name='target', type=d_type(label_dict_len))
+```
+
+Note that `hidden_dim = 512` means an LSTM hidden vector of 128 dimensions (512/4). Please refer to PaddlePaddle's official documentation for details: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory).
+
+- Transform the word sequence itself, the predicate, the predicate context, and the region mark sequence into embedded vector sequences.
+
+```python
+
+# Since the word vector lookup table is pre-trained, we won't update it during training.
+# is_static being True prevents updating the lookup table during training.
+emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True)
+# hyperparameter configurations
+default_std = 1 / math.sqrt(hidden_dim) / 3.0
+std_default = paddle.attr.Param(initial_std=default_std)
+std_0 = paddle.attr.Param(initial_std=0.)
+
+predicate_embedding = paddle.layer.embedding(
+    size=word_dim,
+    input=predicate,
+    param_attr=paddle.attr.Param(
+        name='vemb', initial_std=default_std))
+mark_embedding = paddle.layer.embedding(
+    size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [
+    paddle.layer.embedding(
+        size=word_dim, input=x, param_attr=emb_para) for x in word_input
+]
+emb_layers.append(predicate_embedding)
+emb_layers.append(mark_embedding)
+```
+
+- 8 LSTM units are trained in alternating left-to-right / right-to-left order, denoted by the variable `reverse`.
+
+```python
+hidden_0 = paddle.layer.mixed(
+    size=hidden_dim,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])
+
+mix_hidden_lr = 1e-3
+lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
+hidden_para_attr = paddle.attr.Param(
+    initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = paddle.layer.lstmemory(
+    input=hidden_0,
+    act=paddle.activation.Relu(),
+    gate_act=paddle.activation.Sigmoid(),
+    state_act=paddle.activation.Sigmoid(),
+    bias_attr=std_0,
+    param_attr=lstm_para_attr)
+
+# stack L-LSTM and R-LSTM with direct edges
+input_tmp = [hidden_0, lstm_0]
+
+for i in range(1, depth):
+    mix_hidden = paddle.layer.mixed(
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ])
+
+    lstm = paddle.layer.lstmemory(
+        input=mix_hidden,
+        act=paddle.activation.Relu(),
+        gate_act=paddle.activation.Sigmoid(),
+        state_act=paddle.activation.Sigmoid(),
+        reverse=((i % 2) == 1),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)
+
+    input_tmp = [mix_hidden, lstm]
+```
+
+- In PaddlePaddle, the state features and the transition features of a CRF are implemented by a fully connected layer and a CRF layer separately. The fully connected layer with linear activation learns the state features; here we use `paddle.layer.mixed` (`paddle.layer.fc` can be used as well). The CRF layer, `paddle.layer.crf`, only learns the transition features; it is a cost layer and is the last layer of the network. Given the input sequence, `paddle.layer.crf` outputs the log probability of the true tag sequence as the cost, and it requires the true tag sequence as its target during learning.
+
+```python
+
+# The output of the top LSTM unit and its input are fed into a fully connected layer
+# whose size equals the number of tag labels.
+# The fully connected layer learns the state features.
+
+feature_out = paddle.layer.mixed(
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        paddle.layer.full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)], )
+
+crf_cost = paddle.layer.crf(
+    size=label_dict_len,
+    input=feature_out,
+    label=target,
+    param_attr=paddle.attr.Param(
+        name='crfw',
+        initial_std=default_std,
+        learning_rate=mix_hidden_lr))
+```
+
+- The CRF decoding layer is used for evaluation and inference. It shares weights with the CRF layer; the sharing of parameters among multiple layers is specified by using the same parameter name in these layers.
If the true tag sequence is provided during training, `paddle.layer.crf_decoding` calculates the labelling error for each input token, and `evaluator.sum` sums the errors over the entire sequence. Otherwise, `paddle.layer.crf_decoding` generates the predicted tag sequence.
+
+```python
+crf_dec = paddle.layer.crf_decoding(
+    size=label_dict_len,
+    input=feature_out,
+    label=target,
+    param_attr=paddle.attr.Param(name='crfw'))
+evaluator.sum(input=crf_dec)
+```
+
+## Train model
+
+### Create Parameters
+
+All necessary parameters are created automatically by tracing the network topology from the output layer we use.
+
+```python
+parameters = paddle.parameters.create(crf_cost)
+```
+
+We can print out the parameter names. A name is generated automatically if it is not specified in the network configuration.
+
+```python
+print parameters.keys()
+```
+
+Now we load the pre-trained word lookup table from word embeddings trained on the English Wikipedia.
+
+```python
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
+```
+
+### Create Trainer
+
+We create the trainer given the model topology, parameters, and optimization method. We use the most basic **SGD** method: a momentum optimizer with zero momentum. Meanwhile, we set the learning rate and regularization.
+
+```python
+optimizer = paddle.optimizer.Momentum(
+    momentum=0,
+    learning_rate=1e-3,
+    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+    model_average=paddle.optimizer.ModelAverage(
+        average_window=0.5, max_average_window=10000), )
+
+trainer = paddle.trainer.SGD(cost=crf_cost,
+                             parameters=parameters,
+                             update_equation=optimizer,
+                             extra_layers=crf_dec)
+```
+
+### Training
+
+As mentioned in the data preparation section, we use the CoNLL 2005 test corpus as the training dataset. `conll05.test()` outputs one training instance at a time. The instances are shuffled, batched into mini-batches, and used as input.
+
+```python
+reader = paddle.batch(
+    paddle.reader.shuffle(
+        conll05.test(), buf_size=8192), batch_size=2)
+```
+
+`feeding` is used to specify the correspondence between data instances and data layers. For example, according to the following `feeding`, the 0th column of a data instance produced by `conll05.test()` is matched to the data layer named `word_data`.
+
+```python
+feeding = {
+    'word_data': 0,
+    'ctx_n2_data': 1,
+    'ctx_n1_data': 2,
+    'ctx_0_data': 3,
+    'ctx_p1_data': 4,
+    'ctx_p2_data': 5,
+    'verb_data': 6,
+    'mark_data': 7,
+    'target': 8
+}
+```
+
+`event_handler` can be used as a callback for training events; it is passed as an argument to the `train` method. The following `event_handler` prints the cost during training.
+
+```python
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id and event.batch_id % 10 == 0:
+            print "Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+        if event.batch_id % 400 == 0:
+            result = trainer.test(reader=reader, feeding=feeding)
+            print "\nTest with Pass %d, Batch %d, %s" % (event.pass_id, event.batch_id, result.metrics)
+
+    if isinstance(event, paddle.event.EndPass):
+        # save parameters
+        with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+
+        result = trainer.test(reader=reader, feeding=feeding)
+        print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+```
+
+`trainer.train` will train the model.
+
+```python
+trainer.train(
+    reader=reader,
+    event_handler=event_handler,
+    num_passes=10000,
+    feeding=feeding)
+```
+
+### Application
+
+When training is completed, we need to select an optimal model based on a performance metric we care about to do inference. In this task, one can simply select the model with the fewest labelling errors on the test set. The `paddle.layer.crf_decoding` layer is used for inference; unlike in training, its inputs do not include the ground-truth label.
+
+```python
+predict = paddle.layer.crf_decoding(
+    size=label_dict_len,
+    input=feature_out,
+    param_attr=paddle.attr.Param(name='crfw'))
+```
+
+Here we use one sample from the test set as an example.
+
+```python
+test_creator = paddle.dataset.conll05.test()
+test_data = []
+for item in test_creator():
+    test_data.append(item[0:8])
+    if len(test_data) == 1:
+        break
+```
+
+The inference interface `paddle.infer` returns the indices of the predicted labels. We then look them up in the reversed dictionary `labels_reverse` and print the tags.
+
+
+```python
+labs = paddle.infer(
+    output_layer=predict, parameters=parameters, input=test_data, field='id')
+assert len(labs) == len(test_data[0][0])
+labels_reverse = {}
+for (k, v) in label_dict.items():
+    labels_reverse[v] = k
+pre_lab = [labels_reverse[i] for i in labs]
+print pre_lab
+```
+
+## Conclusion
+
+Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to illustrate how to do sequence tagging tasks with PaddlePaddle. The model comes from our published paper\[[10](#references)\]. We only use test data for illustration, since the training data of the CoNLL 2005 dataset is not completely public. Our aim was an end-to-end neural network model that depends less on other natural language processing tools yet performs comparably to, or even better than, traditional models. Please check out our paper for more information and discussions.
+
+## References
+1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
+2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013.
+3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
+4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014.
+5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the eighteenth international conference on machine learning, ICML. 2001, 1: 282-289.
+6. Li Hang. Statistical Learning Methods (统计学习方法)[M]. Tsinghua University Press, Beijing, 2012.
+7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational linguistics, 1993, 19(2): 313-330.
+8. Palmer M, Gildea D, Kingsbury P. 
[The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational linguistics, 2005, 31(1): 71-106. +9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164. +10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. + +
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/label_semantic_roles/image/bidirectional_stacked_lstm.png b/07.label_semantic_roles/image/bidirectional_stacked_lstm.png similarity index 100% rename from label_semantic_roles/image/bidirectional_stacked_lstm.png rename to 07.label_semantic_roles/image/bidirectional_stacked_lstm.png diff --git a/07.label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/07.label_semantic_roles/image/bidirectional_stacked_lstm_en.png new file mode 100755 index 0000000000000000000000000000000000000000..f0a195c24d9ee493f96bb93c28a99e70566be7a4 Binary files /dev/null and b/07.label_semantic_roles/image/bidirectional_stacked_lstm_en.png differ diff --git a/07.label_semantic_roles/image/bio_example.png b/07.label_semantic_roles/image/bio_example.png new file mode 100755 index 0000000000000000000000000000000000000000..e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20 Binary files /dev/null and b/07.label_semantic_roles/image/bio_example.png differ diff --git a/07.label_semantic_roles/image/bio_example_en.png b/07.label_semantic_roles/image/bio_example_en.png new file mode 100755 index 0000000000000000000000000000000000000000..93b44dd4874402ef29ad7bd7d94147609b92e309 Binary files /dev/null and b/07.label_semantic_roles/image/bio_example_en.png differ diff --git a/label_semantic_roles/image/db_lstm_network.png b/07.label_semantic_roles/image/db_lstm_network.png similarity index 100% rename from label_semantic_roles/image/db_lstm_network.png rename to 07.label_semantic_roles/image/db_lstm_network.png diff --git a/07.label_semantic_roles/image/db_lstm_network_en.png b/07.label_semantic_roles/image/db_lstm_network_en.png new file mode 100755 index 0000000000000000000000000000000000000000..c3646312e48db977402fb353dc0c9b4d02269bf4 Binary files /dev/null and b/07.label_semantic_roles/image/db_lstm_network_en.png differ diff --git a/07.label_semantic_roles/image/dependency_parsing.png b/07.label_semantic_roles/image/dependency_parsing.png new file mode 100755 index 0000000000000000000000000000000000000000..9265b671735940ed6549e2980064d2ce08baae64 Binary files /dev/null and b/07.label_semantic_roles/image/dependency_parsing.png differ diff --git a/07.label_semantic_roles/image/dependency_parsing_en.png b/07.label_semantic_roles/image/dependency_parsing_en.png new file mode 100755 index 0000000000000000000000000000000000000000..23f4f45b603e3d60702af2b2464d10fc8deed061 Binary files /dev/null and b/07.label_semantic_roles/image/dependency_parsing_en.png differ diff --git a/label_semantic_roles/image/linear_chain_crf.png b/07.label_semantic_roles/image/linear_chain_crf.png similarity index 100% rename from label_semantic_roles/image/linear_chain_crf.png rename to 07.label_semantic_roles/image/linear_chain_crf.png diff --git a/label_semantic_roles/image/stacked_lstm.png b/07.label_semantic_roles/image/stacked_lstm.png similarity index 100% rename from label_semantic_roles/image/stacked_lstm.png rename to 07.label_semantic_roles/image/stacked_lstm.png diff --git a/07.label_semantic_roles/image/stacked_lstm_en.png b/07.label_semantic_roles/image/stacked_lstm_en.png new file mode 100755 index 0000000000000000000000000000000000000000..0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61 Binary files /dev/null and b/07.label_semantic_roles/image/stacked_lstm_en.png differ diff --git a/label_semantic_roles/index.html b/07.label_semantic_roles/index.cn.html similarity index 59% rename from 
label_semantic_roles/index.html rename to 07.label_semantic_roles/index.cn.html index fca99fedb6d5b5788ab4b31e75287192cd175584..540fb0b13a32609bfe3ce2f0a8f93bf34de99b67 100644 --- a/label_semantic_roles/index.html +++ b/07.label_semantic_roles/index.cn.html @@ -1,3 +1,4 @@ + - - + + + + + +
+ + + + + + + diff --git a/label_semantic_roles/train.py b/07.label_semantic_roles/train.py similarity index 61% rename from label_semantic_roles/train.py rename to 07.label_semantic_roles/train.py index f88a709009989f8111bb1ad8e43f7d820bb957c0..d7ca2d097033b0e9b957b5803f8193793794611a 100644 --- a/label_semantic_roles/train.py +++ b/07.label_semantic_roles/train.py @@ -1,25 +1,31 @@ -import math +import math, os import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.evaluator as evaluator +with_gpu = os.getenv('WITH_GPU', '0') != '0' -def db_lstm(): - word_dict, verb_dict, label_dict = conll05.get_dict() - word_dict_len = len(word_dict) - label_dict_len = len(label_dict) - pred_len = len(verb_dict) +word_dict, verb_dict, label_dict = conll05.get_dict() +word_dict_len = len(word_dict) +label_dict_len = len(label_dict) +pred_len = len(verb_dict) - mark_dict_len = 2 - word_dim = 32 - mark_dim = 5 - hidden_dim = 512 - depth = 8 +mark_dict_len = 2 +word_dim = 32 +mark_dim = 5 +hidden_dim = 512 +depth = 8 +default_std = 1 / math.sqrt(hidden_dim) / 3.0 +mix_hidden_lr = 1e-3 - #8 features - def d_type(size): - return paddle.data_type.integer_value_sequence(size) +def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + +def db_lstm(): + #8 features word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) @@ -30,25 +36,21 @@ def db_lstm(): ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) std_0 = paddle.attr.Param(initial_std=0.) 
- default_std = 1 / math.sqrt(hidden_dim) / 3.0 std_default = paddle.attr.Param(initial_std=default_std) predicate_embedding = paddle.layer.embedding( size=word_dim, input=predicate, - param_attr=paddle.attr.Param( - name='vemb', initial_std=default_std)) + param_attr=paddle.attr.Param(name='vemb', initial_std=default_std)) mark_embedding = paddle.layer.embedding( size=mark_dim, input=mark, param_attr=std_0) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ - paddle.layer.embedding( - size=word_dim, input=x, param_attr=emb_para) for x in word_input + paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para) + for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -61,7 +63,6 @@ def db_lstm(): input=emb, param_attr=std_default) for emb in emb_layers ]) - mix_hidden_lr = 1e-3 lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) hidden_para_attr = paddle.attr.Param( initial_std=default_std, learning_rate=mix_hidden_lr) @@ -109,22 +110,7 @@ def db_lstm(): input=input_tmp[1], param_attr=lstm_para_attr) ], ) - crf_cost = paddle.layer.crf(size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param( - name='crfw', - initial_std=default_std, - learning_rate=mix_hidden_lr)) - - crf_dec = paddle.layer.crf_decoding( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=paddle.attr.Param(name='crfw')) - - return crf_cost, crf_dec + return feature_out def load_parameter(file_name, h, w): @@ -134,13 +120,27 @@ def load_parameter(file_name, h, w): def main(): - paddle.init(use_gpu=False, trainer_count=1) + paddle.init(use_gpu=with_gpu, trainer_count=1) # define network topology - crf_cost, crf_dec = db_lstm() + feature_out = db_lstm() + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + crf_cost = paddle.layer.crf( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + evaluator.sum(input=crf_dec) # create parameters - parameters = paddle.parameters.create([crf_cost, crf_dec]) + parameters = paddle.parameters.create(crf_cost) parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) # create optimizer @@ -151,15 +151,19 @@ def main(): model_average=paddle.optimizer.ModelAverage( average_window=0.5, max_average_window=10000), ) - trainer = paddle.trainer.SGD(cost=crf_cost, - parameters=parameters, - update_equation=optimizer) + trainer = paddle.trainer.SGD( + cost=crf_cost, + parameters=parameters, + update_equation=optimizer, + extra_layers=crf_dec) reader = paddle.batch( - paddle.reader.shuffle( - conll05.test(), buf_size=8192), batch_size=10) + paddle.reader.shuffle(conll05.test(), buf_size=8192), batch_size=10) - reader_dict = { + test_reader = paddle.batch( + paddle.reader.shuffle(conll05.test(), buf_size=8192), batch_size=10) + + feeding = { 'word_data': 0, 'ctx_n2_data': 1, 'ctx_n1_data': 2, @@ -174,14 +178,50 @@ def main(): def event_handler(event): if isinstance(event, paddle.event.EndIteration): if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f" % ( - event.pass_id, event.batch_id, event.cost) + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + if event.batch_id % 1000 == 0: + 
result = trainer.test(reader=test_reader, feeding=feeding) + print "\nTest with Pass %d, Batch %d, %s" % ( + event.pass_id, event.batch_id, result.metrics) + + if isinstance(event, paddle.event.EndPass): + # save parameters + with open('params_pass_%d.tar' % event.pass_id, 'w') as f: + trainer.save_parameter_to_tar(f) + + result = trainer.test(reader=test_reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) trainer.train( reader=reader, event_handler=event_handler, - num_passes=10000, - reader_dict=reader_dict) + num_passes=1, + feeding=feeding) + + test_creator = paddle.dataset.conll05.test() + test_data = [] + for item in test_creator(): + test_data.append(item[0:8]) + if len(test_data) == 1: + break + + predict = paddle.layer.crf_decoding( + size=label_dict_len, + input=feature_out, + param_attr=paddle.attr.Param(name='crfw')) + probs = paddle.infer( + output_layer=predict, + parameters=parameters, + input=test_data, + feeding=feeding, + field='id') + assert len(probs) == len(test_data[0][0]) + labels_reverse = {} + for (k, v) in label_dict.items(): + labels_reverse[v] = k + pre_lab = [labels_reverse[i] for i in probs] + print pre_lab if __name__ == '__main__': diff --git a/machine_translation/.gitignore b/08.machine_translation/.gitignore similarity index 100% rename from machine_translation/.gitignore rename to 08.machine_translation/.gitignore diff --git a/08.machine_translation/README.cn.md b/08.machine_translation/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d91d01998086f4b854d8f5f836dfb998be39f5b1 --- /dev/null +++ b/08.machine_translation/README.cn.md @@ -0,0 +1,512 @@ +# 机器翻译 + +本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书),更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/179.html)。 + +## 背景介绍 + +机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 + +早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 + +为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 + +近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 +

+图1. 基于神经网络的机器翻译系统

+
+本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。
+
+## 效果展示
+
+以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子:
+```text
+这些 是 希望 的 曙光 和 解脱 的 迹象 .
+```
+如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下:
+```text
+0 -5.36816 These are signs of hope and relief . <e>
+1 -6.23177 These are the light of hope and relief . <e>
+2 -7.7914 These are the light of hope and the relief of hope . <e>
+```
+- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。
+- 另外有两个特殊标志:`<e>`表示句子的结尾,`<unk>`表示未登录词(unknown word),即未在训练字典中出现的词。
+
+## 模型概览
+
+本节依次介绍GRU(Gated Recurrent Unit,门控循环单元),双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架和注意力(Attention)机制,以及柱搜索(beam search)算法。
+
+### GRU
+
+我们已经在[情感分析](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md)一章中介绍了循环神经网络(RNN)及长短时间记忆网络(LSTM)。相比于简单的RNN,LSTM增加了记忆单元(memory cell)、输入门(input gate)、遗忘门(forget gate)及输出门(output gate),这些门及记忆单元组合起来大大提升了RNN处理远距离依赖问题的能力。
+
+GRU\[[2](#参考文献)\]是Cho等人在LSTM上提出的简化版本,也是RNN的一种扩展,如下图所示。GRU单元只有两个门:
+- 重置门(reset gate):如果重置门关闭,会忽略掉历史信息,即历史不相干的信息不会影响未来的输出。
+- 更新门(update gate):将LSTM的输入门和遗忘门合并,用于控制历史信息对当前时刻隐层输出的影响。如果更新门接近1,会把历史信息传递下去。
+

+图2. GRU(门控循环单元)
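+
+下面用 numpy 给出一个极简的 GRU 单元示意,仅用于说明重置门和更新门的计算方式(为简洁省略了偏置项,权重为随机示例,并非教程正式代码):
+
+```python
+import numpy as np
+
+def sigmoid(v):
+    return 1.0 / (1.0 + np.exp(-v))
+
+def gru_step(x, h_prev, Wr, Ur, Wz, Uz, Wh, Uh):
+    r = sigmoid(np.dot(Wr, x) + np.dot(Ur, h_prev))            # 重置门
+    z = sigmoid(np.dot(Wz, x) + np.dot(Uz, h_prev))            # 更新门
+    h_tilde = np.tanh(np.dot(Wh, x) + np.dot(Uh, r * h_prev))  # 候选隐层状态
+    # 更新门 z 接近 1 时,更多地保留历史信息 h_prev
+    return z * h_prev + (1.0 - z) * h_tilde
+
+dim_in, dim_h = 4, 3
+rng = np.random.RandomState(0)
+Ws = [rng.randn(dim_h, d) * 0.1 for d in (dim_in, dim_h) * 3]
+h = np.zeros(dim_h)
+for t in range(5):            # 依次读入 5 个输入向量
+    h = gru_step(rng.randn(dim_in), h, *Ws)
+print(h)
+```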

+ +一般来说,具有短距离依赖属性的序列,其重置门比较活跃;相反,具有长距离依赖属性的序列,其更新门比较活跃。另外,Chung等人\[[3](#参考文献)\]通过多组实验表明,GRU虽然参数更少,但是在多个任务上都和LSTM有相近的表现。 + +### 双向循环神经网络 + +我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 + +具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵($W_1, W_3$),隐层到隐层自己的权重矩阵($W_2,W_5$),前向隐层和后向隐层到输出层的权重矩阵($W_4, W_6$)。注意,该网络的前向隐层和后向隐层之间没有连接。 + +

+图3. 按时间步展开的双向循环神经网络
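+
+下面是一个省略输出层(即 $W_4, W_6$)的最小 numpy 示意,说明前向和后向 RNN 如何拼接出每个时刻的上下文表示(权重为随机示例,仅用于说明):
+
+```python
+import numpy as np
+
+def simple_rnn(xs, W_in, W_hh, reverse=False):
+    # 最简单的 RNN:h_t = tanh(W_in·x_t + W_hh·h_{t-1})
+    hs, h = [], np.zeros(W_hh.shape[0])
+    seq = xs[::-1] if reverse else xs
+    for x in seq:
+        h = np.tanh(np.dot(W_in, x) + np.dot(W_hh, h))
+        hs.append(h)
+    return hs[::-1] if reverse else hs  # 恢复时间顺序
+
+rng = np.random.RandomState(0)
+d_in, d_h = 4, 3
+xs = [rng.randn(d_in) for _ in range(6)]
+W1, W2 = rng.randn(d_h, d_in), rng.randn(d_h, d_h)  # 前向:输入->隐层、隐层->隐层
+W3, W5 = rng.randn(d_h, d_in), rng.randn(d_h, d_h)  # 后向:输入->隐层、隐层->隐层
+fwd = simple_rnn(xs, W1, W2)
+bwd = simple_rnn(xs, W3, W5, reverse=True)
+# 每个时刻的表示同时包含过去(前向)和未来(后向)的上下文
+outputs = [np.concatenate([f, b]) for f, b in zip(fwd, bwd)]
+print(outputs[0].shape)  # (6,)
+```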

+ +### 编码器-解码器框架 + +编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 +

+图4. 编码器-解码器框架

+ +#### 编码器 + +编码阶段分为三步: + +1. one-hot vector表示:将源语言句子$x=\left \{ x_1,x_2,...,x_T \right \}$的每个词$x_i$表示成一个列向量$w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$。这个向量$w_i$的维度与词汇表大小$\left | V \right |$ 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 + +2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为$C\epsilon R^{K\times \left | V \right |}$,用$s_i=Cw_i$表示第$i$个词的词向量,$K$为向量维度。 + +3. 用RNN编码源语言词序列:这一过程的计算公式为$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$,其中$h_0$是一个全零的向量,$\varnothing _\theta$是一个非线性激活函数,最后得到的$\mathbf{h}=\left \{ h_1,..., h_T \right \}$就是RNN依次读入源语言$T$个词的状态编码序列。整句话的向量表示可以采用$\mathbf{h}$在最后一个时间步$T$的状态编码,或使用时间维上的池化(pooling)结果。 + +第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列$(x_1,x_2,...,x_T)$的顺序依次编码源语言端词,并得到一系列隐层状态$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$。类似的,后向GRU按照$(x_T,x_{T-1},...,x_1)$的顺序依次编码源语言端词,得到$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$。最后对于词$x_i$,通过拼接两个GRU的结果得到它的隐层状态,即$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$。 + +

+图5. 使用双向GRU的编码器
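+
+顺带用一个小数值例子说明编码阶段第2步中 $s_i=Cw_i$ 的含义:用映射矩阵乘 one-hot 向量,等价于直接取出矩阵的一列,这正是词向量查表(embedding lookup)操作(数值仅为演示):
+
+```python
+import numpy as np
+
+V, K = 10, 4                   # 词表大小 |V| 与词向量维度 K
+rng = np.random.RandomState(0)
+C = rng.randn(K, V)            # 映射矩阵 C,形状为 K x |V|
+
+i = 7                          # 某个词在词表中的下标
+w = np.zeros(V)
+w[i] = 1.0                     # one-hot 列向量 w_i
+s = np.dot(C, w)               # s_i = C w_i
+
+# 矩阵乘 one-hot 等价于直接取 C 的第 i 列
+assert np.allclose(s, C[:, i])
+```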

+ +#### 解码器 + +机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: + +1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)$c$、真实目标语言序列的第$i$个词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。计算公式如下: + + $$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$ + + 其中$\phi _{\theta '}$是一个非线性激活函数;$c=q\mathbf{h}$是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记``,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 + +2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。概率分布公式如下: + + $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ + + 其中$W_sz_{i+1}+b_z$是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第$i+1$个词的概率$p_{i+1}$。 + +3. 根据$p_{i+1}$和$u_{i+1}$计算代价。 +4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 + +机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 + +### 注意力机制 + +如果编码阶段的输出是一个固定维度的向量,会带来以下两个问题:1)不论源语言序列的长度是5个词还是50个词,如果都用固定维度的向量去编码其中的语义和句法结构信息,对模型来说是一个非常高的要求,特别是对长句子序列而言;2)直觉上,当人类翻译一句话时,会对与当前译文更相关的源语言片段上给予更多关注,且关注点会随着翻译的进行而改变。而固定维度的向量则相当于,任何时刻都对源语言所有信息给予了同等程度的关注,这是不合理的。因此,Bahdanau等人\[[4](#参考文献)\]引入注意力(attention)机制,可以对编码后的上下文片段进行解码,以此来解决长句子的特征学习问题。下面介绍在注意力机制下的解码器结构。 + +与简单的解码器不同,这里$z_i$的计算公式为: + +$$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$$ + +可见,源语言句子的编码向量表示为第$i$个词的上下文片段$c_i$,即针对每一个目标语言中的词$u_i$,都有一个特定的$c_i$与之对应。$c_i$的计算公式如下: + +$$c_i=\sum _{j=1}^{T}a_{ij}h_j, a_i=\left[ a_{i1},a_{i2},...,a_{iT}\right ]$$ + +从公式中可以看出,注意力机制是通过对编码器中各时刻的RNN状态$h_j$进行加权平均实现的。权重$a_{ij}$表示目标语言中第$i$个词对源语言中第$j$个词的注意力大小,$a_{ij}$的计算公式如下: + +\begin{align} +a_{ij}&=\frac{exp(e_{ij})}{\sum_{k=1}^{T}exp(e_{ik})}\\\\ +e_{ij}&=align(z_i,h_j)\\\\ +\end{align} + +其中,$align$可以看作是一个对齐模型,用来衡量目标语言中第$i$个词和源语言中第$j$个词的匹配程度。具体而言,这个程度是通过解码RNN的第$i$个隐层状态$z_i$和源语言句子的第$j$个上下文片段$h_j$计算得到的。传统的对齐模型中,目标语言的每个词明确对应源语言的一个或多个词(hard alignment);而在注意力模型中采用的是soft alignment,即任何两个目标语言和源语言词间均存在一定的关联,且这个关联强度是由模型计算得到的实数,因此可以融入整个NMT框架,并通过反向传播算法进行训练。 + +

+图6. 基于注意力机制的解码器
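+
+下面用 numpy 给出注意力权重 $a_{ij}$ 和上下文向量 $c_i$ 计算的极简示意。注意:这里用双线性打分代替 $align$ 函数只是为了演示;Bahdanau 等人\[[4](#参考文献)\]实际使用一个小的前馈网络来计算 $e_{ij}$:
+
+```python
+import numpy as np
+
+def attention_context(z_i, hs, Wa):
+    # e_ij = align(z_i, h_j),这里以双线性打分 z_i·Wa·h_j 为例
+    e = np.array([np.dot(z_i, np.dot(Wa, h)) for h in hs])
+    a = np.exp(e - e.max())
+    a /= a.sum()                        # softmax 得到注意力权重 a_ij
+    c = np.dot(a, np.stack(hs))         # c_i = Σ_j a_ij · h_j
+    return c, a
+
+rng = np.random.RandomState(0)
+hs = [rng.randn(5) for _ in range(4)]   # 编码器各时刻的状态 h_j
+z = rng.randn(3)                        # 解码器当前隐层状态 z_i
+c, a = attention_context(z, hs, rng.randn(3, 5))
+print(a.sum())                          # 注意力权重之和为 1
+```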

+
+### 柱搜索算法
+
+柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(`<s>`, `<e>`, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。
+
+柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。
+
+使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是:
+
+1. 每一个时刻,根据源语言句子的编码信息$c$、生成的第$i$个目标语言序列单词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。
+2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。
+3. 根据$p_{i+1}$采样出单词$u_{i+1}$。
+4. 重复步骤1~3,直到获得句子结束标记`<e>`或超过句子的最大生成长度为止。
+
+注意:$z_{i+1}$和$p_{i+1}$的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。
+
+## 数据介绍
+
+本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。
+
+### 数据预处理
+
+我们的预处理流程包括两步:
+- 将每个源语言到目标语言的平行语料库文件合并为一个文件:
+  - 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。
+  - `XXX`中的第$i$行内容为`XXX.src`中的第$i$行和`XXX.trg`中的第$i$行连接,用'\t'分隔。
+- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号`<s>`(序列的开始)、`<e>`(序列的结束)和`<unk>`(未登录词)。
+
+### 示例数据
+
+因为完整的数据集数据量较大,为了验证训练流程,PaddlePaddle接口paddle.dataset.wmt14中默认提供了一个经过预处理的[较小规模的数据集](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz)。
+
+该数据集有193319条训练数据,6003条测试数据,词典长度为30000。因为数据规模限制,使用该数据集训练出来的模型效果无法保证。
+
+## 流程说明
+
+### paddle初始化
+
+```python
+# 加载 paddle的python包
+import sys
+import paddle.v2 as paddle
+
+# 配置只使用cpu,并且使用一个cpu进行训练
+paddle.init(use_gpu=False, trainer_count=1)
+# 训练模式False,生成模式True
+is_generating = False
+```
+
+### 模型结构
+1. 首先,定义了一些全局变量。
+
+   ```python
+   dict_size = 30000 # 字典维度
+   source_dict_dim = dict_size # 源语言字典维度
+   target_dict_dim = dict_size # 目标语言字典维度
+   word_vector_dim = 512 # 词向量维度
+   encoder_size = 512 # 编码器中的GRU隐层大小
+   decoder_size = 512 # 解码器中的GRU隐层大小
+   beam_size = 3 # 柱宽度
+   max_length = 250 # 生成句子的最大长度
+   ```
+
+2. 其次,实现编码器框架。分为三步:
+
+   - 输入是一个文字序列,被表示成整型的序列。序列中每个元素是文字在字典中的索引。所以,我们定义数据层的数据类型为`integer_value_sequence`(整型序列),序列中每个元素的范围是`[0, source_dict_dim)`。
+
+   ```python
+   src_word_id = paddle.layer.data(
+       name='source_language_word',
+       type=paddle.data_type.integer_value_sequence(source_dict_dim))
+   ```
+   - 将上述编码映射到低维语义空间的词向量$\mathbf{s}$。
+
+   ```python
+   src_embedding = paddle.layer.embedding(
+       input=src_word_id, size=word_vector_dim)
+   ```
+   - 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到$\mathbf{h}$。
+
+   ```python
+   src_forward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size)
+   src_backward = paddle.networks.simple_gru(
+       input=src_embedding, size=encoder_size, reverse=True)
+   encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+   ```
+
+3. 
接着,定义基于注意力机制的解码器框架。分为三步: + + - 对源语言序列编码后的结果(见2的最后一步),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。 + + ```python + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) + ``` + + - 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。 + + ```python + backward_first = paddle.layer.first_seq(input=src_backward) + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) + ``` + + - 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。 + - decoder_mem记录了前一个时间步的隐层状态$z_i$,其初始状态是decoder_boot。 + - context通过调用`simple_attention`函数,实现公式$c_i=\sum {j=1}^{T}a_{ij}h_j$。其中,enc_vec是$h_j$,enc_proj是$h_j$的映射(见3.1),权重$a_{ij}$的计算已经封装在`simple_attention`函数中。 + - decoder_inputs融合了$c_i$和当前目标词current_word(即$u_i$)的表示。 + - gru_step通过调用`gru_step_layer`函数,在decoder_inputs和decoder_mem上做了激活操作,即实现公式$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$。 + - 最后,使用softmax归一化计算单词的概率,将out结果返回,即实现公式$p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$。 + + ```python + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) + return out + ``` + +4. 定义解码器框架名字,和`gru_decoder_with_attention`函数的前两个输入。注意:这两个输入使用`StaticInput`,具体说明可见[StaticInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。 + + ```python + decoder_group_name = "decoder_group" + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) + group_inputs = [group_input1, group_input2] + ``` + +5. 训练模式下的解码器调用: + + - 首先,将目标语言序列的词向量trg_embedding,直接作为训练模式下的current_word传给`gru_decoder_with_attention`函数。 + - 其次,使用`recurrent_group`函数循环调用`gru_decoder_with_attention`函数。 + - 接着,使用目标语言的下一个词序列作为标签层lbl,即预测目标词。 + - 最后,用多类交叉熵损失函数`classification_cost`来计算损失值。 + + ```python + if not is_generating: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. 
+ decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name='target_language_next_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + ``` + +6. 生成模式下的解码器调用: + + - 首先,在序列生成任务中,由于解码阶段的RNN总是引用上一时刻生成出的词的词向量,作为当前时刻的输入,因此,使用`GeneratedInput`来自动完成这一过程。具体说明可见[GeneratedInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。 + - 其次,使用`beam_search`函数循环调用`gru_decoder_with_attention`函数,生成出序列id。 + + ```python + if is_generating: + # In generation, the decoder predicts a next target word based on + # the encoded source sequence and the previous generated target word. + + # The encoded source sequence (encoder's output) must be specified by + # StaticInput, which is a read-only memory. + # Embedding of the previous generated word is automatically retrieved + # by GeneratedInputs initialized by a start mark . + + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + ``` + +注意:我们提供的配置在Bahdanau的论文\[[4](#参考文献)\]上做了一些简化,可参考[issue #1133](https://github.com/PaddlePaddle/Paddle/issues/1133)。 + +### 训练模型 + +1. 参数定义 + + 依据模型配置的`cost`定义模型参数。可以打印参数名字,如果在网络配置中没有指定名字,则默认生成。 + + ```python + if not is_generating: + parameters = paddle.parameters.create(cost) + for param in parameters.keys(): + print param + ``` + +2. 数据定义 + + 获取wmt14的dataset reader。 + + ```python + if not is_generating: + wmt14_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192), + batch_size=5) + ``` + +3. 构造trainer + + 根据优化目标cost,网络拓扑结构和模型参数来构造出trainer用来训练,在构造时还需指定优化方法,这里使用最基本的SGD方法。 + + ```python + if not is_generating: + optimizer = paddle.optimizer.Adam( + learning_rate=5e-5, + regularization=paddle.optimizer.L2Regularization(rate=8e-4)) + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + ``` + +4. 构造event_handler + + 可以通过自定义回调函数来评估训练过程中的各种状态,比如错误率等。下面的代码通过event.batch_id % 2 == 0 指定每2个batch打印一次日志,包含cost等信息。 + + ```python + if not is_generating: + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 2 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + ``` + +5. 启动训练 + + ```python + if not is_generating: + trainer.train( + reader=wmt14_reader, event_handler=event_handler, num_passes=2) + ``` + + 训练开始后,可以观察到event_handler输出的日志如下: + ```text + Pass 0, Batch 0, Cost 148.444983, {'classification_error_evaluator': 1.0} + ......... + Pass 0, Batch 10, Cost 335.896802, {'classification_error_evaluator': 0.9325153231620789} + ......... + ``` + +### 生成模型 + +1. 加载预训练的模型 + + 由于NMT模型的训练非常耗时,我们在50个物理节点(每节点含有2颗6核CPU)的集群中,花了5天时间训练了一个模型供大家直接下载使用。该模型大小为205MB,[BLEU评估](#BLEU评估)值为26.92。 + + ```python + if is_generating: + parameters = paddle.dataset.wmt14.model() + ``` +2. 
+2. 数据定义
+
+   从wmt14的生成集中读取前3个样本作为源语言句子。
+
+   ```python
+   if is_generating:
+       gen_creator = paddle.dataset.wmt14.gen(dict_size)
+       gen_data = []
+       gen_num = 3
+       for item in gen_creator():
+           gen_data.append((item[0], ))
+           if len(gen_data) == gen_num:
+               break
+   ```
+3. 构造infer
+
+   根据网络拓扑结构和模型参数构造出infer用来生成,在预测时还需要指定输出域`field`,这里使用生成句子的概率`prob`和句子中每个词的`id`。
+
+   ```python
+   if is_generating:
+       beam_result = paddle.infer(
+           output_layer=beam_gen,
+           parameters=parameters,
+           input=gen_data,
+           field=['prob', 'id'])
+   ```
+
+4. 打印生成结果
+
+   根据源/目标语言字典,将源语言句子和`beam_size`个生成句子打印输出。
+
+   ```python
+   if is_generating:
+       # load the dictionary
+       src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+       gen_sen_idx = np.where(beam_result[1] == -1)[0]
+       assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+       # -1 is the delimiter of generated sequences.
+       # the first element of each generated sequence is its length.
+       start_pos, end_pos = 1, 0
+       for i, sample in enumerate(gen_data):
+           print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
+           for j in xrange(beam_size):
+               end_pos = gen_sen_idx[i * beam_size + j]
+               print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                   trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+               start_pos = end_pos + 2
+           print("\n")
+   ```
+
+   生成开始后,可以观察到输出的日志如下:
+   ```text
+   Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+   -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake .
+   -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake .
+   -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play .
+   ```
+
+## 总结
+
+端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架和“注意力”机制。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。
+
+## 参考文献
+
+1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009.
+2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
+3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014.
+4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015.
+5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318.
+
+<br/>
+知识共享许可协议
本教程由PaddlePaddle创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。
diff --git a/08.machine_translation/README.md b/08.machine_translation/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e11297a895a638aa5a57f2c56f175dd0f6c29966
--- /dev/null
+++ b/08.machine_translation/README.md
@@ -0,0 +1,555 @@
+# Machine Translation
+
+The source code of this tutorial is live at [book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation). Please refer to the [book running tutorial](https://github.com/PaddlePaddle/book#running-the-book) for getting started with Paddle.
+
+## Background
+
+Machine translation (MT) leverages computers to translate from one language to another. The language to be translated is referred to as the source language, while the language to be translated into is referred to as the target language. Thus, machine translation is the process of translating from the source language to the target language. It is one of the most important research topics in the field of natural language processing.
+
+
+Early machine translation systems were mainly rule-based, i.e., they relied on language experts to specify the translation rules between two languages. Since it is already hard for such rules to cover all the variations within a single language, specifying complete rule sets across two or more languages is harder still. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#references)\].
+
+
+To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of relying on rules designed by language experts. While they overcome the bottleneck of knowledge acquisition, quite a few challenges remain, for example:
+
+1. Human-designed features cannot cover all possible linguistic variations;
+
+2. It is difficult to use global features;
+
+3. The techniques heavily rely on pre-processing, such as word alignment, word segmentation and tokenization, rule extraction and syntactic parsing. The error introduced in any of these steps could accumulate and impact translation quality.
+
+
+
+The recent development of deep learning provides new solutions to these challenges. The two main categories of deep learning based machine translation techniques are:
+
+1. Techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
+
+2. Techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
+
+<p align="center">
+<br/>
+Figure 1. Neural Network based Machine Translation
+</p>
+ + +This tutorial will mainly introduce an NMT model and how to use PaddlePaddle to train it. + +## Illustrative Results + +Let's consider an example of Chinese-to-English translation. The model is given the following segmented sentence in Chinese +```text +这些 是 希望 的 曙光 和 解脱 的 迹象 . +``` +After training and with a beam-search size of 3, the generated translations are as follows: +```text +0 -5.36816 These are signs of hope and relief . +1 -6.23177 These are the light of hope and relief . +2 -7.7914 These are the light of hope and the relief of hope . +``` +- The first column corresponds to the id of the generated sentence; the second column corresponds to the score of the generated sentence (in descending order), where a larger value indicates better quality; the last column corresponds to the generated sentence. +- There are two special tokens: `` denotes the end of a sentence while `` denotes unknown word, i.e., a word not in the training dictionary. + +## Overview of the Model + +This section will introduce Gated Recurrent Unit (GRU), Bi-directional Recurrent Neural Network, the Encoder-Decoder framework used in NMT, attention mechanism, as well as the beam search algorithm. + +### Gated Recurrent Unit (GRU) + +We already introduced RNN and LSTM in the [Sentiment Analysis](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md) chapter. +Compared to a simple RNN, the LSTM added memory cell, input gate, forget gate and output gate. These gates combined with the memory cell greatly improve the ability to handle long-term dependencies. + +GRU\[[2](#references)\] proposed by Cho et al is a simplified LSTM and an extension of a simple RNN. It is shown in the figure below. +A GRU unit has only two gates: +- reset gate: when this gate is closed, the history information is discarded, i.e., the irrelevant historical information has no effect on the future output. +- update gate: it combines the input gate and the forget gate and is used to control the impact of historical information on the hidden output. The historical information is passed over when the update gate is close to 1. + +

+
+Figure 2. A GRU Gate +
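+
+To make the gating concrete, below is a minimal NumPy sketch of a single GRU step, following the update-gate convention of Cho et al.\[[2](#references)\]; the parameter names (`Wz`, `Uz`, etc.) are illustrative only, not part of any API:
+
+```python
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def gru_step(x, h_prev, p):
+    # p: dict of weight matrices; W* act on the input x, U* on the previous state h_prev
+    z = sigmoid(np.dot(p['Wz'], x) + np.dot(p['Uz'], h_prev))  # update gate
+    r = sigmoid(np.dot(p['Wr'], x) + np.dot(p['Ur'], h_prev))  # reset gate
+    h_cand = np.tanh(np.dot(p['Wh'], x) + np.dot(p['Uh'], r * h_prev))  # candidate state
+    return z * h_prev + (1.0 - z) * h_cand  # history is passed over when z is close to 1
+```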

+ +Generally speaking, sequences with short distance dependencies will have an active reset gate while sequences with long distance dependency will have an active update date. +In addition, Chung et al.\[[3](#references)\] have empirically shown that although GRU has less parameters, it has similar performance to LSTM on several different tasks. + +### Bi-directional Recurrent Neural Network + +We already introduced an instance of bi-directional RNN in the [Semantic Role Labeling](https://github.com/PaddlePaddle/book/blob/develop/label_semantic_roles/README.md) chapter. Here we present another bi-directional RNN model with a different architecture proposed by Bengio et al. in \[[2](#references),[4](#references)\]. This model takes a sequence as input and outputs a fixed dimensional feature vector at each step, encoding the context information at the corresponding time step. + +Specifically, this bi-directional RNN processes the input sequence in the original and reverse order respectively, and then concatenates the output feature vectors at each time step as the final output. Thus the output node at each time step contains information from the past and future as context. The figure below shows an unrolled bi-directional RNN. This network contains a forward RNN and backward RNN with six weight matrices: weight matrices from input to forward hidden layer and backward hidden ($W_1, W_3$), weight matrices from hidden to itself ($W_2, W_5$), matrices from forward hidden and backward hidden to output layer ($W_4, W_6$). Note that there are no connections between forward hidden and backward hidden layers. + +

+
+Figure 3. Temporally unrolled bi-directional RNN +
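+
+As a minimal sketch of the idea (illustrative only, reusing a step function such as `gru_step` above), a bi-directional encoder runs one RNN in each direction and concatenates the per-step states:
+
+```python
+import numpy as np
+
+def bi_rnn_encode(xs, h0, step_fwd, step_bwd):
+    # xs: list of input vectors; step_fwd/step_bwd: functions (x, h) -> next state,
+    # e.g. lambda x, h: gru_step(x, h, params) with separate parameters per direction
+    hs_f, h = [], h0
+    for x in xs:                    # forward pass, original order
+        h = step_fwd(x, h)
+        hs_f.append(h)
+    hs_b, h = [], h0
+    for x in reversed(xs):          # backward pass, reversed order
+        h = step_bwd(x, h)
+        hs_b.append(h)
+    hs_b.reverse()                  # realign the backward states with the time steps
+    return [np.concatenate([f, b]) for f, b in zip(hs_f, hs_b)]
+```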

+ +### Encoder-Decoder Framework + +The Encoder-Decoder\[[2](#references)\] framework aims to solve the mapping of a sequence to another sequence, for sequences with arbitrary lengths. The source sequence is encoded into a vector via an encoder, which is then decoded to a target sequence via a decoder by maximizing the predictive probability. Both the encoder and the decoder are typically implemented via RNN. + +

+
+Figure 4. Encoder-Decoder Framework +
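+
+As a minimal illustration of the framework, the sketch below greedily decodes from a fixed encoding $c$. Here `decoder_step` and `output_dist` are stand-ins for the non-linear unit and the softmax layer defined in the Encoder and Decoder subsections that follow, and the token ids match the `bos_id=0`/`eos_id=1` convention used later in this chapter:
+
+```python
+import numpy as np
+
+def greedy_decode(c, z0, decoder_step, output_dist, bos_id=0, eos_id=1, max_len=250):
+    # c: encoding of the source sentence; z0: initial decoder state
+    z, u, out = z0, bos_id, []
+    for _ in range(max_len):
+        z = decoder_step(c, u, z)   # z_{i+1} = phi(c, u_i, z_i)
+        p = output_dist(z)          # p(u_{i+1} | u_{<i+1}, x): a vector over the vocabulary
+        u = int(np.argmax(p))       # greedy choice; beam search keeps the top-k instead
+        if u == eos_id:
+            break
+        out.append(u)
+    return out
+```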

+ +#### Encoder + +There are three steps for encoding a sentence: + +1. One-hot vector representation of a word: Each word $x_i$ in the source sentence $x=\left \{ x_1,x_2,...,x_T \right \}$ is represented as a vector $w_i\epsilon \left \{ 0,1 \right \}^{\left | V \right |},i=1,2,...,T$ where $w_i$ has the same dimensionality as the size of the dictionary, i.e., $\left | V \right |$, and has an element of one at the location corresponding to the location of the word in the dictionary and zero elsewhere. + +2. Word embedding as a representation in the low-dimensional semantic space: There are two problems with one-hot vector representation + + * The dimensionality of the vector is typically large, leading to the curse of dimensionality; + + * It is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\epsilon R^{K\times \left | V \right |}$ as the projection matrix and $K$ is the dimensionality of the word embedding vector. + +3. Encoding of the source sequence via RNN: This can be described mathematically as: + + $$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$$ + + where + $h_0$ is a zero vector, + $\varnothing _\theta$ is a non-linear activation function, and + $\mathbf{h}=\left \{ h_1,..., h_T \right \}$ + is the sequential encoding of the first $T$ words from the source sequence. The vector representation of the whole sentence can be represented as the encoding vector at the last time step $T$ from $\mathbf{h}$, or by temporal pooling over $\mathbf{h}$. + + +Bi-directional RNN can also be used in step (3) for more a complicated sentence encoding. This can be implemented using a bi-directional GRU. Forward GRU encodes the source sequence in its original order $(x_1,x_2,...,x_T)$, and generates a sequence of hidden states $(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$. The backward GRU encodes the source sequence in reverse order, i.e., $(x_T,x_T-1,...,x_1)$ and generates $(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$. Then for each word $x_i$, its complete hidden state is the concatenation of the corresponding hidden states from the two GRUs, i.e., $h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$. + +

+
+Figure 5. Encoder using bi-directional GRU +
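+
+As promised above, here is a small sketch of the projection $s_i=Cw_i$ from step 2; it is illustrative only, with the vocabulary size and embedding dimension matching the `dict_size` and `word_vector_dim` used later in this chapter:
+
+```python
+import numpy as np
+
+vocab_size, emb_dim = 30000, 512
+C = 0.01 * np.random.randn(emb_dim, vocab_size)  # projection matrix C, randomly initialized
+
+word_id = 42                  # location of the word x_i in the dictionary
+w = np.zeros(vocab_size)
+w[word_id] = 1.0              # one-hot vector w_i
+
+s_matmul = np.dot(C, w)       # s_i = C w_i, as defined above
+s_lookup = C[:, word_id]      # the equivalent column lookup that embedding layers implement
+assert np.allclose(s_matmul, s_lookup)
+```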

+ +#### Decoder + +The goal of the decoder is to maximize the probability of the next correct word in the target language. The main idea is as follows: + +1. At each time step $i$, given the encoding vector (or context vector) $c$ of the source sentence, the $i$-th word $u_i$ from the ground-truth target language and the RNN hidden state $z_i$, the next hidden state $z_{i+1}$ is computed as: + + $$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$ + where $\phi _{\theta '}$ is a non-linear activation function and $c=q\mathbf{h}$ is the context vector of the source sentence. Without using [attention](#Attention Mechanism), if the output of the [encoder](#Encoder) is the encoding vector at the last time step of the source sentence, then $c$ can be defined as $c=h_T$. $u_i$ denotes the $i$-th word from the target language sentence and $u_0$ denotes the beginning of the target language sentence (i.e., ``), indicating the beginning of decoding. $z_i$ is the RNN hidden state at time step $i$ and $z_0$ is an all zero vector. + +2. Calculate the probability $p_{i+1}$ for the $i+1$-th word in the target language sequence by normalizing $z_{i+1}$ using `softmax` as follows + + $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ + + where $W_sz_{i+1}+b_z$ scores each possible words and is then normalized via softmax to produce the probability $p_{i+1}$ for the $i+1$-th word. + +3. Compute the cost accoding to $p_{i+1}$ and $u_{i+1}$. +4. Repeat Steps 1-3, until all the words in the target language sentence have been processed. + +The generation process of machine translation is to translate the source sentence into a sentence in the target language according to a pre-trained model. There are some differences between the decoding step in generation and training. Please refer to [Beam Search Algorithm](#Beam Search Algorithm) for details. + +### Attention Mechanism + +There are a few problems with the fixed dimensional vector representation from the encoding stage: + * It is very challenging to encode both the semantic and syntactic information a sentence with a fixed dimensional vector regardless of the length of the sentence. + * Intuitively, when translating a sentence, we typically pay more attention to the parts in the source sentence more relevant to the current translation. Moreover, the focus changes along the process of the translation. With a fixed dimensional vector, all the information from the source sentence is treated equally in terms of attention. This is not reasonable. Therefore, Bahdanau et al. \[[4](#references)\] introduced attention mechanism, which can decode based on different fragments of the context sequence in order to address the difficulty of feature learning for long sentences. Decoder with attention will be explained in the following. + +Different from the simple decoder, $z_i$ is computed as: + +$$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$$ + +It is observed that for each word $u_i$ in the target language sentence, there is a corresponding context vector $c_i$ as the encoding of the source sentence, which is computed as: + +$$c_i=\sum _{j=1}^{T}a_{ij}h_j, a_i=\left[ a_{i1},a_{i2},...,a_{iT}\right ]$$ + +It is noted that the attention mechanism is achieved by a weighted average over the RNN hidden states $h_j$. 
+<p align="center">
+<br/>
+Figure 6. Decoder with Attention Mechanism
+</p>
+ +### Beam Search Algorithm + +[Beam Search](http://en.wikipedia.org/wiki/Beam_search) is a heuristic search algorithm that explores a graph by expanding the most promising node in a limited set. It is typically used when the solution space is huge (e.g., for machine translation, speech recognition), and there is not enough memory for all the possible solutions. For example, if we want to translate “`你好`” into English, even if there are only three words in the dictionary (``, ``, `hello`), it is still possible to generate an infinite number of sentences, where the word `hello` can appear different number of times. Beam search could be used to find a good translation among them. + +Beam search builds a search tree using breadth first search and sorts the nodes according to a heuristic cost (sum of the log probability of the generated words) at each level of the tree. Only a fixed number of nodes according to the pre-specified beam size (or beam width) are considered. Thus, only nodes with highest scores are expanded in the next level. This reduces the space and time requirements significantly. However, a globally optimal solution is not guaranteed. + +The goal is to maximize the probability of the generated sequence when using beam search in decoding, The procedure is as follows: + +1. At each time step $i$, compute the hidden state $z_{i+1}$ of the next time step according to the context vector $c$ of the source sentence, the $i$-th word $u_i$ generated for the target language sentence and the RNN hidden state $z_i$. +2. Normalize $z_{i+1}$ using `softmax` to get the probability $p_{i+1}$ for the $i+1$-th word for the target language sentence. +3. Sample the word $u_{i+1}$ according to $p_{i+1}$. +4. Repeat Steps 1-3, until end-of-sentence token `` is generated or the maximum length of the sentence is reached. + +Note: $z_{i+1}$ and $p_{i+1}$ are computed the same way as in [Decoder](#Decoder). In generation mode, each step is greedy in so there is no guarantee of a global optimum. + +## BLEU Score + +Bilingual Evaluation understudy (BLEU) is a metric widely used for automatic machine translation proposed by IBM Watson Research Center in 2002\[[5](#References)\]. The closer the translation produced by a machine is to the translation produced by a human expert, the better the performance of the translation system. + +To measure the closeness between machine translation and human translation, sentence precision is used. It compares the number of matched n-grams. More matches will lead to higher BLEU scores. + +## Data Preparation + +This tutorial uses a dataset from [WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), where [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) is used as the training set, and [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) is used as test and generation set. + + +### Data Preprocessing + +There are two steps for pre-processing: +- Merge the source and target parallel corpus files into one file + - Merge `XXX.src` and `XXX.trg` file pair as `XXX` + - The $i$-th row in `XXX` is the concatenation of the $i$-th row from `XXX.src` with the $i$-th row from `XXX.trg`, separated with '\t'. 
+
+### A Subset of Dataset
+
+Because the full dataset is very big, the PaddlePaddle package `paddle.dataset.wmt14` provides a preprocessed [subset of the dataset](http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz) to reduce the time needed for downloading.
+
+This subset has 193319 instances of training data and 6003 instances of test data. The dictionary size is 30000. Because of the limited size of this subset, the effectiveness of a model trained on it is not guaranteed.
+
+## Training Instructions
+
+### Initialize PaddlePaddle
+
+```python
+import sys
+import paddle.v2 as paddle
+
+# train with a single CPU
+paddle.init(use_gpu=False, trainer_count=1)
+# False: training, True: generating
+is_generating = False
+```
+
+### Model Configuration
+
+1. Define some global variables
+
+   ```python
+   dict_size = 30000 # dict dim
+   source_dict_dim = dict_size # source language dictionary size
+   target_dict_dim = dict_size # destination language dictionary size
+   word_vector_dim = 512 # word embedding dimension
+   encoder_size = 512 # hidden layer size of GRU in encoder
+   decoder_size = 512 # hidden layer size of GRU in decoder
+   beam_size = 3 # expand width in beam search
+   max_length = 250 # a stop condition of sequence generation
+   ```
+
+2. Implement Encoder as follows:
+   - Input is a sequence of words represented by an integer word index sequence. So we define a data layer of data type `integer_value_sequence`. The value range of each element in the sequence is `[0, source_dict_dim)`
+
+     ```python
+     src_word_id = paddle.layer.data(
+         name='source_language_word',
+         type=paddle.data_type.integer_value_sequence(source_dict_dim))
+     ```
+
+   - Map the one-hot vector (represented by word index) into a word vector $\mathbf{s}$ in a low-dimensional semantic space
+
+     ```python
+     src_embedding = paddle.layer.embedding(
+         input=src_word_id, size=word_vector_dim)
+     ```
+
+   - Use a bi-directional GRU to encode the source language sequence, and concatenate the encoding outputs from the two GRUs to get $\mathbf{h}$
+
+     ```python
+     src_forward = paddle.networks.simple_gru(
+         input=src_embedding, size=encoder_size)
+     src_backward = paddle.networks.simple_gru(
+         input=src_embedding, size=encoder_size, reverse=True)
+     encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+     ```
+
+3. Implement Attention-based Decoder as follows:
+
+   - Get a projection of the encoding (c.f. 2.3) of the source language sequence by passing it into a feed-forward neural network
+
+     ```python
+     encoded_proj = paddle.layer.fc(
+         act=paddle.activation.Linear(),
+         size=decoder_size,
+         bias_attr=False,
+         input=encoded_vector)
+     ```
+
+   - Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN $c_0=h_T$
+
+     ```python
+     backward_first = paddle.layer.first_seq(input=src_backward)
+     decoder_boot = paddle.layer.fc(
+         size=decoder_size,
+         act=paddle.activation.Tanh(),
+         bias_attr=False,
+         input=backward_first)
+     ```
+
+   - Define the computation in each time step for the decoder RNN, i.e., according to the current context vector $c_i$, the hidden state of the decoder $z_i$ and the $i$-th word $u_i$ in the target language, predict the probability $p_{i+1}$ for the $i+1$-th word.
+
+     - decoder_mem records the hidden state $z_i$ from the previous time step, with an initial state as decoder_boot.
+     - context is computed via `simple_attention` as $c_i=\sum_{j=1}^{T}a_{ij}h_j$, where enc_vec is $h_j$ and enc_proj is the projection of $h_j$ (c.f. 3.1). $a_{ij}$ is calculated within `simple_attention`.
+     - decoder_inputs fuses $c_i$ with the representation of the current_word (i.e., $u_i$).
+     - gru_step uses the `gru_step_layer` function to compute $z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$.
+     - Softmax normalization is used in the end to compute the probability of words, i.e., $p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$. The output is returned.
+
+     ```python
+     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+         decoder_mem = paddle.layer.memory(
+             name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+         context = paddle.networks.simple_attention(
+             encoded_sequence=enc_vec,
+             encoded_proj=enc_proj,
+             decoder_state=decoder_mem)
+
+         decoder_inputs = paddle.layer.fc(
+             act=paddle.activation.Linear(),
+             size=decoder_size * 3,
+             bias_attr=False,
+             input=[context, current_word],
+             layer_attr=paddle.attr.ExtraLayerAttribute(
+                 error_clipping_threshold=100.0))
+
+         gru_step = paddle.layer.gru_step(
+             name='gru_decoder',
+             input=decoder_inputs,
+             output_mem=decoder_mem,
+             size=decoder_size)
+
+         out = paddle.layer.fc(
+             size=target_dict_dim,
+             bias_attr=True,
+             act=paddle.activation.Softmax(),
+             input=gru_step)
+         return out
+     ```
+
+4. Define the name for the decoder and the first two inputs for `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to the [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
+
+   ```python
+   decoder_group_name = "decoder_group"
+   group_input1 = paddle.layer.StaticInput(input=encoded_vector)
+   group_input2 = paddle.layer.StaticInput(input=encoded_proj)
+   group_inputs = [group_input1, group_input2]
+   ```
+
+5. Training mode:
+
+   - Word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
+   - `recurrent_group` calls `gru_decoder_with_attention` in a recurrent way.
+   - The sequence of next words from the target language is used as label (lbl).
+   - Multi-class cross-entropy (`classification_cost`) is used to calculate the cost.
+
+   ```python
+   if not is_generating:
+       trg_embedding = paddle.layer.embedding(
+           input=paddle.layer.data(
+               name='target_language_word',
+               type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+           size=word_vector_dim,
+           param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+       group_inputs.append(trg_embedding)
+
+       # For decoder equipped with attention mechanism, in training,
+       # target embedding (the ground truth) is the data input,
+       # while the encoded source sequence is accessed as an unbounded memory.
+       # Here, the StaticInput defines a read-only memory
+       # for the recurrent_group.
+       decoder = paddle.layer.recurrent_group(
+           name=decoder_group_name,
+           step=gru_decoder_with_attention,
+           input=group_inputs)
+
+       lbl = paddle.layer.data(
+           name='target_language_next_word',
+           type=paddle.data_type.integer_value_sequence(target_dict_dim))
+       cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+   ```
+
+6. Generating mode:
+
+   - The decoder predicts the next target word based on the last generated target word. The embedding of the last generated word is automatically retrieved by `GeneratedInput`.
+   - `beam_search` calls `gru_decoder_with_attention` in a recurrent way, to predict sequence ids.
+
+   ```python
+   if is_generating:
+       # In generation, the decoder predicts a next target word based on
+       # the encoded source sequence and the previous generated target word.
+
+       # The encoded source sequence (encoder's output) must be specified by
+       # StaticInput, which is a read-only memory.
+       # Embedding of the previous generated word is automatically retrieved
+       # by GeneratedInput initialized by a start mark `<s>`.
+
+       trg_embedding = paddle.layer.GeneratedInput(
+           size=target_dict_dim,
+           embedding_name='_target_language_embedding',
+           embedding_size=word_vector_dim)
+       group_inputs.append(trg_embedding)
+
+       beam_gen = paddle.layer.beam_search(
+           name=decoder_group_name,
+           step=gru_decoder_with_attention,
+           input=group_inputs,
+           bos_id=0,
+           eos_id=1,
+           beam_size=beam_size,
+           max_length=max_length)
+   ```
+
+Note: Our configuration is based on Bahdanau et al. \[[4](#references)\] but with a few simplifications. Please refer to [issue #1133](https://github.com/PaddlePaddle/Paddle/issues/1133) for more details.
+
+## Model Training
+
+1. Create Parameters
+
+   Create all the parameters that the `cost` layer needs, and print their names. If a parameter name is not specified during model configuration, a default name is generated.
+
+   ```python
+   if not is_generating:
+       parameters = paddle.parameters.create(cost)
+       for param in parameters.keys():
+           print param
+   ```
+
+2. Define DataSet
+
+   Create a [**data reader**](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader#python-data-reader-design-doc) for the WMT-14 dataset.
+
+   ```python
+   if not is_generating:
+       wmt14_reader = paddle.batch(
+           paddle.reader.shuffle(
+               paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
+           batch_size=5)
+   ```
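+
+   Any callable following the same reader protocol could replace `paddle.dataset.wmt14.train` here. As a hedged illustration (the word ids below are made up; 0 and 1 are the `<s>`/`<e>` ids used by this configuration), a toy reader could look like:
+
+   ```python
+   def toy_reader():
+       # one sample per tuple: (source ids, current target ids, next target ids),
+       # matching the 'source_language_word', 'target_language_word' and
+       # 'target_language_next_word' data layers defined above
+       samples = [([0, 4, 5, 1], [0, 7, 8], [7, 8, 1])]
+
+       def reader():
+           for sample in samples:
+               yield sample
+
+       return reader
+
+   # e.g. paddle.batch(paddle.reader.shuffle(toy_reader(), buf_size=8192), batch_size=5)
+   ```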
+
+3. Create trainer
+
+   We need to tell the trainer what to optimize, and how to optimize. Here the trainer performs mini-batch stochastic gradient training on the `cost` layer, with Adam as the update rule.
+
+   ```python
+   if not is_generating:
+       optimizer = paddle.optimizer.Adam(
+           learning_rate=5e-5,
+           regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+       trainer = paddle.trainer.SGD(cost=cost,
+                                    parameters=parameters,
+                                    update_equation=optimizer)
+   ```
+
+4. Define event handler
+
+   The event handler is a callback function invoked by the trainer when an event happens. Here we print training logs in the event handler.
+
+   ```python
+   if not is_generating:
+       def event_handler(event):
+           if isinstance(event, paddle.event.EndIteration):
+               if event.batch_id % 2 == 0:
+                   print "\nPass %d, Batch %d, Cost %f, %s" % (
+                       event.pass_id, event.batch_id, event.cost, event.metrics)
+   ```
+
+5. Start training
+
+   ```python
+   if not is_generating:
+       trainer.train(
+           reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+   ```
+
+   The training log is as follows:
+   ```text
+   Pass 0, Batch 0, Cost 247.408008, {'classification_error_evaluator': 1.0}
+   Pass 0, Batch 10, Cost 212.058789, {'classification_error_evaluator': 0.8737863898277283}
+   ...
+   ```
+
+## Model Usage
+
+1. Download Pre-trained Model
+
+   As the training of an NMT model is very time consuming, we provide a pre-trained model. The model is trained with a cluster of 50 physical nodes (each node has two 6-core CPUs) over 5 days. The provided model has a [BLEU Score](#BLEU Score) of 26.92 and a size of 205MB.
+
+   ```python
+   if is_generating:
+       parameters = paddle.dataset.wmt14.model()
+   ```
+2. Define DataSet
+
+   Get the first 3 samples of the wmt14 generating set as the source language sequences.
+
+   ```python
+   if is_generating:
+       gen_creator = paddle.dataset.wmt14.gen(dict_size)
+       gen_data = []
+       gen_num = 3
+       for item in gen_creator():
+           gen_data.append((item[0], ))
+           if len(gen_data) == gen_num:
+               break
+   ```
+
+3. Create infer
+
+   Use the inference interface `paddle.infer` to return the prediction probability (see field `prob`) and labels (see field `id`) of each generated sequence.
+
+   ```python
+   if is_generating:
+       beam_result = paddle.infer(
+           output_layer=beam_gen,
+           parameters=parameters,
+           input=gen_data,
+           field=['prob', 'id'])
+   ```
+4. Print generated translation
+
+   Print each source sequence and its `beam_size` translation results based on the dictionaries.
+
+   ```python
+   if is_generating:
+       # load the dictionary
+       src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+       gen_sen_idx = np.where(beam_result[1] == -1)[0]
+       assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+       # -1 is the delimiter of generated sequences.
+       # the first element of each generated sequence is its length.
+       start_pos, end_pos = 1, 0
+       for i, sample in enumerate(gen_data):
+           print(" ".join([src_dict[w] for w in sample[0][1:-1]]))
+           for j in xrange(beam_size):
+               end_pos = gen_sen_idx[i * beam_size + j]
+               print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                   trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+               start_pos = end_pos + 2
+           print("\n")
+   ```
+
+   The generating log is as follows:
+   ```text
+   Les <unk> se <unk> au sujet de la largeur des sièges alors que de grosses commandes sont en jeu
+   -19.0196        The <unk> will be rotated about the width of the seats , while large orders are at stake .
+   -19.1131        The <unk> will be rotated about the width of the seats , while large commands are at stake .
+   -19.5129        The <unk> will be rotated about the width of the seats , while large commands are at play .
+   ```
+
+## Summary
+
+End-to-end neural machine translation is a recently developed way to perform machine translation. In this chapter, we introduced the typical Encoder-Decoder framework and the attention mechanism. Since NMT is a typical Sequence-to-Sequence (Seq2Seq) learning problem, Seq2Seq tasks such as query rewriting, summarization and single-turn dialogue can all be solved with the model presented in this chapter.
+
+## References
+
+1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009.
+2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
+3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014.
+4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015.
+5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318.
+
+<br/>
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License. diff --git a/machine_translation/image/bi_rnn.png b/08.machine_translation/image/bi_rnn.png similarity index 100% rename from machine_translation/image/bi_rnn.png rename to 08.machine_translation/image/bi_rnn.png diff --git a/machine_translation/image/bi_rnn_en.png b/08.machine_translation/image/bi_rnn_en.png similarity index 100% rename from machine_translation/image/bi_rnn_en.png rename to 08.machine_translation/image/bi_rnn_en.png diff --git a/machine_translation/image/decoder_attention.png b/08.machine_translation/image/decoder_attention.png similarity index 100% rename from machine_translation/image/decoder_attention.png rename to 08.machine_translation/image/decoder_attention.png diff --git a/machine_translation/image/decoder_attention_en.png b/08.machine_translation/image/decoder_attention_en.png similarity index 100% rename from machine_translation/image/decoder_attention_en.png rename to 08.machine_translation/image/decoder_attention_en.png diff --git a/machine_translation/image/encoder_attention.png b/08.machine_translation/image/encoder_attention.png similarity index 100% rename from machine_translation/image/encoder_attention.png rename to 08.machine_translation/image/encoder_attention.png diff --git a/machine_translation/image/encoder_attention_en.png b/08.machine_translation/image/encoder_attention_en.png similarity index 100% rename from machine_translation/image/encoder_attention_en.png rename to 08.machine_translation/image/encoder_attention_en.png diff --git a/machine_translation/image/encoder_decoder.png b/08.machine_translation/image/encoder_decoder.png similarity index 100% rename from machine_translation/image/encoder_decoder.png rename to 08.machine_translation/image/encoder_decoder.png diff --git a/machine_translation/image/encoder_decoder_en.png b/08.machine_translation/image/encoder_decoder_en.png similarity index 100% rename from machine_translation/image/encoder_decoder_en.png rename to 08.machine_translation/image/encoder_decoder_en.png diff --git a/machine_translation/image/gru.png b/08.machine_translation/image/gru.png similarity index 100% rename from machine_translation/image/gru.png rename to 08.machine_translation/image/gru.png diff --git a/machine_translation/image/gru_en.png b/08.machine_translation/image/gru_en.png similarity index 100% rename from machine_translation/image/gru_en.png rename to 08.machine_translation/image/gru_en.png diff --git a/machine_translation/image/nmt.png b/08.machine_translation/image/nmt.png similarity index 100% rename from machine_translation/image/nmt.png rename to 08.machine_translation/image/nmt.png diff --git a/machine_translation/image/nmt_en.png b/08.machine_translation/image/nmt_en.png similarity index 100% rename from machine_translation/image/nmt_en.png rename to 08.machine_translation/image/nmt_en.png diff --git a/08.machine_translation/index.cn.html b/08.machine_translation/index.cn.html new file mode 100644 index 0000000000000000000000000000000000000000..41bc3d8451ac26f3bfc843dd6981811a4373f8a7 --- /dev/null +++ b/08.machine_translation/index.cn.html @@ -0,0 +1,576 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/08.machine_translation/index.html b/08.machine_translation/index.html new file mode 100644 index 0000000000000000000000000000000000000000..ccff2408a160ff7e8890c9f057d6c6dbb247f726 --- /dev/null +++ b/08.machine_translation/index.html @@ -0,0 +1,619 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/08.machine_translation/train.py b/08.machine_translation/train.py new file mode 100644 index 0000000000000000000000000000000000000000..30817328f3dd4eaab0aa3f2514ec94a9489ac110 --- /dev/null +++ b/08.machine_translation/train.py @@ -0,0 +1,235 @@ +import sys, os +import numpy as np +import paddle.v2 as paddle + +with_gpu = os.getenv('WITH_GPU', '0') != '0' + + +def save_model(trainer, parameters, save_path): + with open(save_path, 'w') as f: + trainer.save_parameter_to_tar(f) + + +def seq_to_seq_net(source_dict_dim, + target_dict_dim, + is_generating, + beam_size=3, + max_length=250): + ### Network Architecture + word_vector_dim = 512 # dimension of word vector + decoder_size = 512 # dimension of hidden unit of GRU decoder + encoder_size = 512 # dimension of hidden unit of GRU encoder + + #### Encoder + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_embedding = paddle.layer.embedding( + input=src_word_id, size=word_vector_dim) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + + #### Decoder + encoded_proj = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size, + bias_attr=False, + input=encoded_vector) + + backward_first = paddle.layer.first_seq(input=src_backward) + + decoder_boot = paddle.layer.fc( + size=decoder_size, + act=paddle.activation.Tanh(), + bias_attr=False, + input=backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + decoder_inputs = paddle.layer.fc( + act=paddle.activation.Linear(), + size=decoder_size * 3, + bias_attr=False, + input=[context, current_word], + layer_attr=paddle.attr.ExtraLayerAttribute( + error_clipping_threshold=100.0)) + + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + out = paddle.layer.fc( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=gru_step) + return out + + decoder_group_name = 'decoder_group' + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_input2 = paddle.layer.StaticInput(input=encoded_proj) + group_inputs = [group_input1, group_input2] + + if not is_generating: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For decoder equipped with attention mechanism, in training, + # target embeding (the groudtruth) is the data input, + # while encoded source sequence is accessed to as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. 
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts a next target word based on
+        # the encoded source sequence and the previous generated target word.
+
+        # The encoded source sequence (encoder's output) must be specified by
+        # StaticInput, which is a read-only memory.
+        # Embedding of the previous generated word is automatically retrieved
+        # by GeneratedInput initialized by a start mark `<s>`.
+
+        trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
+
+
+def main():
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
+    is_generating = False
+
+    # source and target dict dim.
+    dict_size = 30000
+    source_dict_dim = target_dict_dim = dict_size
+
+    # train the network
+    if not is_generating:
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+        cost = seq_to_seq_net(source_dict_dim, target_dict_dim, is_generating)
+        parameters = paddle.parameters.create(cost)
+
+        trainer = paddle.trainer.SGD(
+            cost=cost, parameters=parameters, update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=4)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print("\nPass %d, Batch %d, Cost %f, %s" %
+                          (event.pass_id, event.batch_id, event.cost,
+                           event.metrics))
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+                if not event.batch_id % 10:
+                    save_path = 'params_pass_%05d_batch_%05d.tar' % (
+                        event.pass_id, event.batch_id)
+                    save_model(trainer, parameters, save_path)
+
+            if isinstance(event, paddle.event.EndPass):
+                # save parameters
+                save_path = 'params_pass_%05d.tar' % (event.pass_id)
+                save_model(trainer, parameters, save_path)
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # generate English sequences from the French source sentences
+    else:
+        # use the first 3 samples for generation
+        gen_data = []
+        gen_num = 3
+        for item in paddle.dataset.wmt14.gen(dict_size)():
+            gen_data.append([item[0]])
+            if len(gen_data) == gen_num:
+                break
+
+        beam_size = 3
+        beam_gen = seq_to_seq_net(source_dict_dim, target_dict_dim,
+                                  is_generating, beam_size)
+
+        # get the trained model, whose bleu = 26.92
+        parameters = paddle.dataset.wmt14.model()
+
+        # prob is the prediction probabilities, and id is the predicted word ids.
+        beam_result = paddle.infer(
+            output_layer=beam_gen,
+            parameters=parameters,
+            input=gen_data,
+            field=['prob', 'id'])
+
+        # load the dictionary
+        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+        gen_sen_idx = np.where(beam_result[1] == -1)[0]
+        assert len(gen_sen_idx) == len(gen_data) * beam_size
+
+        # -1 is the delimiter of generated sequences.
+ # the first element of each generated sequence its length. + start_pos, end_pos = 1, 0 + for i, sample in enumerate(gen_data): + print( + " ".join([src_dict[w] for w in sample[0][1:-1]]) + ) # skip the start and ending mark when printing the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + + +if __name__ == '__main__': + main() diff --git a/README.cn.md b/README.cn.md new file mode 100644 index 0000000000000000000000000000000000000000..3eb4cefef0cdfad464c454162b75fd76d1d640a3 --- /dev/null +++ b/README.cn.md @@ -0,0 +1,72 @@ +# 深度学习入门 + +[![Build Status](https://travis-ci.org/PaddlePaddle/book.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/book) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://book.paddlepaddle.org/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://book.paddlepaddle.org/index.cn.html) + +1. [新手入门](http://book.paddlepaddle.org/01.fit_a_line/index.cn.html) +1. [识别数字](http://book.paddlepaddle.org/02.recognize_digits/index.cn.html) +1. [图像分类](http://book.paddlepaddle.org/03.image_classification/index.cn.html) +1. [词向量](http://book.paddlepaddle.org/04.word2vec/index.cn.html) +1. [个性化推荐](http://book.paddlepaddle.org/05.recommender_system/index.cn.html) +1. [情感分析](http://book.paddlepaddle.org/06.understand_sentiment/index.cn.html) +1. [语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html) +1. [机器翻译](http://book.paddlepaddle.org/08.machine_translation/index.cn.html) + +更多学习内容请访问PaddlePaddle[视频课堂](http://bit.baidu.com/Course/datalist/column/117.html)。 + +## 运行这本书 + +您现在在看的这本书是一本“交互式”电子书 —— 每一章都可以运行在一个Jupyter Notebook里。 + +我们把Jupyter、PaddlePaddle、以及各种被依赖的软件都打包进一个Docker image了。所以您不需要自己来安装各种软件,只需要安装Docker即可。对于各种Linux发行版,请参考 https://www.docker.com 。如果您使用[Windows](https://www.docker.com/docker-windows)或者[Mac](https://www.docker.com/docker-mac),可以考虑[给Docker更多内存和CPU资源](http://stackoverflow.com/a/39720010/724872)。 + +只需要在命令行窗口里运行: + +```bash +docker run -d -p 8888:8888 paddlepaddle/book +``` + +会从DockerHub.com下载和运行本书的Docker image。阅读和在线编辑本书请在浏览器里访问 http://localhost:8888 。 + +如果您访问DockerHub.com很慢,可以试试我们的另一个镜像docker.paddlepaddle.org: + +```bash +docker run -d -p 8888:8888 docker.paddlepaddle.org/book +``` + +### 使用GPU训练 + +本书默认使用CPU训练,若是要使用GPU训练,使用步骤会稍有变化。为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。请先安装nvidia-docker,之后请运行: + +```bash +nvidia-docker run -d -p 8888:8888 paddlepaddle/book:latest-gpu +``` + +或者使用国内的镜像请运行: + +```bash +nvidia-docker run -d -p 8888:8888 docker.paddlepaddle.org/book:latest-gpu +``` + +还需要将以下代码 +```python +paddle.init(use_gpu=False, trainer_count=1) +``` + +改成: +```python +paddle.init(use_gpu=True, trainer_count=1) +``` + + +## 贡献内容 + +您要是能贡献新的章节那就太好了!请发Pull Requests把您写的章节加入到`/pending`下面的一个子目录里。当这一章稳定下来,我们一起把您的目录挪到根目录。 + +为了写作、运行、调试,您需要安装Python 2.x和Go >1.5, 并可以用[脚本程序](https://github.com/PaddlePaddle/book/blob/develop/.tools/convert-markdown-into-ipynb-and-test.sh)来生成新的Docker image。 + +**Note:** We also provide [English Readme](https://github.com/PaddlePaddle/book/blob/develop/README.md) for PaddlePaddle book. + + +知识共享许可协议
本教程由PaddlePaddle创作,采用 知识共享 署名-相同方式共享 4.0 国际 许可协议进行许可。
diff --git a/README.md b/README.md
index 03d40ec103c2bb4d9449de5ac5c2185611d32c23..1ceba927f4c9e0ca9a142655a565830b19856886 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,72 @@
-# 深度学习入门
-
-1. 新手入门 [[fit_a_line](fit_a_line/)] [[html](http://book.paddlepaddle.org/fit_a_line)]
-1. 识别数字 [[recognize_digits](recognize_digits/)] [[html](http://book.paddlepaddle.org/recognize_digits)]
-1. 图像分类 [[image_classification](image_classification/)] [[html](http://book.paddlepaddle.org/image_classification)]
-1. 词向量 [[word2vec](word2vec/)] [[html](http://book.paddlepaddle.org/word2vec)]
-1. 情感分析 [[understand_sentiment](understand_sentiment/)] [[html](http://book.paddlepaddle.org/understand_sentiment)]
-1. 语义角色标注 [[label_semantic_roles](label_semantic_roles/)] [[html](http://book.paddlepaddle.org/label_semantic_roles)]
-1. 机器翻译 [[machine_translation](machine_translation/)] [[html](http://book.paddlepaddle.org/machine_translation)]
-1. 个性化推荐 [[recommender_system](recommender_system/)] [[html](http://book.paddlepaddle.org/recommender_system)]
-
-<br/>
-知识共享许可协议
本教程由PaddlePaddle创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。
+# Deep Learning with PaddlePaddle
+
+[![Build Status](https://travis-ci.org/PaddlePaddle/book.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/book)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://book.paddlepaddle.org/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://book.paddlepaddle.org/index.cn.html)
+
+1. [Fit a Line](http://book.paddlepaddle.org/01.fit_a_line/)
+1. [Recognize Digits](http://book.paddlepaddle.org/02.recognize_digits/)
+1. [Image Classification](http://book.paddlepaddle.org/03.image_classification/)
+1. [Word to Vector](http://book.paddlepaddle.org/04.word2vec/)
+1. [Recommender System](http://book.paddlepaddle.org/05.recommender_system/)
+1. [Understand Sentiment](http://book.paddlepaddle.org/06.understand_sentiment/)
+1. [Label Semantic Roles](http://book.paddlepaddle.org/07.label_semantic_roles/)
+1. [Machine Translation](http://book.paddlepaddle.org/08.machine_translation/)
+
+## Running the Book
+
+This book you are reading is interactive -- each chapter can run as a Jupyter Notebook.
+
+We packed this book, Jupyter, PaddlePaddle, and all dependencies into a Docker image. So you don't need to install anything except Docker. If you are using Windows, please follow [this installation guide](https://www.docker.com/docker-windows). If you are running Mac, please follow [this one](https://www.docker.com/docker-mac). For various Linux distros, please refer to https://www.docker.com. If you are using Windows or Mac, you might want to give Docker [more memory and CPUs/cores](http://stackoverflow.com/a/39720010/724872).
+
+Just type
+
+```bash
+docker run -d -p 8888:8888 paddlepaddle/book
+
+```
+
+This command will download the pre-built Docker image from DockerHub.com and run it in a container. Please direct your Web browser to http://localhost:8888 to read the book.
+
+If your access to DockerHub.com is slow, you might try our mirror server docker.paddlepaddle.org:
+
+```bash
+docker run -d -p 8888:8888 docker.paddlepaddle.org/book
+
+```
+
+### Training with GPU
+
+By default we use the CPU for training. If you want to train with a GPU, the steps are a little different.
+
+To make sure the GPU can be successfully used from inside the container, please install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). Then run:
+
+```bash
+nvidia-docker run -d -p 8888:8888 paddlepaddle/book:latest-gpu
+
+```
+
+Or you can use the image registry mirror in China:
+
+```bash
+nvidia-docker run -d -p 8888:8888 docker.paddlepaddle.org/book:latest-gpu
+
+```
+
+Change the code in the chapter that you are reading from
+```python
+paddle.init(use_gpu=False, trainer_count=1)
+```
+
+to:
+```python
+paddle.init(use_gpu=True, trainer_count=1)
+```
+
+
+## Contribute
+
+Your contribution is welcome! Please feel free to file Pull Requests to add your chapter as a directory under `/pending`. Once it becomes stable, the community would like to move it to `/`.
+
+To write, run, and debug your chapters, you will need Python 2.x and Go >1.5. You can build the Docker image using [this script](https://github.com/PaddlePaddle/book/blob/develop/.tools/convert-markdown-into-ipynb-and-test.sh).
+This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
diff --git a/build.sh b/build.sh deleted file mode 100755 index 8497a3db15496faba30b245db9189c1916d1f374..0000000000000000000000000000000000000000 --- a/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -for i in $(du -a | grep '\.\/.\+\/README.md' | cut -f 2); do - .tmpl/convert-markdown-into-html.sh $i > $(dirname $i)/index.html -done - -for i in $(du -a | grep '\.\/.\+\/README.en.md' | cut -f 2); do - .tmpl/convert-markdown-into-html.sh $i > $(dirname $i)/index.en.html -done diff --git a/fit_a_line/README.en.md b/fit_a_line/README.en.md deleted file mode 100644 index a804ca9192d4df295bce81d9b95f1c69e9478439..0000000000000000000000000000000000000000 --- a/fit_a_line/README.en.md +++ /dev/null @@ -1,189 +0,0 @@ -# Linear Regression -Let us begin the tutorial with a classical problem called Linear Regression \[[1](#References)\]. In this chapter, we will train a model from a realistic dataset to predict house prices. Some important concepts in Machine Learning will be covered through this example. - -The source code for this tutorial is at [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/fit_a_line). If this is your first time using PaddlePaddle, please refer to the [Install Guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html). - -## Problem -Suppose we have a dataset of $n$ houses. Each house $i$ has $d$ properties and the price $y_i$. A property $x_{i,d}$ describes one aspect of the house, for example, the number of rooms in the house, the number of schools or hospitals in the neighborhood, the nearby traffic condition, etc. Our task is to predict $y_i$ given a set of properties $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price is a linear combination of all the properties, i.e., - -$$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b, i=1,\ldots,n$$ - -where $\omega_{d}$ and $b$ are the model parameters we want to estimate. Once they are learned, given a set of properties of a house, we will be able to predict a price for that house. The model we have here is called Linear Regression, namely, we want to regress a value as a linear combination of several values. In practice this linear model for our problem is hardly true, because the real relationship between the house properties and the price is much more complicated. However, due to its simple formulation which makes the model training and analysis easy, Linear Regression has been applied to lots of real problems. It is always an important topic in many classical Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\]. - -## Results Demonstration -We first show the training result of our model. We use the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) to train a linear model and predict the house prices in Boston. The figure below shows the predictions the model makes for some house prices. The $X$ coordinate of each point represents the median value of the prices of a certain type of houses, while the $Y$ coordinate represents the predicted value by our linear model. When $X=Y$, the point lies exactly on the dotted line. In other words, the more precise the model predicts, the closer the point is to the dotted line. -

-
- Figure 1. Predicted Value V.S. Actual Value -

- -## Model Overview - -### Model Definition - -In the UCI Housing Data Set, there are 13 house properties $x_{i,d}$ that are related to the median house price $y_i$. Thus our model is: - -$$\hat{Y} = \omega_1X_{1} + \omega_2X_{2} + \ldots + \omega_{13}X_{13} + b$$ - -where $\hat{Y}$ is the predicted value used to differentiate from the actual value $Y$. The model parameters to be learned are: $\omega_1, \ldots, \omega_{13}, b$, where $\omega$ are called the weights and $b$ is called the bias. - -Now we need an optimization goal, so that with the learned parameters, $\hat{Y}$ is close to $Y$ as much as possible. Here we introduce the concept of [Loss Function (Cost Function)](https://en.wikipedia.org/wiki/Loss_function). The Loss Function has such property: given any pair of the actual value $y_i$ and the predicted value $\hat{y_i}$, its output is always non-negative. This non-negative value reflects the model error. - -For Linear Regression, the most common Loss Function is [Mean Square Error (MSE)](https://en.wikipedia.org/wiki/Mean_squared_error) which has the following form: - -$$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$ - -For a dataset of size $n$, MSE is the average value of the $n$ predicted errors. - -### Training - -After defining our model, we have several major steps for the training: -1. Initialize the parameters including the weights $\omega$ and the bias $b$. For example, we can set their mean values as 0s, and their standard deviations as 1s. -2. Feedforward to compute the network output and the Loss Function. -3. Backward to [backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors. -4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of repeats is reached. - -## Data Preparation -Follow the command below to prepare data: -```bash -cd data && python prepare_data.py -``` -This line of code will download the dataset from the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) and perform some [preprocessing](#Preprocessing). The dataset is split into a training set and a test set. - -The dataset contains 506 lines in total, each line describing the properties and the median price of a certain type of houses in Boston. The meaning of each line is below: - - -| Property Name | Explanation | Data Type | -| ------| ------ | ------ | -| CRIM | per capita crime rate by town | Continuous| -| ZN | proportion of residential land zoned for lots over 25,000 sq.ft. 
| Continuous |
-| INDUS | proportion of non-retail business acres per town | Continuous |
-| CHAS | Charles River dummy variable | Discrete, 1 if tract bounds river; 0 otherwise |
-| NOX | nitric oxides concentration (parts per 10 million) | Continuous |
-| RM | average number of rooms per dwelling | Continuous |
-| AGE | proportion of owner-occupied units built prior to 1940 | Continuous |
-| DIS | weighted distances to five Boston employment centres | Continuous |
-| RAD | index of accessibility to radial highways | Continuous |
-| TAX | full-value property-tax rate per $10,000 | Continuous |
-| PTRATIO | pupil-teacher ratio by town | Continuous |
-| B | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town | Continuous |
-| LSTAT | % lower status of the population | Continuous |
-| MEDV | Median value of owner-occupied homes in $1000's | Continuous |
-
-The last entry, MEDV, is the median house price we want to predict.
-
-### Preprocessing
-#### Continuous and Discrete Data
-We define a feature vector of length 13 for each house, where each entry corresponds to one property of that house. Among the 13 dimensions, 12 are continuous and 1 is discrete. Note that although a discrete value is also written as digits such as 0, 1, or 2, it has a quite different meaning from a continuous value: the numerical difference between two discrete values has no practical meaning. For example, if we use 0, 1, and 2 to represent `red`, `green`, and `blue` respectively, we cannot conclude from the smaller numerical gap that `green` differs less from `red` than `blue` does. Therefore, when handling a discrete feature with $d$ possible values, we usually convert it into $d$ binary features, each taking the value 0 or 1 to indicate whether the corresponding original value is present. Alternatively, we can map the discrete feature to a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is already a binary value, no such preprocessing is needed.
-
-#### Feature Normalization
-Another observation is that the value ranges of the 13 features differ enormously (Figure 2). For example, feature B lies in [0.32, 396.90] while feature NOX lies in [0.3850, 0.8170]. For effective optimization, we need data normalization, whose goal is to scale every feature into roughly the same value range, for example [-0.5, 0.5]. Here we adopt a standard normalization: subtract the mean value from each feature and divide the result by the feature's original value range.
-
-There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (also called Feature Scaling):
-- A value range that is too large or too small might cause floating-point overflow or underflow during computation.
-- Different value ranges give different features different influence on the model, at least early in training. This is an unjustified assumption that makes optimization harder and increases training time considerably.
-- Many Machine Learning techniques and models (e.g., L1/L2 regularization and the Vector Space Model) assume that all features have roughly zero mean and similar value ranges.

- Figure 2. The value ranges of the features
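The normalization described above is a one-liner in NumPy. The sketch below uses a random stand-in matrix in place of the real data, and mirrors what `prepare_data.py` computes before saving the `.npy` files:

```python
import numpy as np

np.random.seed(0)
data = np.random.rand(506, 13) * 100      # stand-in for the 506x13 feature matrix

maximums = data.max(axis=0)
minimums = data.min(axis=0)
avgs = data.mean(axis=0)

# Subtract each feature's mean and divide by its value range,
# as prepare_data.py does during preprocessing.
normalized = (data - avgs) / (maximums - minimums)
print(normalized.min(axis=0))
print(normalized.max(axis=0))
```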
-
-#### Prepare Training and Test Sets
-We split the dataset into two subsets: one for estimating the model parameters, namely model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) more training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) more test data will decrease the variance of the test error, yielding a more reliable test error. One standard split ratio is $8:2$. You can try different split ratios to observe how the two variances change.
-
-Execute the following command to split the dataset and write the training and test sets into the `train.list` and `test.list` files, so that PaddlePaddle can later read from them:
-```bash
-python prepare_data.py -r 0.8 # 8:2 is the default split ratio
-```
-
-When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process begins. These hyperparameters are not part of the model parameters and cannot be trained using the same Loss Function (e.g., the number of layers in the network). Thus we train several models with different sets of hyperparameters, compare them on the validation set to pick the best one, and finally evaluate the chosen model on the test set. Because our model is relatively simple here, we skip this validation process for now.
-
-### Provide Data to PaddlePaddle
-After the data is prepared, we use a Python Data Provider to provide data to PaddlePaddle. A Data Provider is a Python function that PaddlePaddle calls during training. In this example, the Data Provider only needs to read the data and return it to the training process of PaddlePaddle row by row.
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-import numpy as np
-
-# define the data types and dimensionality of the two inputs
-@provider(input_types=[dense_vector(13), dense_vector(1)])
-def process(settings, input_file):
-    data = np.load(input_file.strip())
-    for row in data:
-        yield row[:-1].tolist(), row[-1:].tolist()
-```
-
-## Model Configuration
-
-### Data Definition
-We first call the function `define_py_data_sources2` to let PaddlePaddle read training and test data from the `dataprovider.py` shown above. PaddlePaddle can accept configuration info from the command line; for example, here we pass a variable named `is_predict` that switches the model between its training and prediction structures.
-```python
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg('is_predict', bool, False)
-
-define_py_data_sources2(
-    train_list='data/train.list',
-    test_list='data/test.list',
-    module='dataprovider',
-    obj='process')
-```
-
-### Algorithm Settings
-Next we set the details of the optimization algorithm. Owing to the simplicity of the Linear Regression model, we only need to set the `batch_size`, which defines how many samples are used per parameter update.
-```python -settings(batch_size=2) -``` - -### Network -Finally, we use `fc_layer` and `LinearActivation` to represent the Linear Regression model. -```python -#input data of 13 dimensional house information -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: #when training, we use MSE (i.e., regression_cost) as the Loss Function - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) #output MSE to view the loss change -else: #during test, output the prediction value - outputs(y_predict) -``` - -## Training Model -We can run the PaddlePaddle command line trainer in the root directory of the code. Here we name the configuration file as `trainer_config.py`. We train 30 passes and save the result in the directory `output`: -```bash -./train.sh -``` - -## Use Model -Now we can use the trained model to do prediction. -```bash -python predict.py -``` -Here by default we use the model in `output/pass-00029` for prediction, and compare the actual house price with the predicted one. The result is shown in `predictions.png`. -If you want to use another model or test on other data, you can pass in a new model path or data path: -```bash -python predict.py -m output/pass-00020 -t data/housing.test.npy -``` - -## Summary -In this chapter, we have introduced the Linear Regression model using the UCI Housing Data Set as an example. We have shown how to train and test this model with PaddlePaddle. Many more complex models and techniques are derived from this simple linear model, thus it is important for us to understand how it works. - - -## References -1. https://en.wikipedia.org/wiki/Linear_regression -2. Friedman J, Hastie T, Tibshirani R. The elements of statistical learning[M]. Springer, Berlin: Springer series in statistics, 2001. -3. Murphy K P. Machine learning: a probabilistic perspective[M]. MIT press, 2012. -4. Bishop C M. Pattern recognition[J]. Machine Learning, 2006, 128. - -
- [Creative Commons License badge]
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/fit_a_line/data/prepare_data.py b/fit_a_line/data/prepare_data.py deleted file mode 100644 index 4a3782752c7964a7203e7e78afe1d36cd003037a..0000000000000000000000000000000000000000 --- a/fit_a_line/data/prepare_data.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from collections import Counter -from urllib2 import urlopen -import argparse -import os -import random -import logging - -import numpy as np - -logging.basicConfig(level=logging.INFO) -data_url = 'https://archive.ics.uci.edu/ml/machine' \ - '-learning-databases/housing/housing.data' -raw_data = 'housing.data' -train_data = 'housing.train.npy' -test_data = 'housing.test.npy' -feature_names = [ - 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT' -] -root_dir = os.path.abspath(os.pardir) - - -def maybe_download(url, file_path): - if not os.path.exists(file_path): - logging.info('data doesn\'t exist on %s, download from [%s]' % - (file_path, url)) - resp = urlopen(url).read() - with open(file_path, 'w') as f: - f.write(resp) - - logging.info('got raw housing data') - - -def save_list(): - with open('train.list', 'w') as f: - f.write('data/' + train_data + '\n') - with open('test.list', 'w') as f: - f.write('data/' + test_data + '\n') - - -def feature_range(maximums, minimums): - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig, ax = plt.subplots() - feature_num = len(maximums) - ax.bar(range(feature_num), maximums - minimums, color='r', align='center') - ax.set_title('feature scale') - plt.xticks(range(feature_num), feature_names) - plt.xlim([-1, feature_num]) - fig.set_figheight(6) - fig.set_figwidth(10) - fig.savefig('%s/image/ranges.png' % root_dir, dpi=48) - plt.close(fig) - - -def preprocess(file_path, feature_num=14, shuffle=False, ratio=0.8): - data = np.fromfile(file_path, sep=' ') - data = data.reshape(data.shape[0] / feature_num, feature_num) - maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( - axis=0) / data.shape[0] - feature_range(maximums[:-1], minimums[:-1]) - for i in xrange(feature_num - 1): - data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) - if shuffle: - np.random.shuffle(data) - offset = int(data.shape[0] * ratio) - np.save(train_data, data[:offset]) - logging.info('saved training data to %s' % train_data) - np.save(test_data, data[offset:]) - logging.info('saved test data to %s' % test_data) - save_list() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='download boston housing price data set and preprocess the data(normalization and split dataset)' - ) - parser.add_argument( - '-r', - '--ratio', - dest='ratio', - default='0.8', - help='ratio of data used for training') - parser.add_argument( - '-s', - '--shuffle', - dest='shuffle', - default='0', - choices={'1', '0'}, - help='shuffle the data 
before splitting, 1=shuffle, 0=do not shuffle') - args = parser.parse_args() - - maybe_download(data_url, raw_data) - preprocess(raw_data, shuffle=int(args.shuffle), ratio=float(args.ratio)) diff --git a/fit_a_line/dataprovider.py b/fit_a_line/dataprovider.py deleted file mode 100644 index f93fe4cafb470c21ac7cf1bd0f34b9fd676856dc..0000000000000000000000000000000000000000 --- a/fit_a_line/dataprovider.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import numpy as np - - -# define data types of input -@provider(input_types=[dense_vector(13), dense_vector(1)]) -def process(settings, input_file): - data = np.load(input_file.strip()) - for row in data: - yield row[:-1].tolist(), row[-1:].tolist() diff --git a/fit_a_line/image/ranges.png b/fit_a_line/image/ranges.png deleted file mode 100644 index 3661ace169ad2bbda6355cff615b771108b258de..0000000000000000000000000000000000000000 Binary files a/fit_a_line/image/ranges.png and /dev/null differ diff --git a/fit_a_line/index.en.html b/fit_a_line/index.en.html deleted file mode 100644 index b2492b2c8d0ab1126ba444acc669102bc02ebdfb..0000000000000000000000000000000000000000 --- a/fit_a_line/index.en.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/fit_a_line/predict.py b/fit_a_line/predict.py deleted file mode 100644 index 0afbf76099435e8c8680cefa92f12afa53c127a4..0000000000000000000000000000000000000000 --- a/fit_a_line/predict.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import argparse -import numpy as np -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import * -from paddle.trainer.config_parser import parse_config - -logging.basicConfig(level=logging.INFO) - - -def predict(input_file, model_dir): - # prepare PaddlePaddle environment, load models - swig_paddle.initPaddle("--use_gpu=0") - conf = parse_config('trainer_config.py', 'is_predict=1') - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - network.loadParameters(model_dir) - slots = [dense_vector(13)] - converter = DataProviderConverter(slots) - - data = np.load(input_file) - ys = [] - for row in data: - result = network.forwardTest(converter([[row[:-1].tolist()]])) - y_true = row[-1:].tolist()[0] - y_predict = result[0]['value'][0][0] - ys.append([y_true, y_predict]) - - ys = np.matrix(ys) - avg_err = np.average(np.square((ys[:, 0] - ys[:, 1]))) - logging.info('MSE of test set is %f' % avg_err) - - # draw a scatter plot - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig, ax = plt.subplots() - - ax.scatter(ys[:, 0], ys[:, 1]) - y_range = [ys[:, 0].min(), ys[:, 0].max()] - ax.plot(y_range, y_range, 'k--', lw=4) - ax.set_xlabel('True ($1000)') - ax.set_ylabel('Predicted ($1000)') - ax.set_title('Predictions on boston housing price') - fig.savefig('image/predictions.png', dpi=60) - plt.close(fig) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='predict house price and save the result as image.') - parser.add_argument( - '-m', - '--model', - dest='model', - default='output/pass-00029', - help='model path') - parser.add_argument( - '-t', - '--test_data', - dest='test_data', - default='data/housing.test.npy', - help='test data path') - args = parser.parse_args() - - predict(input_file=args.test_data, model_dir=args.model) diff --git a/fit_a_line/train.sh b/fit_a_line/train.sh deleted file mode 100755 index 7fd01321145fb7d0748945e99096dc7c180eb206..0000000000000000000000000000000000000000 --- a/fit_a_line/train.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 diff --git a/fit_a_line/trainer_config.py b/fit_a_line/trainer_config.py deleted file mode 100644 index 347cbcef760111cd1b2f10b30d55b04011e16425..0000000000000000000000000000000000000000 --- a/fit_a_line/trainer_config.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg('is_predict', bool, False) - -# 1. read data -define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process') - -# 2. learning algorithm -settings(batch_size=2) - -# 3. Network configuration - -x = data_layer(name='x', size=13) - -y_predict = fc_layer( - input=x, - param_attr=ParamAttr(name='w'), - size=1, - act=LinearActivation(), - bias_attr=ParamAttr(name='b')) - -if not is_predict: - y = data_layer(name='y', size=1) - cost = regression_cost(input=y_predict, label=y) - outputs(cost) -else: - outputs(y_predict) diff --git a/image_classification/README.en.md b/image_classification/README.en.md deleted file mode 100644 index 26b0caeae0173ffa3b96c138a0e991759dd4d8f6..0000000000000000000000000000000000000000 --- a/image_classification/README.en.md +++ /dev/null @@ -1,550 +0,0 @@ -Image Classification -======================= - -The source code of this chapter is in [book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification). For the first-time users, please refer to PaddlePaddle[Installation Tutorial](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html) for installation instructions. - -## Background - -Comparing to words, images provide more vivid and easier to understand information with more artistic sense. They are important source for people to convey and exchange ideas. In this chapter, we focus on one of the essential problems in image recognition -- image classification. - -Image classification distinguishes images of different categories based on their semantic meaning. It is a core problem in computer vision, and is also the foundation of other higher level computer vision tasks such as object detection, image segmentation, object tracking, action recognition, etc. Image classification has applications in many areas such as face recognition and intelligent video analysis in security systems, traffic scene recognition in transportation systems, content-based image retrieval and automatic photo indexing in web services, image classification in medicine, etc. - -In image classification, we first encode the whole image using handcrafted or learned features, and then determine the object category by a classifier. Therefore, feature extraction plays an important role in image classification. 
Prior to deep learning, the BoW (Bag of Words) model was the most popular method for object classification. BoW was introduced in NLP, where a sentence is represented as a bag of words (words, phrases, or characters) extracted from the training corpus. In the context of image classification, the BoW model requires constructing a dictionary of visual words. The simplest BoW framework consists of three steps: **feature extraction**, **feature encoding**, and **classifier design**.
-
-Deep learning approaches, in contrast, learn hierarchical feature representations automatically, in a supervised or unsupervised fashion, instead of relying on manually crafted or selected features. Convolutional Neural Networks (CNNs) have made significant progress in image classification: they preserve all the image information by taking raw pixels as input, extract low-level and high-level abstract features through convolution operations, and directly output the classification result. This end-to-end learning fashion leads to strong performance and wide applicability.
-
-In this chapter, we focus on deep learning-based image classification methods and explain how to train a CNN model with PaddlePaddle.
-
-## Demonstration
-
-Image classification can be divided into general and fine-grained classification. Figure 1 demonstrates the results of general image classification -- the trained model correctly recognizes the main objects in the images.

-Figure 1. General image classification
-
-Figure 2 demonstrates the results of fine-grained image classification -- flower recognition, which requires the model to correctly identify the exact flower category.

-Figure 2. Fine-grained image classification
-
-A good model should not only recognize objects of different categories correctly, but should also cope with images taken from different viewpoints, under different illumination, and with object distortion or partial occlusion (we collectively call these image disturbances). Figure 3 shows some disturbed images. A good model, like a human, should classify them correctly.

-Figure 3. Disturbed images [22]
-
-## Model Overview
-
-A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [ImageNet](http://image-net.org/), on which many image classification algorithms are evaluated and compared. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is the dataset of the Large Scale Visual Recognition Challenge (ILSVRC) started in 2010. In this chapter, we introduce some image classification models submitted to these competitions.
-
-Before 2012, image classification was typically accomplished with the three steps described in the Background section. A complete model construction usually involved the following stages: low-level feature extraction, feature encoding, spatial constraint or feature clustering, classifier design, and model ensembling.
-
-  1). **Low-level feature extraction**: extracting large numbers of local features at fixed strides and scales. Popular local features include Scale-Invariant Feature Transform (SIFT) [1], Histogram of Oriented Gradients (HOG) [2], Local Binary Patterns (LBP) [3], etc. Multiple feature descriptors are often employed together so as not to miss too much information.
-  2). **Feature encoding**: low-level features contain a large amount of redundancy and noise. To improve the robustness of the representation, a feature transformation is applied to encode the low-level features; this is called feature encoding. Common feature encoding methods include vector quantization [4], sparse coding [5], locality-constrained linear coding [6], and Fisher vector encoding [7].
-  3). **Spatial constraint**: spatial constraint or feature clustering is usually adopted after feature encoding, extracting the maximum or average of each feature dimension within a spatial region. Pyramid feature matching, a popular feature clustering method, divides an image uniformly into patches and performs feature clustering within each patch.
-  4). **Classification**: after the above steps, an image is described by a vector of fixed dimension, which a classifier then maps to a category. Common classifiers include Support Vector Machines (SVM), random forests, etc. Kernel SVMs were the most popular choice and achieved very good performance on traditional image classification tasks.
-
-This pipeline was widely used in the PASCAL VOC image classification competition [18]. [NEC Labs](http://www.nec-labs.com/) won ILSVRC 2010 by employing SIFT and LBP features, two non-linear encoders, and an SVM classifier [8].
-
-AlexNet, the CNN model proposed by Alex Krizhevsky et al. [9], made a breakthrough in ILSVRC 2012. It dramatically outperformed traditional methods and won the ILSVRC 2012 championship; this was also the first time a deep learning method was used for large-scale image classification. Since AlexNet, a series of CNN models have steadily advanced the state of the art on ImageNet, as shown in Figure 4. With deeper and more sophisticated architectures, the Top-5 error rate has dropped to around 3.5%, whereas the error rate of human raters on the same ImageNet data is 5.1%; in other words, deep learning models now surpass human raters at this task.

-Figure 4. Top-5 error rates on ILSVRC image classification
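Before moving on to CNNs, the feature-encoding step of the traditional pipeline above can be made concrete with a small sketch. The following NumPy code performs the simplest encoder mentioned above, vector quantization, with random descriptors and a random codebook standing in for SIFT features and a learned dictionary:

```python
import numpy as np

np.random.seed(0)
descriptors = np.random.normal(size=(200, 128))  # e.g. 200 SIFT-like local descriptors
codebook = np.random.normal(size=(32, 128))      # stand-in dictionary of 32 visual words

# Vector quantization: assign every descriptor to its nearest codeword ...
dists = np.linalg.norm(descriptors[:, None, :] - codebook[None, :, :], axis=2)
assignments = dists.argmin(axis=1)

# ... then pool the assignments into one fixed-length histogram describing the image.
bow_vector = np.bincount(assignments, minlength=len(codebook)).astype('float64')
bow_vector /= bow_vector.sum()
print(bow_vector)
```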
-
-### CNN
-
-Traditional CNNs consist of convolutional and fully-connected layers, and employ a softmax multi-class classifier with the cross-entropy loss. Figure 5 shows a typical CNN. We first introduce its common components.

-Figure 5. A CNN example [20]
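As a quick reference for the classifier and loss just mentioned, here is a minimal NumPy sketch of softmax followed by cross-entropy on a single toy example:

```python
import numpy as np

logits = np.array([2.0, 1.0, 0.1])      # raw scores for 3 classes
label = 0                               # index of the true class

# Softmax: exponentiate and normalize into class probabilities.
probs = np.exp(logits - logits.max())   # subtract the max for numerical stability
probs /= probs.sum()

# Cross-entropy: negative log-probability of the true class.
loss = -np.log(probs[label])
print(loss)
```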
-
-- convolutional layer: applies convolution operations to extract low-level and high-level features, exploiting local correlation and spatial invariance in images.
-
-- pooling layer: down-samples feature maps by taking the local maximum (max-pooling) or average (avg-pooling) of each patch in the feature map. Down-sampling, a common operation in image processing, filters out high-frequency information.
-
-- fully-connected layer: fully connects the neurons of two adjacent layers.
-
-- non-linear activation: convolutional and fully-connected layers are usually followed by a non-linear activation such as Sigmoid, Tanh, or ReLU to enhance the expressive power of the network. ReLU is the most commonly used activation in CNNs.
-
-- Dropout [10]: at each training step, individual nodes are dropped from the network with a certain probability, which improves generalization and helps avoid overfitting.
-
-Because the parameters of every layer are updated during training, the input distribution of each layer keeps changing, which requires careful tuning of hyper-parameters. In 2015, Sergey Ioffe and Christian Szegedy proposed the Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer so that the input distribution of every layer stays relatively stable. BN not only acts as a regularizer, but also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN accelerates training convergence, and it has been widely used in the deeper models that followed.
-
-We introduce the network architectures of VGG, GoogleNet and ResNet in the following sections.
-
-### VGG
-
-The Oxford Visual Geometry Group (VGG) proposed the VGG network in ILSVRC 2014 [11]. The model is deeper and wider than previous architectures. It comprises five main groups of convolution operations, with max-pooling layers between adjacent groups. Each group contains a series of 3x3 convolutional layers; the number of convolution kernels stays the same within a group and increases from 64 in the first group to 512 in the last one. The total number of learnable layers can be 11, 13, 16, or 19, depending on the number of convolutional layers in each group. Figure 6 illustrates a 16-layer VGG. The architecture of VGG is relatively simple and has been adopted by many papers, including the first model to surpass human-level performance on ImageNet [19].

-Figure 6. VGG16 model for ImageNet
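The components listed above are all simple tensor operations. As a sketch, the following NumPy snippet applies ReLU and the 2x2, stride-2 max-pooling that VGG uses between convolution groups to one toy feature map:

```python
import numpy as np

np.random.seed(0)
feature_map = np.random.randn(32, 32)   # one channel of a feature map

relu = np.maximum(feature_map, 0)       # non-linear activation

# 2x2 max-pooling with stride 2: keep the max of each non-overlapping 2x2 patch.
pooled = relu.reshape(16, 2, 16, 2).max(axis=(1, 3))
print(pooled.shape)                     # (16, 16)
```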
-
-### GoogleNet
-
-GoogleNet [12] won the ILSVRC 2014 championship. Before introducing it, let's get familiar with the Network in Network (NIN) model [13], from which GoogleNet borrowed some ideas, and with the Inception blocks from which GoogleNet is built.
-
-The NIN model has two main characteristics: 1) it replaces single-layer linear convolutions with Multi-Layer Perceptron Convolutions (MLPconv). MLPconv is a tiny multi-layer convolutional network that adds several 1x1 convolutional layers after a linear one, enhancing non-linearity. 2) In traditional CNNs, the last few layers are usually fully-connected and carry a large number of parameters. NIN instead ends with a convolutional layer whose feature maps match the number of categories, followed by global average pooling. Replacing the fully-connected layers this way significantly reduces the parameter count.
-
-Figure 7 depicts two Inception blocks. Figure 7(a) is the simplest design, whose output is the concatenation of the features from three convolutional layers and one pooling layer. Its disadvantage is that the pooling layer does not change the number of channels, so the concatenated output grows; after several such blocks, the numbers of outputs and parameters become larger and larger, raising the computational cost. To overcome this drawback, the Inception block in Figure 7(b) employs three 1x1 convolutional layers to reduce the number of channels, while also improving the non-linearity of the network.

-Figure 7. Inception block
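The saving achieved by the 1x1 layers in Figure 7(b) can be verified with simple arithmetic. The sketch below compares the weight count of a direct 3x3 convolution with that of a 1x1-then-3x3 bottleneck; the channel counts are illustrative assumptions, not values from a specific network:

```python
c_in, c_mid, c_out = 256, 64, 256   # made-up channel counts

direct = 3 * 3 * c_in * c_out                              # plain 3x3 convolution
bottleneck = 1 * 1 * c_in * c_mid + 3 * 3 * c_mid * c_out  # 1x1 reduction, then 3x3

print((direct, bottleneck))   # (589824, 163840): roughly 3.6x fewer weights
```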
-
-GoogleNet consists of multiple stacked Inception blocks and, like NIN, ends with an avg-pooling layer in place of traditional fully-connected layers. Unlike NIN, GoogleNet adds one fully-connected layer after the avg-pooling layer to output a vector of category size. Besides these two characteristics, the features from the middle layers of GoogleNet are also very discriminative, so GoogleNet inserts two auxiliary classifiers into the middle of the network to strengthen the gradient and the regularization during backpropagation. The loss function of the whole network is the weighted sum of the losses of these three classifiers.
-
-Figure 8 illustrates the architecture of GoogleNet, which has 22 layers: it starts with three regular convolutional layers, followed by three groups of sub-networks -- the first group contains two Inception blocks, the second five, and the third two. It ends with an average pooling layer and a fully-connected layer.

-Figure 8. GoogleNet[12]
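The combined objective is just a weighted sum of the three classifier losses. A minimal sketch follows; the 0.3 auxiliary weight is an assumption for illustration, not a value taken from this tutorial:

```python
def googlenet_loss(main_loss, aux1_loss, aux2_loss, aux_weight=0.3):
    # Total loss: the main classifier plus down-weighted auxiliary classifiers.
    return main_loss + aux_weight * (aux1_loss + aux2_loss)

print(googlenet_loss(1.2, 1.5, 1.4))   # 1.2 + 0.3 * (1.5 + 1.4) = 2.07
```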
-
-The above model is the first version of GoogleNet, or GoogleNet-v1. GoogleNet-v2 [14] introduced the BN layer; GoogleNet-v3 [16] further factorized some convolutional layers, increasing non-linearity and network depth; GoogleNet-v4 [17] adopted the design ideas of ResNet, introduced in the next section. The evolution from v1 to v4 improved accuracy consistently; we will not go into the details of v2 through v4 here.
-
-### ResNet
-
-Residual Network (ResNet) [15] won the 2015 championships of three ImageNet competitions: image classification, object localization, and object detection. Its authors proposed residual learning to ease the difficulty of training deeper networks, whose accuracy otherwise degrades as depth increases. Building upon the established design ideas of BN, small convolutional kernels, and fully convolutional networks, ResNet reformulates the layers as residual blocks. Each block contains two branches: one directly connects the input to the output, while the other performs two to three convolutions to compute the residual function with respect to the input. The outputs of the two branches are then added up.
-
-Figure 9 illustrates the ResNet building blocks. The left one is the basic block, consisting of two 3x3 convolutional layers with the same number of channels. The right one is a bottleneck block: the first 1x1 convolutional layer reduces the dimension from 256 to 64, and the other 1x1 convolutional layer restores it from 64 to 256, so the input and output channels of the middle 3x3 convolutional layer (64) stay relatively small.

-Figure 9. Residual block
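The residual connection itself is a one-line idea. Below is a NumPy sketch of the basic block on the left of Figure 9, with the learned convolutions abstracted into a placeholder function:

```python
import numpy as np

def conv3x3(x):
    # Stand-in for conv + BN; a real block would apply learned filters here.
    return 0.9 * x

def basic_block(x):
    residual = conv3x3(np.maximum(conv3x3(x), 0))  # two 3x3 convs, ReLU in between
    return np.maximum(x + residual, 0)             # add the shortcut, then ReLU

x = np.random.randn(8, 8)
print(basic_block(x).shape)                        # shape is preserved: (8, 8)
```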
-
-Figure 10 illustrates ResNets with 50, 101, and 152 layers, respectively. All three networks use bottleneck blocks and differ only in how many times each block is repeated. ResNet converges quickly and has been trained successfully with hundreds, even close to a thousand, layers.

-Figure 10. ResNet model for ImageNet
-
-## Data Preparation
-
-### Data description and downloading
-
-Commonly used public datasets for image classification are [CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html), [ImageNet](http://image-net.org/), [COCO](http://mscoco.org/), etc. Those used for fine-grained image classification include [CUB-200-2011](http://www.vision.caltech.edu/visipedia/CUB-200-2011.html), [Stanford Dogs](http://vision.stanford.edu/aditya86/ImageNetDogs/), [Oxford Flowers](http://www.robots.ox.ac.uk/~vgg/data/flowers/), etc. Among them, ImageNet is the largest, and most research results are reported on it, as mentioned in the Model Overview section. The ImageNet data has changed somewhat since 2010; the commonly used ImageNet-2012 dataset contains 1000 categories, with 1,281,167 training images (from 732 to 1200 images per category) and 50,000 validation images (50 per category on average).
-
-Since ImageNet is too large to download and train on efficiently, we use [CIFAR10](https://www.cs.toronto.edu/~kriz/cifar.html) in this tutorial. The CIFAR-10 dataset consists of 60,000 32x32 color images in 10 classes, 6,000 images per class, split into 50,000 training images and 10,000 test images. Figure 11 shows all the classes of CIFAR10 along with 10 images randomly sampled from each.

-Figure 11. CIFAR10 dataset[21]
- -The following command is used for downloading data and calculating the mean image used for data preprocessing. - -```bash -./data/get_data.sh -``` - -### Data provider for PaddlePaddle - -We use Python interface for providing data to PaddlePaddle. The following file dataprovider.py is a complete example for CIFAR10. - -- 'initializer' function performs initialization of dataprovider: loading the mean image, defining two input types -- image and label. - -- 'process' function sends preprocessed data to PaddlePaddle. Data preprocessing performed in this function includes data perturbation, random horizontal flipping, deducting mean image from the raw image. - -```python -import numpy as np -import cPickle -from paddle.trainer.PyDataProvider2 import * - -def initializer(settings, mean_path, is_train, **kwargs): - settings.is_train = is_train - settings.input_size = 3 * 32 * 32 - settings.mean = np.load(mean_path)['mean'] - settings.input_types = { - 'image': dense_vector(settings.input_size), - 'label': integer_value(10) - } - - -@provider(init_hook=initializer, pool_size=50000) -def process(settings, file_list): - with open(file_list, 'r') as fdata: - for fname in fdata: - fo = open(fname.strip(), 'rb') - batch = cPickle.load(fo) - fo.close() - images = batch['data'] - labels = batch['labels'] - for im, lab in zip(images, labels): - if settings.is_train and np.random.randint(2): - im = im.reshape(3, 32, 32) - im = im[:,:,::-1] - im = im.flatten() - im = im - settings.mean - yield { - 'image': im.astype('float32'), - 'label': int(lab) - } -``` - -## Model Config - -### Data Definition - -In model config file, function `define_py_data_sources2` sets argument 'module' to dataprovider file for loading data, 'args' to mean image file. If the config file is used for prediction, then there is no need to set argument 'train_list'. - -```python -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) -if not is_predict: - define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process', - args={'mean_path': 'data/mean.meta'}) -``` - -### Algorithm Settings - -In model config file, function 'settings' specifies optimization algorithm, batch size, learning rate, momentum and L2 regularization. - -```python -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128),) -``` - -The learning rate adjustment policy can be defined with variables `learning_rate_decay_a`($a$), `learning_rate_decay_b`($b$) and `learning_rate_schedule`. In this example, discrete exponential method is used for adjusting learning rate. The formula is as follows, -$$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ -where $n$ is the number of processed samples, $lr_{0}$ is the learning_rate set in 'settings'. - -### Model Architecture - -Here we provide the cofig files for VGG and ResNet models. - -#### VGG - -First we define VGG network. Since the image size and amount of CIFAR10 are relatively small comparing to ImageNet, we uses a small version of VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations. - -1. Define input data and its dimension - - The input to the network is defined as `data_layer`, or image pixels in the context of image classification. 
The images in CIFAR10 are 32x32 color images of three channels. Therefore, the size of the input data is 3072 (3x32x32), and the number of categories is 10. - - ```python - datadim = 3 * 32 * 32 - classdim = 10 - data = data_layer(name='image', size=datadim) - ``` - -2. Define VGG main module - - ```python - net = vgg_bn_drop(data) - ``` - The input to VGG main module is from data layer. `vgg_bn_drop` defines a 16-layer VGG network, with each convolutional layer followed by BN and dropout layers. Here is the definition in detail: - - ```python - def vgg_bn_drop(input, num_channels): - def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): - return img_conv_group( - input=ipt, - num_channels=num_channels_, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = dropout_layer(input=conv5, dropout_rate=0.5) - fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) - bn = batch_norm_layer( - input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) - fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) - return fc2 - - ``` - - 2.1. First defines a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.trainer_config_helpers` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`. - - - 2.2. Five groups of convolutions. The first two groups perform two convolutions, while the last three groups perform three convolutions. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for this layer. - - - 2.3. The last two layers are fully-connected layer of dimension 512. - -3. Define Classifier - - The above VGG network extracts high-level features and maps them to a vector of the same size as the categories. Softmax function or classifier is then used for calculating the probability of the image belonging to each category. - - ```python - out = fc_layer(input=net, size=class_num, act=SoftmaxActivation()) - ``` - -4. Define Loss Function and Outputs - - In the context of supervised learning, labels of training images are defined in `data_layer`, too. During training, cross-entropy is used as loss function and as the output of the network; During testing, the outputs are the probabilities calculated in the classifier. - - ```python - if not is_predict: - lbl = data_layer(name="label", size=class_num) - cost = classification_cost(input=out, label=lbl) - outputs(cost) - else: - outputs(out) - ``` - -### ResNet - -The first, third and forth steps of a ResNet are the same as a VGG. The second one is the main module. - -```python -net = resnet_cifar10(data, depth=56) -``` - -Here are some basic functions used in `resnet_cifar10`: - - - `conv_bn_layer` : convolutional layer followed by BN. - - `shortcut` : the shortcut branch in a residual block. There are two kinds of shortcuts: 1x1 convolution used when the number of channels between input and output are different; direct connection used otherwise. 
- - - `basicblock` : a basic residual module as shown in the left of Figure 9, consisting of two sequential 3x3 convolutions and one "shortcut" branch. - - `bottleneck` : a bottleneck module as shown in the right of Figure 9, consisting of a two 1x1 convolutions with one 3x3 convolution in between branch and a "shortcut" branch. - - `layer_warp` : a group of residual modules consisting of several stacking blocks. In each group, the sliding window size of the first residual block could be different from the rest of blocks, in order to reduce the size of feature maps along horizontal and vertical directions. - -```python -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - active_type=ReluActivation(), - ch_in=None): - tmp = img_conv_layer( - input=input, - filter_size=filter_size, - num_channels=ch_in, - num_filters=ch_out, - stride=stride, - padding=padding, - act=LinearActivation(), - bias_attr=False) - return batch_norm_layer(input=tmp, act=active_type) - - -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation()) - else: - return ipt - -def basicblock(ipt, ch_out, stride): - ch_in = ipt.num_filters - tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[ipt, short], act=ReluActivation()) - -def bottleneck(ipt, ch_out, stride): - ch_in = ipt.num_filter - tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1) - tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[ipt, short], act=ReluActivation()) - -def layer_warp(block_func, ipt, features, count, stride): - tmp = block_func(ipt, features, stride) - for i in range(1, count): - tmp = block_func(tmp, features, 1) - return tmp - -``` - -The following are the components of `resnet_cifar10`: - -1. The lowest level is `conv_bn_layer`. -2. The middle level consists of three `layer_warp`, each of which uses the left residual block in Figure 9. -3. The last level is average pooling layer. - -Note: besides the first convolutional layer and the last fully-connected layer, the total number of layers in three `layer_warp` should be dividable by 6, that is the depth of `resnet_cifar10` should satisfy $(depth - 2) % 6 == 0$. - -```python -def resnet_cifar10(ipt, depth=56): - # depth should be one of 20, 32, 44, 56, 110, 1202 - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - nStages = {16, 64, 128} - conv1 = conv_bn_layer(ipt, - ch_in=3, - ch_out=16, - filter_size=3, - stride=1, - padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = img_pool_layer(input=res3, - pool_size=8, - stride=1, - pool_type=AvgPooling()) - return pool -``` - -## Model Training - -We can train the model by running the script train.sh, which specifies config file, device type, number of threads, number of passes, path to the trained models, etc, - -``` bash -sh train.sh -``` - -Here is an example script `train.sh`: - -```bash -#cfg=models/resnet.py -cfg=models/vgg.py -output=output -log=train.log - -paddle train \ - --config=$cfg \ - --use_gpu=true \ - --trainer_count=1 \ - --log_period=100 \ - --num_passes=300 \ - --save_dir=$output \ - 2>&1 | tee $log -``` - -- `--config=$cfg` : specifies config file. 
The default is `models/vgg.py`.
-- `--use_gpu=true` : trains on GPU. To train on CPU, set it to false.
-- `--trainer_count=1` : specifies the number of threads or GPUs.
-- `--log_period=100` : specifies the number of batches between two logs.
-- `--save_dir=$output` : specifies the path for saving trained models.
-
-Here is an example log after training for one pass. The average classification error rates are 0.79958 on the training set and 0.7858 on the test set.
-
-```text
-TrainerInternal.cpp:165] Batch=300 samples=38400 AvgCost=2.07708 CurrentCost=1.96158 Eval: classification_error_evaluator=0.81151 CurrentEval: classification_error_evaluator=0.789297
-TrainerInternal.cpp:181] Pass=0 Batch=391 samples=50000 AvgCost=2.03348 Eval: classification_error_evaluator=0.79958
-Tester.cpp:115] Test samples=10000 cost=1.99246 Eval: classification_error_evaluator=0.7858
-```
-
-Figure 12 shows the training error rate curve, which indicates that the model converges around Pass 200 with an error rate of 8.54%.

-Figure 12. The error rate of VGG model on CIFAR10
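For reference, the `discexp` schedule set in the Algorithm Settings section can be reproduced directly from its formula; with the values used above, the learning rate shrinks by 10x after every 100 passes over the 50,000 training images:

```python
import math

lr0 = 0.1 / 128.0
a = 0.1
b = 50000 * 100

def discexp(num_samples_processed):
    # lr = lr0 * a^floor(n / b)
    return lr0 * a ** int(math.floor(num_samples_processed / float(b)))

print(discexp(0))          # initial learning rate
print(discexp(5000000))    # after 100 passes over 50000 images: 10x smaller
```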
-
-## Model Application
-
-After training, the model from each pass is saved under `output/pass-%05d`; for example, the model of pass 300 is saved in `output/pass-00299`. The script `classify.py` can be used to classify an image or to extract features from it. Its default config file is `models/vgg.py`.
-
-### Prediction
-
-We can run the following script to predict the category of an image. The default device is GPU; to use the CPU, add `-c`:
-
-```bash
-python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c
-```
-
-Here is the result:
-
-```text
-Label of image/dog.png is: 5
-```
-
-### Feature Extraction
-
-We can run the following command to extract features from an image. Here `job` should be `extract`, and the default layer is the first convolutional layer. Figure 13 visualizes the 64 feature maps output by the first convolutional layer of the VGG model.
-
-```bash
-python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c
-```

-Figure 13. Visualization of convolution layer feature maps
- -## Conclusion - -Traditional image classification methods involve multiple stages of processing and the framework is very complicated. In contrast, CNN models can be trained end-to-end with significant increase of classification accuracy. In this chapter, we introduce three models -- VGG, GoogleNet, ResNet, provide PaddlePaddle config files for training VGG and ResNet on CIFAR10, and explain how to perform prediction and feature extraction using PaddlePaddle API. For other datasets such as ImageNet, the procedure for config and training are the same and you are welcome to give it a try. - - -## Reference - -[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. - -[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. - -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. - -[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. - -[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. - -[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. - -[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). - -[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. - -[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. - -[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. - -[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。 - -[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) - -[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014. - -[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. - -[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. 
- -[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). - -[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). - -[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. - -[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. - -[20] http://deeplearning.net/tutorial/lenet.html - -[21] https://www.cs.toronto.edu/~kriz/cifar.html - -[22] http://cs231n.github.io/classification/ - -
[Creative Commons License badge]
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/image_classification/deprecated/README.md b/image_classification/deprecated/README.md deleted file mode 100644 index f70b819af68e4b10b756e2daaac60202c71cdc7d..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/README.md +++ /dev/null @@ -1,545 +0,0 @@ -图像分类 -======= - -本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 - -## 背景介绍 - -图像相比文字能够提供更加生动、容易理解及更具艺术感的信息,是人们转递与交换信息的重要来源。在本教程中,我们专注于图像识别领域的一个重要问题,即图像分类。 - -图像分类是根据图像的语义信息将不同类别图像区分开来,是计算机视觉中重要的基本问题,也是图像检测、图像分割、物体跟踪、行为分析等其他高层视觉任务的基础。图像分类在很多领域有广泛应用,包括安防领域的人脸识别和智能视频分析等,交通领域的交通场景识别,互联网领域基于内容的图像检索和相册自动归类,医学领域的图像识别等。 - - -一般来说,图像分类通过手工特征或特征学习方法对整个图像进行全部描述,然后使用分类器判别物体类别,因此如何提取图像的特征至关重要。在深度学习算法之前使用较多的是基于词袋(Bag of Words)模型的物体分类方法。词袋方法从自然语言处理中引入,即一句话可以用一个装了词的袋子表示其特征,袋子中的词为句子中的单词、短语或字。对于图像而言,词袋方法需要构建字典。最简单的词袋模型框架可以设计为**底层特征抽取**、**特征编码**、**分类器设计**三个过程。 - -而基于深度学习的图像分类方法,可以通过有监督或无监督的方式**学习**层次化的特征描述,从而取代了手工设计或选择图像特征的工作。深度学习模型中的卷积神经网络(Convolution Neural Network, CNN)近年来在图像领域取得了惊人的成绩,CNN直接利用图像像素信息作为输入,最大程度上保留了输入图像的所有信息,通过卷积操作进行特征的提取和高层抽象,模型输出直接是图像识别的结果。这种基于"输入-输出"直接端到端的学习方法取得了非常好的效果,得到了广泛的应用。 - -本教程主要介绍图像分类的深度学习模型,以及如何使用PaddlePaddle训练CNN模型。 - -## 效果展示 - -图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。 - -

-图1. 通用图像分类展示
- - -图2展示了细粒度图像分类-花卉识别的效果,要求模型可以正确识别花的类别。 - - -

-图2. 细粒度图像分类展示
- - -一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。 - -

-图3. 扰动图片展示[22]
- -## 模型概览 - -图像识别领域大量的研究成果都是建立在[PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/)、[ImageNet](http://image-net.org/)等公开的数据集上,很多图像识别算法通常在这些数据集上进行测试和比较。PASCAL VOC是2005年发起的一个视觉挑战赛,ImageNet是2010年发起的大规模视觉识别竞赛(ILSVRC)的数据集,在本章中我们基于这些竞赛的一些论文介绍图像分类模型。 - -在2012年之前的传统图像分类方法可以用背景描述中提到的三步完成,但通常完整建立图像识别模型一般包括底层特征学习、特征编码、空间约束、分类器设计、模型融合等几个阶段。 - 1). **底层特征提取**: 通常从图像中按照固定步长、尺度提取大量局部特征描述。常用的局部特征包括SIFT(Scale-Invariant Feature Transform, 尺度不变特征转换) \[[1](#参考文献)\]、HOG(Histogram of Oriented Gradient, 方向梯度直方图) \[[2](#参考文献)\]、LBP(Local Bianray Pattern, 局部二值模式) \[[3](#参考文献)\] 等,一般也采用多种特征描述子,防止丢失过多的有用信息。 - 2). **特征编码**: 底层特征中包含了大量冗余与噪声,为了提高特征表达的鲁棒性,需要使用一种特征变换算法对底层特征进行编码,称作特征编码。常用的特征编码包括向量量化编码 \[[4](#参考文献)\]、稀疏编码 \[[5](#参考文献)\]、局部线性约束编码 \[[6](#参考文献)\]、Fisher向量编码 \[[7](#参考文献)\] 等。 - 3). **空间特征约束**: 特征编码之后一般会经过空间特征约束,也称作**特征汇聚**。特征汇聚是指在一个空间范围内,对每一维特征取最大值或者平均值,可以获得一定特征不变形的特征表达。金字塔特征匹配是一种常用的特征聚会方法,这种方法提出将图像均匀分块,在分块内做特征汇聚。 - 4). **通过分类器分类**: 经过前面步骤之后一张图像可以用一个固定维度的向量进行描述,接下来就是经过分类器对图像进行分类。通常使用的分类器包括SVM(Support Vector Machine, 支持向量机)、随机森林等。而使用核方法的SVM是最为广泛的分类器,在传统图像分类任务上性能很好。 - -这种方法在PASCAL VOC竞赛中的图像分类算法中被广泛使用 \[[18](#参考文献)\]。[NEC实验室](http://www.nec-labs.com/)在ILSVRC2010中采用SIFT和LBP特征,两个非线性编码器以及SVM分类器获得图像分类的冠军 \[[8](#参考文献)\]。 - -Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。 - -

-图4. ILSVRC图像分类Top-5错误率
- -### CNN - -传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。 - -

-图5. CNN网络示例[20]
- -- 卷积层(convolution layer): 执行卷积操作提取底层到高层的特征,发掘出图片局部关联性质和空间不变性质。 -- 池化层(pooling layer): 执行降采样操作。通过取卷积输出特征图中局部区块的最大值(max-pooling)或者均值(avg-pooling)。降采样也是图像处理中常见的一种操作,可以过滤掉一些不重要的高频信息。 -- 全连接层(fully-connected layer,或者fc layer): 输入层到隐藏层的神经元是全部连接的。 -- 非线性变化: 卷积层、全连接层后面一般都会接非线性变化层,例如Sigmoid、Tanh、ReLu等来增强网络的表达能力,在CNN里最常使用的为ReLu激活函数。 -- Dropout \[[10](#参考文献)\] : 在模型训练阶段随机让一些隐层节点权重不工作,提高网络的泛化能力,一定程度上防止过拟合。 - -另外,在训练过程中由于每层参数不断更新,会导致下一次输入分布发生变化,这样导致训练过程需要精心设计超参数。如2015年Sergey Ioffe和Christian Szegedy提出了Batch Normalization (BN)算法 \[[14](#参考文献)\] 中,每个batch对网络中的每一层特征都做归一化,使得每层分布相对稳定。BN算法不仅起到一定的正则作用,而且弱化了一些超参数的设计。经过实验证明,BN算法加速了模型收敛过程,在后来较深的模型中被广泛使用。 - -接下来我们主要介绍VGG,GoogleNet和ResNet网络结构。 - -### VGG - -牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。 - -
-图6. 基于ImageNet的VGG16模型
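为了更直观地理解上述五组卷积的组织方式，下面给出一段示意性的Python脚本（并非PaddlePaddle实现；输入尺寸假设为ImageNet的224x224），推算每组卷积之后特征图的通道数与空间尺寸：

```python
# 示意脚本：推算VGG16五组卷积中特征图形状的变化（假设输入为224x224的RGB图）。
# 同一组内的3x3卷积(padding=1)不改变空间尺寸；每组末尾2x2、步长2的
# Max-Pooling将宽高各减半；通道数等于该组的卷积核数目。
groups = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]  # (卷积核数, 连续卷积次数)
h = w = 224
for i, (num_filter, num_conv) in enumerate(groups, 1):
    h, w = h // 2, w // 2
    print("第%d组: %d次3x3卷积 -> 特征图 %d x %d x %d" % (i, num_conv, num_filter, h, w))
```

运行后可以看到空间尺寸从224依次减半到7，而通道数从64增长到512，与上图所示的16层结构一致。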
-
-### GoogleNet
-
-GoogleNet \[[12](#参考文献)\] 在2014年ILSVRC中获得了冠军。在介绍该模型之前，我们先来了解NIN(Network in Network)模型 \[[13](#参考文献)\] 和Inception模块，因为GoogleNet模型由多组Inception模块组成，其模型设计借鉴了NIN的一些思想。
-
-NIN模型主要有两个特点：1) 引入了多层感知卷积网络(Multi-Layer Perceptron Convolution, MLPconv)代替一层线性卷积网络。MLPconv是一个微小的多层卷积网络，即在线性卷积后面增加若干层1x1的卷积，这样可以提取出高度非线性特征。2) 传统的CNN最后几层一般都是全连接层，参数较多。而NIN模型设计最后一层卷积层包含类别维度大小的特征图，然后采用全局均值池化(Avg-Pooling)替代全连接层，得到类别维度大小的向量，再进行分类。这种替代全连接层的方式有利于减少参数。
-
-Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积后，通道数会越来越大，导致参数和计算量也随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的，1x1卷积也可以修正线性特征。
-图7. Inception模块
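图7(b)中1x1卷积降维的效果可以用一段简单的参数量估算来说明。下面代码中的通道数均为假设值，仅用于示意，并非论文中某个具体层的配置：

```python
# 示意：估算Inception模块中3x3支路的卷积参数量（忽略偏置）。
c_in, c_out = 256, 128          # 假设的输入、输出通道数
k = 3
naive = c_in * k * k * c_out    # 图(a)：直接做3x3卷积
c_mid = 64                      # 图(b)：先用1x1卷积降到64个通道
reduced = c_in * 1 * 1 * c_mid + c_mid * k * k * c_out
print("无降维: %d, 先1x1降维: %d, 约缩小%.1f倍" % (naive, reduced, float(naive) / reduced))
```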
-
-GoogleNet由多组Inception模块堆积而成。另外，在网络最后也没有采用传统的多层全连接层，而是像NIN网络一样采用了均值池化层；但与NIN不同的是，池化层后面接了一层到类别数映射的全连接层。除了这两个特点之外，由于网络中间层的特征也很有判别性，GoogleNet在中间层添加了两个辅助分类器，在后向传播中增强梯度并加强正则化，而整个网络的损失函数是这三个分类器的损失的加权求和。
-
-GoogleNet整体网络结构如图8所示，总共22层网络：开始由3层普通的卷积组成；接下来由三组子网络组成，第一组子网络包含2个Inception模块，第二组包含5个Inception模块，第三组包含2个Inception模块；然后接均值池化层、全连接层。
-图8. GoogleNet[12]
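上文提到整个网络的损失函数是三个分类器损失的加权求和，其形式可以用如下几行示意代码表达。其中辅助分类器权重取0.3是常见设置，这里仅作说明：

```python
# 示意：GoogleNet训练时的总损失 = 主分类器损失 + 两个辅助分类器损失的加权和。
def googlenet_loss(main_loss, aux1_loss, aux2_loss, aux_weight=0.3):
    # 两个辅助分类器只在训练阶段参与反向传播，用于增强梯度和正则化；预测时被丢弃。
    return main_loss + aux_weight * (aux1_loss + aux2_loss)

print(googlenet_loss(1.20, 1.85, 1.60))  # 三个损失值为假设数字
```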
-
-上面介绍的是GoogleNet第一版模型(称作GoogleNet-v1)。GoogleNet-v2 \[[14](#参考文献)\] 引入了BN层；GoogleNet-v3 \[[16](#参考文献)\] 对一些卷积层做了分解，进一步提高了网络的非线性能力并加深了网络；GoogleNet-v4 \[[17](#参考文献)\] 引入了下面要讲的ResNet设计思路。从v1到v4，每一版的改进都会带来准确度的提升，限于篇幅，这里不再详细介绍v2到v4的结构。
-
-### ResNet
-
-ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类、图像物体定位和图像物体检测比赛的冠军。针对训练卷积神经网络时加深网络导致准确度下降的问题，ResNet提出了残差学习的思路：在已有设计思路(BN、小卷积核、全卷积网络)的基础上，引入残差模块。每个残差模块包含两条路径，其中一条路径是输入特征的直连通路，另一条路径对该特征做两到三次卷积操作得到该特征的残差，最后再将两条路径上的特征相加。
-
-残差模块如图9所示，左边是基本模块连接方式，由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式，之所以称为瓶颈，是因为上面的1x1卷积用来降维(图示例即256->64)，下面的1x1卷积用来升维(图示例即64->256)，这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
-图9. 残差模块
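残差模块中"两条路径相加"的计算方式可以用numpy写一个极简的示意。这里用一次线性变换加ReLU代替真实的卷积路径，仅说明结构，并非实际模型代码：

```python
import numpy as np

# 示意：残差模块 output = ReLU(F(x) + x)，其中F(x)代表卷积路径、x为直连路径。
def residual_block(x, weight):
    residual = np.maximum(0, x.dot(weight))  # 用 线性变换+ReLU 代替两层3x3卷积
    return np.maximum(0, residual + x)       # 两条路径逐元素相加后再做ReLU

x = np.random.randn(4, 16).astype('float32')         # 4个样本、16维特征
w = (0.1 * np.random.randn(16, 16)).astype('float32')
print(residual_block(x, w).shape)                    # (4, 16)：形状不变，才能直连相加
```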
-
-图10展示了50、101、152层网络连接示意图，使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快，成功地训练了上百乃至近千层的卷积神经网络。
-图10. 基于ImageNet的ResNet模型
-
-## 数据准备
-
-### 数据介绍与下载
-
-通用图像分类常用的公开标准数据集有[CIFAR](https://www.cs.toronto.edu/~kriz/cifar.html)数据集。CIFAR10数据集包含60,000张32x32的彩色图片，共10个类别，每个类别包含6,000张。其中50,000张图片作为训练集，10,000张作为测试集。图11从每个类别中随机抽取了10张图片，展示了所有的类别。
-图11. CIFAR10数据集[21]
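CIFAR10的数据文件中，每张图片存储为一行3072维向量，前1024维是R通道，随后依次是G、B通道。下面的示意代码展示了这一行向量与3x32x32张量的对应关系（像素值用序号代替，仅作说明）：

```python
import numpy as np

# 示意：CIFAR10的一行3072 = 3 * 32 * 32 维向量与 通道 x 高 x 宽 张量的对应关系。
row = np.arange(3 * 32 * 32, dtype='float32')   # 用序号代替真实像素值
im = row.reshape(3, 32, 32)                     # 恢复为 3 x 32 x 32
assert im[0, 0, 0] == 0 and im[1, 0, 0] == 1024 and im[2, 0, 0] == 2048
print(im.shape)
```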
- -下面命令用于下载数据和基于训练集计算图像均值,在网络输入前,基于该均值对输入数据做预处理。 - -```bash -./data/get_data.sh -``` - -### 数据提供给PaddlePaddle - -我们使用Python接口传递数据给系统,下面 `dataprovider.py` 针对CIFAR10数据给出了完整示例。 - -- `initializer` 函数进行dataprovider的初始化,这里加载图像的均值,定义了输入image和label两个字段的类型。 - -- `process` 函数将数据逐条传输给系统,在图像分类任务里,可以在该函数中完成数据扰动操作,再传输给PaddlePaddle。这里对训练集做随机左右翻转,并将原始图片减去均值后传输给系统。 - - -```python -import numpy as np -import cPickle -from paddle.trainer.PyDataProvider2 import * - -def initializer(settings, mean_path, is_train, **kwargs): - settings.is_train = is_train - settings.input_size = 3 * 32 * 32 - settings.mean = np.load(mean_path)['mean'] - settings.input_types = { - 'image': dense_vector(settings.input_size), - 'label': integer_value(10) - } - - -@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_list): - with open(file_list, 'r') as fdata: - for fname in fdata: - fo = open(fname.strip(), 'rb') - batch = cPickle.load(fo) - fo.close() - images = batch['data'] - labels = batch['labels'] - for im, lab in zip(images, labels): - if settings.is_train and np.random.randint(2): - im = im[:,:,::-1] - im = im - settings.mean - yield { - 'image': im.astype('float32'), - 'label': int(lab) - } -``` - -## 模型配置说明 - -### 数据定义 - -在模型配置中,定义通过 `define_py_data_sources2` 函数从 dataprovider 中读入数据, 其中 args 指定均值文件的路径。如果该配置文件用于预测,则不需要数据定义部分。 - -```python -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) -if not is_predict: - define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process', - args={'mean_path': 'data/mean.meta'}) -``` - -### 算法配置 - -在模型配置中,通过 `settings` 设置训练使用的优化算法,并指定batch size 、初始学习率、momentum以及L2正则。 - -```python -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128),) -``` - -通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为 `settings` 里设置的 `learning_rate`。 - -$$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$ - -### 模型结构 - -本教程中我们提供了VGG和ResNet两个模型的配置。 - -#### VGG - -首先介绍VGG模型结构,由于CIFAR10图片大小和数量相比ImageNet数据小很多,因此这里的模型针对CIFAR10数据做了一定的适配。卷积部分引入了BN和Dropout操作。 - -1. 定义数据输入及其维度 - - 网络输入定义为 `data_layer` (数据层),在图像分类中即为图像像素信息。CIFRAR10是RGB 3通道32x32大小的彩色图,因此输入数据大小为3072(3x32x32),类别大小为10,即10分类。 - - ```python - datadim = 3 * 32 * 32 - classdim = 10 - data = data_layer(name='image', size=datadim) - ``` - -2. 
定义VGG网络核心模块 - - ```python - net = vgg_bn_drop(data) - ``` - VGG核心模块的输入是数据层,`vgg_bn_drop` 定义了16层VGG结构,每层卷积后面引入BN层和Dropout层,详细的定义如下: - - ```python - def vgg_bn_drop(input, num_channels): - def conv_block(ipt, num_filter, groups, dropouts, num_channels_=None): - return img_conv_group( - input=ipt, - num_channels=num_channels_, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = dropout_layer(input=conv5, dropout_rate=0.5) - fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) - bn = batch_norm_layer( - input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) - fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) - return fc2 - - ``` - - 2.1. 首先定义了一组卷积网络,即conv_block。卷积核大小为3x3,池化窗口大小为2x2,窗口滑动大小为2,groups决定每组VGG模块是几次连续的卷积操作,dropouts指定Dropout操作的概率。所使用的`img_conv_group`是在`paddle.trainer_config_helpers`中预定义的模块,由若干组 `Conv->BN->ReLu->Dropout` 和 一组 `Pooling` 组成, - - 2.2. 五组卷积操作,即 5个conv_block。 第一、二组采用两次连续的卷积操作。第三、四、五组采用三次连续的卷积操作。每组最后一个卷积后面Dropout概率为0,即不使用Dropout操作。 - - 2.3. 最后接两层512维的全连接。 - -3. 定义分类器 - - 通过上面VGG网络提取高层特征,然后经过全连接层映射到类别维度大小的向量,再通过Softmax归一化得到每个类别的概率,也可称作分类器。 - - ```python - out = fc_layer(input=net, size=class_num, act=SoftmaxActivation()) - ``` - -4. 定义损失函数和网络输出 - - 在有监督训练中需要输入图像对应的类别信息,同样通过`data_layer`来定义。训练中采用多类交叉熵作为损失函数,并作为网络的输出,预测阶段定义网络的输出为分类器得到的概率信息。 - - ```python - if not is_predict: - lbl = data_layer(name="label", size=class_num) - cost = classification_cost(input=out, label=lbl) - outputs(cost) - else: - outputs(out) - ``` - -### ResNet - -ResNet模型的第1、3、4步和VGG模型相同,这里不再介绍。主要介绍第2步即CIFAR10数据集上ResNet核心模块。 - -```python -net = resnet_cifar10(data, depth=56) -``` - -先介绍`resnet_cifar10`中的一些基本函数,再介绍网络连接过程。 - - - `conv_bn_layer` : 带BN的卷积层。 - - `shortcut` : 残差模块的"直连"路径,"直连"实际分两种形式:残差模块输入和输出特征通道数不等时,采用1x1卷积的升维操作;残差模块输入和输出通道相等时,采用直连操作。 - - `basicblock` : 一个基础残差模块,即图9左边所示,由两组3x3卷积组成的路径和一条"直连"路径组成。 - - `bottleneck` : 一个瓶颈残差模块,即图9右边所示,由上下1x1卷积和中间3x3卷积组成的路径和一条"直连"路径组成。 - - `layer_warp` : 一组残差模块,由若干个残差模块堆积而成。每组中第一个残差模块滑动窗口大小与其他可以不同,以用来减少特征图在垂直和水平方向的大小。 - -```python -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - active_type=ReluActivation(), - ch_in=None): - tmp = img_conv_layer( - input=input, - filter_size=filter_size, - num_channels=ch_in, - num_filters=ch_out, - stride=stride, - padding=padding, - act=LinearActivation(), - bias_attr=False) - return batch_norm_layer(input=tmp, act=active_type) - - -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation()) - else: - return ipt - -def basicblock(ipt, ch_out, stride): - ch_in = ipt.num_filters - tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[ipt, short], act=ReluActivation()) - -def bottleneck(ipt, ch_out, stride): - ch_in = ipt.num_filter - tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1) - tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return 
addto_layer(input=[tmp, short], act=ReluActivation())
-
-def layer_warp(block_func, ipt, features, count, stride):
-    tmp = block_func(ipt, features, stride)
-    for i in range(1, count):
-        tmp = block_func(tmp, features, 1)
-    return tmp
-```
-
-`resnet_cifar10` 的连接结构主要有以下几个过程。
-
-1. 底层输入连接一层 `conv_bn_layer`，即带BN的卷积层。
-2. 然后连接3组残差模块，即下面配置的3组 `layer_warp`，每组由图9左边所示的基本残差模块组成。
-3. 最后对网络做均值池化并返回该层。
-
-注意：除了第一层卷积层和最后一层全连接层之外，要求三组 `layer_warp` 总的含参层数能够被6整除，即 `resnet_cifar10` 的 depth 要满足 `(depth - 2) % 6 == 0`。
-
-```python
-def resnet_cifar10(ipt, depth=56):
-    # depth should be one of 20, 32, 44, 56, 110, 1202
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) / 6
-    conv1 = conv_bn_layer(ipt,
-                          ch_in=3,
-                          ch_out=16,
-                          filter_size=3,
-                          stride=1,
-                          padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 64, n, 2)
-    pool = img_pool_layer(input=res3,
-                          pool_size=8,
-                          stride=1,
-                          pool_type=AvgPooling())
-    return pool
-```
-
-## 模型训练
-
-执行脚本 train.sh 进行模型训练，其中指定配置文件、设备类型、线程个数、总共训练的轮数、模型存储路径等。
-
-```bash
-sh train.sh
-```
-
-脚本 `train.sh` 如下：
-
-```bash
-#cfg=models/resnet.py
-cfg=models/vgg.py
-output=output
-log=train.log
-
-paddle train \
-    --config=$cfg \
-    --use_gpu=true \
-    --trainer_count=1 \
-    --log_period=100 \
-    --num_passes=300 \
-    --save_dir=$output \
-    2>&1 | tee $log
-```
-
-- `--config=$cfg` : 指定配置文件，默认是 `models/vgg.py`。
-- `--use_gpu=true` : 指定使用GPU训练，若使用CPU，设置为false。
-- `--trainer_count=1` : 指定线程个数或GPU个数。
-- `--log_period=100` : 指定日志打印的batch间隔。
-- `--save_dir=$output` : 指定模型存储路径。
-
-一轮训练log示例如下所示，经过1个pass，训练集上平均error为0.79958，测试集上平均error为0.7858。
-
-```text
-TrainerInternal.cpp:165] Batch=300 samples=38400 AvgCost=2.07708 CurrentCost=1.96158 Eval: classification_error_evaluator=0.81151 CurrentEval: classification_error_evaluator=0.789297
-TrainerInternal.cpp:181] Pass=0 Batch=391 samples=50000 AvgCost=2.03348 Eval: classification_error_evaluator=0.79958
-Tester.cpp:115] Test samples=10000 cost=1.99246 Eval: classification_error_evaluator=0.7858
-```
-
-图12是训练的分类错误率曲线图，运行到第200个pass后基本收敛，最终得到测试集上分类错误率为8.54%。
-图12. CIFAR10数据集上VGG模型的分类错误率
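上文 `settings` 中 `discexp` 调度对应的公式 $lr = lr_{0} * a^{\lfloor \frac{n}{b}\rfloor}$ 可以用几行代码直观验证，其中参数取自上文的VGG配置：

```python
import math

# 示意：离散指数(discexp)学习率调度，lr0、a、b取自上文settings中的配置。
lr0 = 0.1 / 128.0
a, b = 0.1, 50000 * 100
def discexp_lr(n):  # n为已处理的累计样本总数
    return lr0 * a ** int(math.floor(float(n) / b))

for n in [0, b - 1, b, 2 * b]:
    print(n, discexp_lr(n))  # 每累计处理b个样本，学习率缩小为原来的1/10
```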
-
-## 模型应用
-
-在训练完成后，模型会保存在路径 `output/pass-%05d` 下，例如第300个pass的模型会保存在路径 `output/pass-00299`。可以使用脚本 `classify.py` 对图片进行预测或提取特征，注意该脚本默认使用的模型配置为 `models/vgg.py`。
-
-### 预测
-
-可以按照下面方式预测图片的类别，默认使用GPU预测，如果使用CPU预测，在命令后面加参数 `-c` 即可。
-
-```bash
-python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c
-```
-
-预测结果为：
-
-```text
-Label of image/dog.png is: 5
-```
-
-### 特征提取
-
-可以按照下面方式对图片提取特征，和预测不同的是需要指定job类型为extract，并指定提取的层。`classify.py` 默认以第一层卷积特征为例提取特征，并画出类似图13的可视化图。VGG模型的第一层卷积有64个通道，图13展示了每个通道的灰度图。
-
-```bash
-python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c
-```
-图13. 卷积特征可视化图
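把64个通道的特征图拼成一张近似方形的网格图（`classify.py` 中 `vis_square` 的做法与此类似），其核心思路可以用下面的示意代码表达：

```python
import numpy as np

# 示意：将 (n, h, w) 的特征图拼成约 sqrt(n) x sqrt(n) 的网格，便于整体查看。
def make_grid(feats):
    n, h, w = feats.shape
    side = int(np.ceil(np.sqrt(n)))
    grid = np.zeros((side * h, side * w), dtype=feats.dtype)
    for i in range(n):
        r, c = divmod(i, side)
        grid[r * h:(r + 1) * h, c * w:(c + 1) * w] = feats[i]
    return grid

print(make_grid(np.random.rand(64, 32, 32)).shape)  # (256, 256)
```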
- -## 总结 - -传统图像分类方法由多个阶段构成,框架较为复杂,而端到端的CNN模型结构可一步到位,而且大幅度提升了分类准确率。本文我们首先介绍VGG、GoogleNet、ResNet三个经典的模型;然后基于CIFAR10数据集,介绍如何使用PaddlePaddle配置和训练CNN模型,尤其是VGG和ResNet模型;最后介绍如何使用PaddlePaddle的API接口对图片进行预测和特征提取。对于其他数据集比如ImageNet,配置和训练流程是同样的,大家可以自行进行实验。 - - -## 参考文献 - -[1] D. G. Lowe, [Distinctive image features from scale-invariant keypoints](http://www.cs.ubc.ca/~lowe/papers/ijcv04.pdf). IJCV, 60(2):91-110, 2004. - -[2] N. Dalal, B. Triggs, [Histograms of Oriented Gradients for Human Detection](http://vision.stanford.edu/teaching/cs231b_spring1213/papers/CVPR05_DalalTriggs.pdf), Proc. IEEE Conf. Computer Vision and Pattern Recognition, 2005. - -[3] Ahonen, T., Hadid, A., and Pietikinen, M. (2006). [Face description with local binary patterns: Application to face recognition](http://ieeexplore.ieee.org/document/1717463/). PAMI, 28. - -[4] J. Sivic, A. Zisserman, [Video Google: A Text Retrieval Approach to Object Matching in Videos](http://www.robots.ox.ac.uk/~vgg/publications/papers/sivic03.pdf), Proc. Ninth Int'l Conf. Computer Vision, pp. 1470-1478, 2003. - -[5] B. Olshausen, D. Field, [Sparse Coding with an Overcomplete Basis Set: A Strategy Employed by V1?](http://redwood.psych.cornell.edu/papers/olshausen_field_1997.pdf), Vision Research, vol. 37, pp. 3311-3325, 1997. - -[6] Wang, J., Yang, J., Yu, K., Lv, F., Huang, T., and Gong, Y. (2010). [Locality-constrained Linear Coding for image classification](http://ieeexplore.ieee.org/abstract/document/5540018/). In CVPR. - -[7] Perronnin, F., Sánchez, J., & Mensink, T. (2010). [Improving the fisher kernel for large-scale image classification](http://dl.acm.org/citation.cfm?id=1888101). In ECCV (4). - -[8] Lin, Y., Lv, F., Cao, L., Zhu, S., Yang, M., Cour, T., Yu, K., and Huang, T. (2011). [Large-scale image clas- sification: Fast feature extraction and SVM training](http://ieeexplore.ieee.org/document/5995477/). In CVPR. - -[9] Krizhevsky, A., Sutskever, I., and Hinton, G. (2012). [ImageNet classification with deep convolutional neu- ral networks](http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf). In NIPS. - -[10] G.E. Hinton, N. Srivastava, A. Krizhevsky, I. Sutskever, and R.R. Salakhutdinov. [Improving neural networks by preventing co-adaptation of feature detectors](https://arxiv.org/abs/1207.0580). arXiv preprint arXiv:1207.0580, 2012. - -[11] K. Chatfield, K. Simonyan, A. Vedaldi, A. Zisserman. [Return of the Devil in the Details: Delving Deep into Convolutional Nets](https://arxiv.org/abs/1405.3531). BMVC, 2014。 - -[12] Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A., [Going deeper with convolutions](https://arxiv.org/abs/1409.4842). In: CVPR. (2015) - -[13] Lin, M., Chen, Q., and Yan, S. [Network in network](https://arxiv.org/abs/1312.4400). In Proc. ICLR, 2014. - -[14] S. Ioffe and C. Szegedy. [Batch normalization: Accelerating deep network training by reducing internal covariate shift](https://arxiv.org/abs/1502.03167). In ICML, 2015. - -[15] K. He, X. Zhang, S. Ren, J. Sun. [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385). CVPR 2016. - -[16] Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z. [Rethinking the incep-tion architecture for computer vision](https://arxiv.org/abs/1512.00567). In: CVPR. (2016). - -[17] Szegedy, C., Ioffe, S., Vanhoucke, V. [Inception-v4, inception-resnet and the impact of residual connections on learning](https://arxiv.org/abs/1602.07261). arXiv:1602.07261 (2016). 
- -[18] Everingham, M., Eslami, S. M. A., Van Gool, L., Williams, C. K. I., Winn, J. and Zisserman, A. [The Pascal Visual Object Classes Challenge: A Retrospective]((http://link.springer.com/article/10.1007/s11263-014-0733-5)). International Journal of Computer Vision, 111(1), 98-136, 2015. - -[19] He, K., Zhang, X., Ren, S., and Sun, J. [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852). ArXiv e-prints, February 2015. - -[20] http://deeplearning.net/tutorial/lenet.html - -[21] https://www.cs.toronto.edu/~kriz/cifar.html - -[22] http://cs231n.github.io/classification/ - -
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/image_classification/deprecated/classify.py b/image_classification/deprecated/classify.py deleted file mode 100644 index 5a49bc22b0b205f7212c52c482f26720fea4e684..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/classify.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import cPickle -import numpy as np -from PIL import Image -from optparse import OptionParser - -import paddle.utils.image_util as image_util -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -import logging -logging.basicConfig( - format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') -logging.getLogger().setLevel(logging.INFO) - - -def vis_square(data, fname): - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - """Take an array of shape (n, height, width) or (n, height, width, 3) - and visualize each (height, width) thing in a grid of size approx. sqrt(n) by sqrt(n)""" - # normalize data for display - data = (data - data.min()) / (data.max() - data.min()) - # force the number of filters to be square - n = int(np.ceil(np.sqrt(data.shape[0]))) - padding = ( - ((0, n**2 - data.shape[0]), (0, 1), - (0, 1)) # add some space between filters - + ((0, 0), ) * - (data.ndim - 3)) # don't pad the last dimension (if there is one) - data = np.pad(data, padding, mode='constant', - constant_values=1) # pad with ones (white) - # tile the filters into an image - data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3) + tuple( - range(4, data.ndim + 1))) - data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:]) - plt.imshow(data, cmap='gray') - plt.savefig(fname) - plt.axis('off') - - -class ImageClassifier(): - def __init__(self, - train_conf, - resize_dim, - crop_dim, - model_dir=None, - use_gpu=True, - mean_file=None, - oversample=False, - is_color=True): - self.train_conf = train_conf - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.resize_dim = resize_dim - self.crop_dims = [crop_dim, crop_dim] - self.oversample = oversample - self.is_color = is_color - - self.transformer = image_util.ImageTransformer(is_color=is_color) - self.transformer.set_transpose((2, 0, 1)) - self.transformer.set_channel_swap((2, 1, 0)) - - self.mean_file = mean_file - if self.mean_file is not None: - mean = np.load(self.mean_file)['mean'] - mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1]) - self.transformer.set_mean(mean) # mean pixel - else: - # if you use three mean value, set like: - # this three mean value is calculated from ImageNet. 
- self.transformer.set_mean(np.array([103.939, 116.779, 123.68])) - - conf_args = "use_gpu=%d,is_predict=1" % (int(use_gpu)) - conf = parse_config(train_conf, conf_args) - swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu))) - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(self.network, swig_paddle.GradientMachine) - self.network.loadParameters(self.model_dir) - - dim = 3 * self.crop_dims[0] * self.crop_dims[1] - slots = [dense_vector(dim)] - self.converter = DataProviderConverter(slots) - - def get_data(self, img_path): - """ - 1. load image from img_path. - 2. resize or oversampling. - 3. transformer data: transpose, channel swap, sub mean. - return K x H x W ndarray. - - img_path: image path. - """ - image = image_util.load_image(img_path, self.is_color) - # Another way to extract oversampled features is that - # cropping and averaging from large feature map which is - # calculated by large size of image. - # This way reduces the computation. - if self.oversample: - image = image_util.resize_image(image, self.resize_dim) - image = np.array(image) - input = np.zeros( - (1, image.shape[0], image.shape[1], 3), dtype=np.float32) - input[0] = image.astype(np.float32) - input = image_util.oversample(input, self.crop_dims) - else: - image = image.resize(self.crop_dims, Image.ANTIALIAS) - input = np.zeros( - (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32) - input[0] = np.array(image).astype(np.float32) - - data_in = [] - for img in input: - img = self.transformer.transformer(img).flatten() - data_in.append([img.tolist()]) - return data_in - - def forward(self, input_data): - in_arg = self.converter(input_data) - return self.network.forwardTest(in_arg) - - def forward(self, data, output_layer): - input = self.converter(data) - self.network.forwardTest(input) - output = self.network.getLayerOutputs(output_layer) - res = {} - if isinstance(output_layer, basestring): - output_layer = [output_layer] - for name in output_layer: - # For oversampling, average predictions across crops. - # If not, the shape of output[name]: (1, class_number), - # the mean is also applicable. - res[name] = output[name].mean(0) - return res - - -def option_parser(): - usage = "%prog -c config -i data_list -w model_dir [options]" - parser = OptionParser(usage="usage: %s" % usage) - parser.add_option( - "--job", - action="store", - dest="job_type", - choices=[ - 'predict', - 'extract', - ], - default='predict', - help="The job type. 
\ - predict: predicting,\ - extract: extract features") - parser.add_option( - "--conf", - action="store", - dest="train_conf", - default='models/vgg.py', - help="network config") - parser.add_option( - "--data", - action="store", - dest="data_file", - default='image/dog.png', - help="image list") - parser.add_option( - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - parser.add_option( - "-c", dest="cpu_gpu", action="store_false", help="Use cpu mode.") - parser.add_option( - "-g", - dest="cpu_gpu", - default=True, - action="store_true", - help="Use gpu mode.") - parser.add_option( - "--mean", - action="store", - dest="mean", - default='data/mean.meta', - help="The mean file.") - parser.add_option( - "--multi_crop", - action="store_true", - dest="multi_crop", - default=False, - help="Wether to use multiple crops on image.") - return parser.parse_args() - - -def main(): - options, args = option_parser() - mean = 'data/mean.meta' if not options.mean else options.mean - conf = 'models/vgg.py' if not options.train_conf else options.train_conf - obj = ImageClassifier( - conf, - 32, - 32, - options.model_path, - use_gpu=options.cpu_gpu, - mean_file=mean, - oversample=options.multi_crop) - image_path = options.data_file - if options.job_type == 'predict': - output_layer = '__fc_layer_2__' - data = obj.get_data(image_path) - prob = obj.forward(data, output_layer) - lab = np.argsort(-prob[output_layer]) - logging.info("Label of %s is: %d", image_path, lab[0]) - - elif options.job_type == "extract": - output_layer = '__conv_0__' - data = obj.get_data(options.data_file) - features = obj.forward(data, output_layer) - dshape = (64, 32, 32) - fea = features[output_layer].reshape(dshape) - vis_square(fea, 'fea_conv0.png') - - -if __name__ == '__main__': - main() diff --git a/image_classification/deprecated/data/cifar10.py b/image_classification/deprecated/data/cifar10.py deleted file mode 100755 index 0f51fd954a55da9fc9c0387b68b1fc7aa84f4401..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/data/cifar10.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import numpy as np -import cPickle - -DATA = "cifar-10-batches-py" -CHANNEL = 3 -HEIGHT = 32 -WIDTH = 32 - - -def create_mean(dataset): - if not os.path.isfile("mean.meta"): - mean = np.zeros(CHANNEL * HEIGHT * WIDTH) - num = 0 - for f in dataset: - batch = np.load(f) - mean += batch['data'].sum(0) - num += len(batch['data']) - mean /= num - print mean.size - data = {"mean": mean, "size": mean.size} - cPickle.dump( - data, open("mean.meta", 'w'), protocol=cPickle.HIGHEST_PROTOCOL) - - -def create_data(): - train_set = [DATA + "/data_batch_%d" % (i + 1) for i in xrange(0, 5)] - test_set = [DATA + "/test_batch"] - - # create mean values - create_mean(train_set) - - # create dataset lists - if not os.path.isfile("train.txt"): - train = ["data/" + i for i in train_set] - open("train.txt", "w").write("\n".join(train)) - open("train.list", "w").write("\n".join(["data/train.txt"])) - - if not os.path.isfile("text.txt"): - test = ["data/" + i for i in test_set] - open("test.txt", "w").write("\n".join(test)) - open("test.list", "w").write("\n".join(["data/test.txt"])) - - -if __name__ == '__main__': - create_data() diff --git a/image_classification/deprecated/data/get_data.sh b/image_classification/deprecated/data/get_data.sh deleted file mode 100755 index 82b4888c560238a7837c7dfc18c24570900041c8..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/data/get_data.sh +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz -tar zxf cifar-10-python.tar.gz -rm cifar-10-python.tar.gz - -python cifar10.py diff --git a/image_classification/deprecated/dataprovider.py b/image_classification/deprecated/dataprovider.py deleted file mode 100644 index 2c4e3fe2f2341c49e51aaf70213dfa9dd52b09a2..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/dataprovider.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import cPickle -from paddle.trainer.PyDataProvider2 import * - - -def initializer(settings, mean_path, is_train, **kwargs): - settings.is_train = is_train - settings.input_size = 3 * 32 * 32 - settings.mean = np.load(mean_path)['mean'] - settings.input_types = { - 'image': dense_vector(settings.input_size), - 'label': integer_value(10) - } - - -@provider(init_hook=initializer, pool_size=50000) -def process(settings, file_list): - with open(file_list, 'r') as fdata: - for fname in fdata: - fo = open(fname.strip(), 'rb') - batch = cPickle.load(fo) - fo.close() - images = batch['data'] - labels = batch['labels'] - for im, lab in zip(images, labels): - if settings.is_train and np.random.randint(2): - im = im.reshape(3, 32, 32) - im = im[:, :, ::-1] - im = im.flatten() - im = im - settings.mean - yield {'image': im.astype('float32'), 'label': int(lab)} diff --git a/image_classification/deprecated/extract.sh b/image_classification/deprecated/extract.sh deleted file mode 100755 index 93004788c57922033f58818031b63caf42e32765..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/extract.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -python classify.py --job=extract --model=output/pass-00299 --data=image/dog.png # -c diff --git a/image_classification/deprecated/models/resnet.py b/image_classification/deprecated/models/resnet.py deleted file mode 100644 index fcaa6feb28d51d5175e2e1eda77a066a7a01a748..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/models/resnet.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) -if not is_predict: - args = {'meta': 'data/mean.meta'} - define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process', - args={'mean_path': 'data/mean.meta'}) - -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 140, - learning_rate_schedule='discexp', - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0002 * 128)) - - -def conv_bn_layer(input, - ch_out, - filter_size, - stride, - padding, - active_type=ReluActivation(), - ch_in=None): - tmp = img_conv_layer( - input=input, - filter_size=filter_size, - num_channels=ch_in, - num_filters=ch_out, - stride=stride, - padding=padding, - act=LinearActivation(), - bias_attr=False) - return batch_norm_layer(input=tmp, act=active_type) - - -def shortcut(ipt, n_in, n_out, stride): - if n_in != n_out: - print("n_in != n_out") - return conv_bn_layer(ipt, n_out, 1, stride, 0, LinearActivation()) - else: - return ipt - - -def basicblock(ipt, ch_out, stride): - ch_in = ipt.num_filters - tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out, stride) - return addto_layer(input=[tmp, short], act=ReluActivation()) - - -def bottleneck(ipt, ch_out, stride): - ch_in = ipt.num_filter - tmp = conv_bn_layer(ipt, ch_out, 1, stride, 0) - tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1) - tmp = conv_bn_layer(tmp, ch_out * 4, 1, 1, 0, LinearActivation()) - short = shortcut(ipt, ch_in, ch_out * 4, stride) - return addto_layer(input=[tmp, short], act=ReluActivation()) - - -def layer_warp(block_func, ipt, features, count, stride): - tmp = block_func(ipt, features, stride) - for i in range(1, count): - tmp = block_func(tmp, features, 1) - return tmp - - -def resnet_imagenet(ipt, depth=50): - cfg = { - 18: ([2, 2, 2, 1], basicblock), - 34: ([3, 4, 6, 3], basicblock), - 50: ([3, 4, 6, 3], bottleneck), - 101: ([3, 4, 23, 3], bottleneck), - 152: ([3, 8, 36, 3], bottleneck) - } - stages, block_func = cfg[depth] - tmp = conv_bn_layer( - ipt, ch_in=3, ch_out=64, filter_size=7, stride=2, padding=3) - tmp = img_pool_layer(input=tmp, pool_size=3, stride=2) - tmp = layer_warp(block_func, tmp, 64, stages[0], 1) - tmp = layer_warp(block_func, tmp, 128, stages[1], 2) - tmp = layer_warp(block_func, tmp, 256, stages[2], 2) - tmp = layer_warp(block_func, tmp, 512, stages[3], 2) - tmp = img_pool_layer( - input=tmp, pool_size=7, stride=1, pool_type=AvgPooling()) - - tmp = fc_layer(input=tmp, size=1000, act=SoftmaxActivation()) - return tmp - - -def resnet_cifar10(ipt, depth=32): - #depth should be one of 20, 32, 44, 56, 110, 1202 - assert (depth - 2) % 6 == 0 - n = (depth - 2) / 6 - nStages = {16, 64, 128} - conv1 = conv_bn_layer( - ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) - res1 = layer_warp(basicblock, conv1, 16, n, 1) - res2 = layer_warp(basicblock, res1, 32, n, 2) - res3 = layer_warp(basicblock, res2, 64, n, 2) - pool = img_pool_layer( - input=res3, pool_size=8, stride=1, pool_type=AvgPooling()) - return pool - - -datadim = 3 * 32 * 32 -classdim = 10 -data = data_layer(name='image', size=datadim) -net = resnet_cifar10(data, depth=32) -out = fc_layer(input=net, size=10, act=SoftmaxActivation()) -if not is_predict: - lbl = data_layer(name="label", size=classdim) - outputs(classification_cost(input=out, label=lbl)) -else: 
- outputs(out) diff --git a/image_classification/deprecated/models/vgg.py b/image_classification/deprecated/models/vgg.py deleted file mode 100644 index 64d0fd3016f19d7c2a6be5ab4171adcf5db74fce..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/models/vgg.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) -if not is_predict: - define_py_data_sources2( - train_list='data/train.list', - test_list='data/test.list', - module='dataprovider', - obj='process', - args={'mean_path': 'data/mean.meta'}) - -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_rate_decay_a=0.1, - learning_rate_decay_b=50000 * 100, - learning_rate_schedule='discexp', - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128), ) - - -def vgg_bn_drop(input): - def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): - return img_conv_group( - input=ipt, - num_channels=num_channels, - pool_size=2, - pool_stride=2, - conv_num_filter=[num_filter] * groups, - conv_filter_size=3, - conv_act=ReluActivation(), - conv_with_batchnorm=True, - conv_batchnorm_drop_rate=dropouts, - pool_type=MaxPooling()) - - conv1 = conv_block(input, 64, 2, [0.3, 0], 3) - conv2 = conv_block(conv1, 128, 2, [0.4, 0]) - conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) - conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) - conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - - drop = dropout_layer(input=conv5, dropout_rate=0.5) - fc1 = fc_layer(input=drop, size=512, act=LinearActivation()) - bn = batch_norm_layer( - input=fc1, act=ReluActivation(), layer_attr=ExtraAttr(drop_rate=0.5)) - fc2 = fc_layer(input=bn, size=512, act=LinearActivation()) - return fc2 - - -datadim = 3 * 32 * 32 -classdim = 10 -data = data_layer(name='image', size=datadim) -net = vgg_bn_drop(data) -out = fc_layer(input=net, size=classdim, act=SoftmaxActivation()) -if not is_predict: - lbl = data_layer(name="label", size=classdim) - cost = classification_cost(input=out, label=lbl) - outputs(cost) -else: - outputs(out) diff --git a/image_classification/deprecated/predict.sh b/image_classification/deprecated/predict.sh deleted file mode 100755 index 2e76191bea57c373f6e8f98c5e66b02e6eccf6b1..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/predict.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -python classify.py --job=predict --model=output/pass-00299 --data=image/dog.png # -c diff --git a/image_classification/deprecated/train.sh b/image_classification/deprecated/train.sh deleted file mode 100755 index 5019e6cd01f9bd1bef0e23efd7568b4c28a57cf1..0000000000000000000000000000000000000000 --- a/image_classification/deprecated/train.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#cfg=models/resnet.py -cfg=models/vgg.py -output=output -log=train.log - -paddle train \ - --config=$cfg \ - --use_gpu=true \ - --trainer_count=1 \ - --log_period=100 \ - --num_passes=300 \ - --save_dir=$output \ - 2>&1 | tee $log diff --git a/image_classification/index.en.html b/image_classification/index.en.html deleted file mode 100644 index b6a80785027068f062e5e3ae21eec6ef7c4f143b..0000000000000000000000000000000000000000 --- a/image_classification/index.en.html +++ /dev/null @@ -1,612 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/index.cn.html b/index.cn.html new file mode 100644 index 0000000000000000000000000000000000000000..f60aef0bb7a0a93754c76f9a1a8477b6a571a1a5 --- /dev/null +++ b/index.cn.html @@ -0,0 +1,173 @@ + + + + + 深度学习入门 + + + + + + + + + + + + \ No newline at end of file diff --git a/index.html b/index.html index 8e83e91e087fdf59bebf41d11988c49e2b813e01..d23324634e13dcf56a87ab5b45c2404e141bec71 100644 --- a/index.html +++ b/index.html @@ -1,7 +1,169 @@ - + + - + + Deep Learning 101 + + + + + - Please access github home page + + + + \ No newline at end of file diff --git a/label_semantic_roles/README.en.md b/label_semantic_roles/README.en.md deleted file mode 100644 index b2a75cb03db3a2ef90a5b2a492fdac020507d396..0000000000000000000000000000000000000000 --- a/label_semantic_roles/README.en.md +++ /dev/null @@ -1,501 +0,0 @@ -# Semantic Role Labeling - -Source code of this chapter is in [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/label_semantic_roles). - -## Background - -Natural Language Analysis contains three components: Lexical Analysis, Syntactic Analysis, and Semantic Analysis. Semantic Role Labelling (SRL) is one way for Shallow Semantic Analysis. A predicate of a sentence is a property that a subject possesses or is characterized, such as what it does, what it is or how it is, which mostly corresponds to the core of an event. The noun associated with a predicate is called Argument. Semantic roles express the abstract roles that arguments of a predicate can take in the event, such as Agent, Patient, Theme, Experiencer, Beneficiary, Instrument, Location, Goal and Source, etc. - -In the following example, “遇到” (encounters) is a Predicate (“Pred”),“小明” (Ming) is an Agent,“小红” (Hong) is a Patient,“昨天” (yesterday) indicates the Time, and “公园” (park) is the Location. - -$$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mbox{Time}\mbox{在[公园]}_{\mbox{Location}}\mbox{[遇到]}_{\mbox{Predicate}}\mbox{了[小红]}_{\mbox{Patient}}\mbox{。}$$ - -Instead of in-depth analysis on semantic information, the goal of Semantic Role Labeling is to identify the relation of predicate and other constituents, e.g., predicate-argument structure, as specific semantic roles, which is an important intermediate step in a wide range of natural language understanding tasks (Information Extraction, Discourse Analysis, DeepQA etc). Predicates are always assumed to be given; the only thing is to identify arguments and their semantic roles. - -Standard SRL system mostly builds on top of Syntactic Analysis and contains five steps: - -1. Construct a syntactic parse tree, as shown in Fig. 1 -2. Identity candidate arguments of given predicate from constructed syntactic parse tree. -3. Prune most unlikely candidate arguments. -4. Identify arguments, often by a binary classifier. -5. Multi-class semantic role labeling. Steps 2-3 usually introduce hand-designed features based on Syntactic Analysis (step 1). - - -
-Fig 1. Syntactic parse tree
- -核心关系-> HED -定中关系-> ATT -主谓关系-> SBV -状中结构-> ADV -介宾关系-> POB -右附加关系-> RAD -动宾关系-> VOB -标点-> WP - - -However, complete syntactic analysis requires identifying the relation among all constitutes and the performance of SRL is sensitive to the precision of syntactic analysis, which makes SRL a very challenging task. To reduce the complexity and obtain some syntactic structure information, we often use shallow syntactic analysis. Shallow Syntactic Analysis is also called partial parsing or chunking. Unlike complete syntactic analysis which requires the construction of the complete parsing tree, Shallow Syntactic Analysis only need to identify some independent components with relatively simple structure, such as verb phrases (chunk). To avoid difficulties in constructing a syntactic tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking based SRL methods, which convert SRL as a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using BIO representation. For syntactic chunks forming a chunk of type A, the first chunk receives the B-A tag (Begin), the remaining ones receive the tag I-A (Inside), and all chunks outside receive the tag O-A. - -The BIO representation of above example is shown in Fig.1. - -
-Fig 2. BIO representation
- -输入序列-> input sequence -语块-> chunk -标注序列-> label sequence -角色-> role - -This example illustrates the simplicity of sequence tagging because (1) shallow syntactic analysis reduces the precision requirement of syntactic analysis; (2) pruning candidate arguments is removed; 3) argument identification and tagging are finished at the same time. Such unified methods simplify the procedure, reduce the risk of accumulating errors and boost the performance further. - -In this tutorial, our SRL system is built as an end-to-end system via a neural network. We take only text sequences, without using any syntactic parsing results or complex hand-designed features. We give public dataset [CoNLL-2004 and CoNLL-2005 Shared Tasks](http://www.cs.upc.edu/~srlconll/) as an example to illustrate: given a sentence with predicates marked, identify the corresponding arguments and their semantic roles by sequence tagging method. - -## Model - -Recurrent Neural Networks are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike Feed-forward neural networks, RNNs can model the dependency between elements of sequences. LSTMs as variants of RNNs aim to model long-term dependency in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems. - -### Stacked Recurrent Neural Network - -Deep Neural Networks allows extracting hierarchical representations. Higher layers can form more abstract/complex representations on top of lower layers. LSTMs, when unfolded in time, is a deep feed-forward neural network, because a computational path between the input at time $k < t$ to the output at time $t$ crosses several nonlinear layers. However, the computation carried out at each time-step is only linear transformation, which makes LSTMs a shallow model. Deep LSTMs are typically constructed by stacking multiple LSTM layers on top of each other and taking the output from lower LSTM layer at time $t$ as the input of upper LSTM layer at time $t$. Deep, hierarchical neural networks can be much efficient at representing some functions and modeling varying-length dependencies\[[2](#Reference)\]. - - -However, deep LSTMs increases the number of nonlinear steps the gradient has to traverse when propagated back in depth. For example, four layer LSTMs can be trained properly, but the performance becomes worse as the number of layers up to 4-8. Conventional LSTMs prevent backpropagated errors from vanishing and exploding by introducing shortcut connections to skip the intermediate nonlinear layers. Therefore, deep LSTMs can consider shortcut connections in depth as well. - - -The operation of a single LSTM cell contain 3 parts: (1) input-to-hidden: map input $x$ to the input of the forget gates, input gates, memory cells and output gates by linear transformation (i.e., matrix mapping); (2) hidden-to-hidden: calculate forget gates, input gates, output gates and update memory cell, this is the main part of LSTMs; (3)hidden-to-output: this part typically involves an activation operation on hidden states. Based on the stacked LSTMs, we add a shortcut connection: take the input-to-hidden from the previous layer as a new input and learn another linear transformation. - -Fig.3 illustrate the final stacked recurrent neural networks. - -
-Fig 3. Stacked Recurrent Neural Networks
- -线性变换-> linear transformation -输入层到隐层-> input-to-hidden - -### Bidirectional Recurrent Neural Network - -LSTMs can summarize the history of previous inputs seen up to now, but can not see the future. In most of NLP (natural language processing) tasks, the entire sentences are ready to use. Therefore, sequential learning might be much efficient if the future can be encoded as well like histories. - -To address the above drawbacks, we can design bidirectional recurrent neural networks by making a minor modification. Higher LSTM layers process the sequence in reversed direction with previous lower LSTM layers, i.e., Deep LSTMs operate from left-to-right, right-to-left, left-to-right,..., in depth. Therefore, LSTM layers at time-step $t$ can see both histories and the future since the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks. - - -
-Fig 4. Bidirectional LSTMs
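As a toy illustration of this alternating-direction stacking (a sketch only; a simple tanh recurrence with random inputs stands in for real LSTM cells), each layer consumes the previous layer's output sequence in the opposite order:

```python
import numpy as np

# Sketch: stack layers that alternate direction, as in the bidirectional
# architecture of Fig. 4. A toy tanh recurrence stands in for an LSTM cell.
def run_layer(seq, reverse):
    order = reversed(range(len(seq))) if reverse else range(len(seq))
    state = np.zeros_like(seq[0])
    out = [None] * len(seq)
    for t in order:
        state = np.tanh(seq[t] + 0.5 * state)  # toy recurrence, no gates
        out[t] = state
    return out

seq = [np.random.randn(8) for _ in range(5)]   # a length-5 input sequence
for depth in range(4):                         # even layers left-to-right, odd reversed
    seq = run_layer(seq, reverse=(depth % 2 == 1))
print(len(seq), seq[0].shape)                  # from layer 2 on, every position sees both directions
```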
- -线性变换-> linear transformation -输入层到隐层-> input-to-hidden -正向处理输出序列->process sequence in the forward direction -反向处理上一层序列-> process sequence from the previous layer in backward direction - -Note that, this bidirectional RNNs is different with the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNNs in the following tasks[machine translation](https://github.com/PaddlePaddle/book/blob/develop/machine_translation/README.md) - -### Conditional Random Field - -The basic pipeline of Neural Networks solving problems is 1) all lower layers aim to learn representations; 2) the top layer is designed for learning the final task. In SRL tasks, CRF is built on top of the network for the final tag sequence prediction. It takes the representations provided by the last LSTM layer as input. - - -CRF is a probabilistic graph model (undirected) with nodes denoting random variables and edges denoting dependencies between nodes. To be simplicity, CRFs learn conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ are sequences of input, $Y = (y_1, y_2, ... , y_n)$ are label sequences; Decoding is to search sequence $Y$ to maximize conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$。 - -Sequence tagging tasks only consider input and output as linear sequences without extra dependent assumptions on graph model. Thus, the graph model of sequence tagging tasks is simple chain or line, which results in a Linear-Chain Conditional Random Field, shown in Fig.5. - -
-Fig 5. Linear Chain Conditional Random Field used in SRL tasks
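Before the formal definition below, the linear-chain factorization can be made concrete with a brute-force sketch: the score of a tag sequence is the sum of per-position state scores and adjacent-pair transition scores, normalized over all possible sequences. The scores here are random toy numbers, not trained parameters:

```python
import itertools
import numpy as np

# Sketch: brute-force P(Y|X) for a linear-chain CRF with toy scores.
num_tags, seq_len = 3, 4
emit = np.random.randn(seq_len, num_tags)     # node (state) scores per position
trans = np.random.randn(num_tags, num_tags)   # edge (transition) scores

def score(tags):
    s = emit[0, tags[0]]
    for i in range(1, seq_len):
        s += trans[tags[i - 1], tags[i]] + emit[i, tags[i]]
    return s

all_seqs = list(itertools.product(range(num_tags), repeat=seq_len))
z = sum(np.exp(score(y)) for y in all_seqs)   # partition function Z(X)
best = max(all_seqs, key=score)               # toy decoding by enumeration
print(best, np.exp(score(best)) / z)          # most likely sequence and P(Y*|X)
```

A real decoder replaces the exhaustive enumeration with Viterbi, which is what the `crf_decoding` layer used later in this chapter computes.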
- -By the fundamental theorem of random fields \[[5](#Reference)\], the joint distribution over the label sequence $Y$ given $X$ has the form: - -$$p(Y | X) = \frac{1}{Z(X)} \text{exp}\left(\sum_{i=1}^{n}\left(\sum_{j}\lambda_{j}t_{j} (y_{i - 1}, y_{i}, X, i) + \sum_{k} \mu_k s_k (y_i, X, i)\right)\right)$$ - - -where, $Z(X)$ is normalization constant, $t_j$ is feature function defined on edges, called transition feature, depending on $y_i$ and $y_{i-1}$ which represents transition probabilities from $y_{i-1}$ to $y_i$ given input sequence $X$. $s_k$ is feature function defined on nodes, called state feature, depending on $y_i$ and represents the probality of $y_i$ given input sequence $X$. $\lambda_j$ 和 $\mu_k$ are weights corresponding to $t_j$ and $s_k$. Actually, $t$ and $s$ can be wrtten in the same form, then take summation over all nodes $i$: $f_{k}(Y, X) = \sum_{i=1}^{n}f_k({y_{i - 1}, y_i, X, i})$, $f$ is defined as feature function. Thus, $P(Y|X)$ can be wrtten as: - -$$p(Y|X, W) = \frac{1}{Z(X)}\text{exp}\sum_{k}\omega_{k}f_{k}(Y, X)$$ - -$\omega$ are weights of feature function which should be learned in CRF models. At training stage, given input sequences and label sequences $D = \left[(X_1, Y_1), (X_2 , Y_2) , ... , (X_N, Y_N)\right]$, solve following objective function using MLE: - - -$$L(\lambda, D) = - \text{log}\left(\prod_{m=1}^{N}p(Y_m|X_m, W)\right) + C \frac{1}{2}\lVert W\rVert^{2}$$ - - -This objective function can be solved via back-propagation in an end-to-end manner. At decoding stage, given input sequences $X$, search sequence $\bar{Y}$ to maximize conditional probability $\bar{P}(Y|X)$ via decoding methods (such as Viterbi, Beam Search). - -### DB-LSTM SRL model - -Given predicates and a sentence, SRL tasks aim to identify arguments of the given predicate and their semantic roles. If a sequence has n predicates, we will process this sequence n times. One model is as follows: - -1. Construct inputs; - - input 1: predicate, input 2: sentence - - expand input 1 as a sequence with the same length with input 2 using one-hot representation; -2. Convert one-hot sequences from step 1 to vector sequences via lookup table; -3. Learn the representation of input sequences by taking vector sequences from step 2 as inputs; -4. Take representations from step 3 as inputs, label sequence as supervision signal, do sequence tagging tasks - -We can try above method. Here, we propose some modifications by introducing two simple but effective features: - -- predicate context (ctx-p): A single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the expanded context, the ambiguity can be largely eliminated. Thus, we extract $n$ words before and after predicate to construct a window chunk. - -- region mark ($m_r$): $m_r = 1$ to denote word in that position locates in the predicate context region, or $m_r = 0$ if not. - -After modification, the model is as follows: - -1. Construct inputs - - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after predicate. Input 4: region mark sequence, element value will be 1 if word locates in the predicate context region, 0 otherwise. - - expand input 2~3 as sequences with the same length with input 1 -2. Convert input 1~4 to vector sequences via lookup table; input 1 and 3 shares the same lookup table, input 2 and 4 have separate lookup tables -3. 
Take four vector sequences from step 2 as inputs of bidirectional LSTMs; Train LSTMs to update representations -4. Take representation from step 3 as input of CRF, label sequence as supervision signal, do sequence tagging tasks - - -
-Fig 6. DB-LSTM for SRL tasks
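The predicate-context window and region-mark features described above take only a few lines to construct. The sketch below uses the sample sentence from the data table later in this chapter, with a window of $n = 2$ and `'x'` standing in for the padding mark (shown as × in the table):

```python
# Sketch: build the predicate-context window (ctx-p) and region mark m_r.
sentence = ['A', 'record', 'date', 'has', "n't", 'been', 'set', '.']
pred_idx = sentence.index('set')
n = 2                                           # n words on each side of the predicate

window = range(pred_idx - n, pred_idx + n + 1)
ctx_p = [sentence[i] if 0 <= i < len(sentence) else 'x' for i in window]
mark = [1 if pred_idx - n <= i <= pred_idx + n else 0
        for i in range(len(sentence))]
print(ctx_p)  # ["n't", 'been', 'set', '.', 'x'] -- 'x' pads past the sentence end
print(mark)   # [0, 0, 0, 0, 1, 1, 1, 1]
```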
- -论元-> argu -谓词-> pred -谓词上下文-> ctx-p -谓词上下文区域标记-> $m_r$ -输入-> input -原句-> sentence -反向LSTM-> LSTM Reverse - -## Data Preparation - -In the tutorial, we use [CoNLL 2005](http://www.cs.upc.edu/~srlconll/) SRL task open dataset as an example. It is important to note that the training set and development set of the CoNLL 2005 SRL task are not free to download after the competition. Currently, only the test set can be obtained, including 23 sections of the Wall Street Journal and three sections of the Brown corpus. In this tutorial, we use the WSJ corpus as the training dataset to explain the model. However, since the training set is small, if you want to train a usable neural network SRL system, consider paying for the full corpus. - -The original data includes a variety of information such as POS tagging, naming entity recognition, parsing tree, and so on. In this tutorial, we only use the data under the words folder (text sequence) and the props folder (label results) inside test.wsj parent folder. The data directory used in this tutorial is as follows: - -```text -conll05st-release/ -└── test.wsj - ├── props # 标注结果 - └── words # 输入文本序列 -``` - -The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](# references)\]. The label of the PropBank is different from the label that we used in the example at the beginning of the article, but the principle is the same. For the description of the label, please refer to the paper \[[9](#references)\]. - -The raw data needs to be preprocessed before used by PaddlePaddle. The preprocessing consists of the following steps: - -1. Merge the text sequence and the tag sequence into the same record; -2. If a sentence contains $n$ predicates, the sentence will be processed $n$ times into $n$ separate training samples, each sample with a different predicate; -3. Extract the predicate context and construct the predicate context region marker; -4. Construct the markings in BIO format; -5. Obtain the integer index corresponding to the word according to the dictionary. - -```python -# import paddle.v2.dataset.conll05 as conll05 -# conll05.corpus_reader does step 1 and 2 as mentioned above. -# conll05.reader_creator does step 3 to 5. -# conll05.test gets preprocessed training instances. -``` - -After preprocessing completes, a training sample contains nine features, namely: word sequence, predicate, predicate context (5 columns), region mark sequence, label sequence. Following table is an example of a training sample. - -| word sequence | predicate | predicate context(5 columns) | region mark sequence | label sequence| -|---|---|---|---|---| -| A | set | n't been set . × | 0 | B-A1 | -| record | set | n't been set . × | 0 | I-A1 | -| date | set | n't been set . × | 0 | I-A1 | -| has | set | n't been set . × | 0 | O | -| n't | set | n't been set . × | 1 | B-AM-NEG | -| been | set | n't been set . × | 1 | O | -| set | set | n't been set . × | 1 | B-V | -| . | set | n't been set . × | 1 | O | - -In addition to the data, we provide following resources: - -| filename | explanation | -|---|---| -| word_dict | dictionary of input sentences, total 44068 words | -| label_dict | dictionary of labels, total 106 labels | -| predicate_dict | predicate dictionary, total 3162 predicates | -| emb | a pre-trained word vector lookup table, 32-dimentional | - -We trained in the English Wikipedia language model to get a word vector lookup table used to initialize the SRL model. 
During the SRL model training process, the word vector lookup table is no longer updated. About the language model and the word vector lookup table can refer to [word vector](https://github.com/PaddlePaddle/book/blob/develop/word2vec/README.md) tutorial. There are 995,000,000 token in training corpus, and the dictionary size is 4900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not in the 4900,000 words, and we see them all as unknown words, represented by ``. - -Get dictionary, print dictionary size: - -```python -import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 - -word_dict, verb_dict, label_dict = conll05.get_dict() -word_dict_len = len(word_dict) -label_dict_len = len(label_dict) -pred_len = len(verb_dict) - -print len(word_dict_len) -print len(label_dict_len) -print len(pred_len) -``` - -## Model configuration - -1. Define input data dimensions and model hyperparameters. - - ```python - mark_dict_len = 2 # Value range of region mark. Region mark is either 0 or 1, so range is 2 - word_dim = 32 # word vector dimension - mark_dim = 5 # adjacent dimension - hidden_dim = 512 # the dimension of LSTM hidden layer vector is 128 (512/4) - depth = 8 # depth of stacked LSTM - - # There are 9 features per sample, so we will define 9 data layers. - # They type for each layer is integer_value_sequence. - def d_type(value_range): - return paddle.data_type.integer_value_sequence(value_range) - - # word sequence - word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) - # predicate - predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) - - # 5 features for predicate context - ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) - ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) - ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) - ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) - ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) - - # region marker sequence - mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) - - # label sequence - target = paddle.layer.data(name='target', type=d_type(label_dict_len)) - ``` - - Speciala note: hidden_dim = 512 means LSTM hidden vector of 128 dimension (512/4). Please refer PaddlePaddle official documentation for detail: [lstmemory](http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/layers.html#lstmemory)。 - -2. The word sequence, predicate, predicate context, and region mark sequence are transformed into embedding vector sequences. - - ```python - - # Since word vectorlookup table is pre-trained, we won't update it this time. - # is_static being True prevents updating the lookup table during training. - emb_para = paddle.attr.Param(name='emb', initial_std=0., is_static=True) - # hyperparameter configurations - default_std = 1 / math.sqrt(hidden_dim) / 3.0 - std_default = paddle.attr.Param(initial_std=default_std) - std_0 = paddle.attr.Param(initial_std=0.) 
    predicate_embedding = paddle.layer.embedding(
        size=word_dim,
        input=predicate,
        param_attr=paddle.attr.Param(
            name='vemb', initial_std=default_std))
    mark_embedding = paddle.layer.embedding(
        size=mark_dim, input=mark, param_attr=std_0)

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        paddle.layer.embedding(
            size=word_dim, input=x, param_attr=emb_para) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)
    ```

3. The 8 LSTM units will be trained in alternating "forward / backward" order.

    ```python
    hidden_0 = paddle.layer.mixed(
        size=hidden_dim,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=emb, param_attr=std_default) for emb in emb_layers
        ])

    mix_hidden_lr = 1e-3
    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
    hidden_para_attr = paddle.attr.Param(
        initial_std=default_std, learning_rate=mix_hidden_lr)

    lstm_0 = paddle.layer.lstmemory(
        input=hidden_0,
        act=paddle.activation.Relu(),
        gate_act=paddle.activation.Sigmoid(),
        state_act=paddle.activation.Sigmoid(),
        bias_attr=std_0,
        param_attr=lstm_para_attr)

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = paddle.layer.mixed(
            size=hidden_dim,
            bias_attr=std_default,
            input=[
                paddle.layer.full_matrix_projection(
                    input=input_tmp[0], param_attr=hidden_para_attr),
                paddle.layer.full_matrix_projection(
                    input=input_tmp[1], param_attr=lstm_para_attr)
            ])

        lstm = paddle.layer.lstmemory(
            input=mix_hidden,
            act=paddle.activation.Relu(),
            gate_act=paddle.activation.Sigmoid(),
            state_act=paddle.activation.Sigmoid(),
            reverse=((i % 2) == 1),
            bias_attr=std_0,
            param_attr=lstm_para_attr)

        input_tmp = [mix_hidden, lstm]
    ```

4. We will concatenate the output of the top LSTM unit with its input, and project the result into a hidden layer. Then a fully connected layer is put on top of it to get the final vector representation.

    ```python
    feature_out = paddle.layer.mixed(
        size=label_dict_len,
        bias_attr=std_default,
        input=[
            paddle.layer.full_matrix_projection(
                input=input_tmp[0], param_attr=hidden_para_attr),
            paddle.layer.full_matrix_projection(
                input=input_tmp[1], param_attr=lstm_para_attr)
        ], )
    ```

5. We use a CRF layer as the cost function; the parameter of the CRF cost layer will be named `crfw`.

    ```python
    crf_cost = paddle.layer.crf(
        size=label_dict_len,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(
            name='crfw',
            initial_std=default_std,
            learning_rate=mix_hidden_lr))
    ```

6. The CRF decoding layer is used for evaluation and inference. It shares parameters with the CRF layer; the sharing of parameters among multiple layers is specified by using the same parameter name in these layers.

    ```python
    crf_dec = paddle.layer.crf_decoding(
        name='crf_dec_l',
        size=label_dict_len,
        input=feature_out,
        label=target,
        param_attr=paddle.attr.Param(name='crfw'))
    ```

## Train model

### Create Parameters

All necessary parameters will be created and traced given the output layers that we need to use.

```python
parameters = paddle.parameters.create([crf_cost, crf_dec])
```

We can print out the parameter names. A name will be auto-generated if not specified.

```python
print parameters.keys()
```

Now we load the pre-trained word lookup table.
```python
import numpy as np

def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip the 16-byte header of the parameter file
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
```

### Create Trainer

We will create the trainer given the model topology, parameters, and optimization method. We will use the most basic SGD method (a momentum optimizer with 0 momentum). Meanwhile, we will set the learning rate and regularization.

```python
optimizer = paddle.optimizer.Momentum(
    momentum=0,
    learning_rate=2e-2,
    regularization=paddle.optimizer.L2Regularization(rate=8e-4),
    model_average=paddle.optimizer.ModelAverage(
        average_window=0.5, max_average_window=10000), )

trainer = paddle.trainer.SGD(cost=crf_cost,
                             parameters=parameters,
                             update_equation=optimizer)
```

### Train

As mentioned in the data preparation section, we will use the CoNLL 2005 test corpus as the training dataset. `conll05.test()` outputs one training instance at a time. The instances will be shuffled and batched into mini-batches as input.

```python
reader = paddle.batch(
    paddle.reader.shuffle(
        conll05.test(), buf_size=8192), batch_size=20)
```

`feeding` is used to specify the relationship between data instances and data layers. For example, according to the following `feeding`, the 0th column of a data instance produced by `conll05.test()` corresponds to the data layer named `word_data`.

```python
feeding = {
    'word_data': 0,
    'ctx_n2_data': 1,
    'ctx_n1_data': 2,
    'ctx_0_data': 3,
    'ctx_p1_data': 4,
    'ctx_p2_data': 5,
    'verb_data': 6,
    'mark_data': 7,
    'target': 8
}
```

`event_handler` can be used as a callback for training events; it will be used as an argument for `train`. The following `event_handler` prints the cost during training.

```python
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f" % (
                event.pass_id, event.batch_id, event.cost)
```

`trainer.train` will train the model.

```python
trainer.train(
    reader=reader,
    event_handler=event_handler,
    num_passes=10000,
    feeding=feeding)
```

## Conclusion

Semantic Role Labeling is an important intermediate step in a wide range of natural language processing tasks. In this tutorial, we use SRL as an example to introduce how to use PaddlePaddle for sequence tagging tasks. The proposed models are from our published paper\[[10](#references)\]. We only use test data as an illustration, since the training data of the CoNLL 2005 dataset is not completely public. We hope to propose an end-to-end neural network model with fewer dependencies on natural language processing tools that is comparable to, or even better than, traditional models. Please check out our paper for more information and discussions.

## References
1. Sun W, Sui Z, Wang M, et al. [Chinese semantic role labeling with shallow parsing](http://www.aclweb.org/anthology/D09-1#page=1513)[C]//Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing: Volume 3-Volume 3. Association for Computational Linguistics, 2009: 1475-1483.
2. Pascanu R, Gulcehre C, Cho K, et al. [How to construct deep recurrent neural networks](https://arxiv.org/abs/1312.6026)[J]. arXiv preprint arXiv:1312.6026, 2013.
3. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](https://arxiv.org/abs/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
4. Bahdanau D, Cho K, Bengio Y.
[Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[J]. arXiv preprint arXiv:1409.0473, 2014.
5. Lafferty J, McCallum A, Pereira F. [Conditional random fields: Probabilistic models for segmenting and labeling sequence data](http://www.jmlr.org/papers/volume15/doppa14a/source/biblio.bib.old)[C]//Proceedings of the Eighteenth International Conference on Machine Learning (ICML). 2001, 1: 282-289.
6. Li Hang. Statistical Learning Methods[M]. Tsinghua University Press, Beijing, 2012.
7. Marcus M P, Marcinkiewicz M A, Santorini B. [Building a large annotated corpus of English: The Penn Treebank](http://repository.upenn.edu/cgi/viewcontent.cgi?article=1246&context=cis_reports)[J]. Computational Linguistics, 1993, 19(2): 313-330.
8. Palmer M, Gildea D, Kingsbury P. [The proposition bank: An annotated corpus of semantic roles](http://www.mitpressjournals.org/doi/pdfplus/10.1162/0891201053630264)[J]. Computational Linguistics, 2005, 31(1): 71-106.
9. Carreras X, Màrquez L. [Introduction to the CoNLL-2005 shared task: Semantic role labeling](http://www.cs.upc.edu/~srlconll/st05/papers/intro.pdf)[C]//Proceedings of the Ninth Conference on Computational Natural Language Learning. Association for Computational Linguistics, 2005: 152-164.
10. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
This tutorial is contributed by PaddlePaddle, and licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/).
diff --git a/label_semantic_roles/data/extract_dict_feature.py b/label_semantic_roles/data/extract_dict_feature.py
deleted file mode 100644
index da44111976a0dec68345fc139d0aa459ca9211c2..0000000000000000000000000000000000000000
--- a/label_semantic_roles/data/extract_dict_feature.py
+++ /dev/null
@@ -1,81 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
from optparse import OptionParser


def extract_dict_features(pair_file, feature_file):

    with open(pair_file) as fin, open(feature_file, 'w') as feature_out:
        for line in fin:
            sentence, predicate, labels = line.strip().split('\t')
            sentence_list = sentence.split()
            labels_list = labels.split()

            verb_index = labels_list.index('B-V')

            mark = [0] * len(labels_list)
            if verb_index > 0:
                mark[verb_index - 1] = 1
                ctx_n1 = sentence_list[verb_index - 1]
            else:
                ctx_n1 = 'bos'

            if verb_index > 1:
                mark[verb_index - 2] = 1
                ctx_n2 = sentence_list[verb_index - 2]
            else:
                ctx_n2 = 'bos'

            mark[verb_index] = 1
            ctx_0 = sentence_list[verb_index]

            if verb_index < len(labels_list) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence_list[verb_index + 1]
            else:
                ctx_p1 = 'eos'

            if verb_index < len(labels_list) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence_list[verb_index + 2]
            else:
                ctx_p2 = 'eos'


            feature_str = sentence + '\t' \
                          + predicate + '\t' \
                          + ctx_n2 + '\t' \
                          + ctx_n1 + '\t' \
                          + ctx_0 + '\t' \
                          + ctx_p1 + '\t' \
                          + ctx_p2 + '\t' \
                          + ' '.join([str(i) for i in mark]) + '\t' \
                          + labels

            feature_out.write(feature_str + '\n')


if __name__ == '__main__':

    usage = '-p pair_file -f feature_file'
    parser = OptionParser(usage)
    parser.add_option('-p', dest='pair_file', help='the pair file')
    parser.add_option('-f', dest='feature_file', help='the feature file')

    (options, args) = parser.parse_args()

    extract_dict_features(options.pair_file, options.feature_file)
diff --git a/label_semantic_roles/data/extract_pairs.py b/label_semantic_roles/data/extract_pairs.py
deleted file mode 100644
index 94a8488c16734eb1882d54f7ec36f4b9308c09d4..0000000000000000000000000000000000000000
--- a/label_semantic_roles/data/extract_pairs.py
+++ /dev/null
@@ -1,122 +0,0 @@
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
from optparse import OptionParser


def read_labels(props_file):
    '''
    A sentence may have more than one verb, and each verb has its own label sequence.
    labels is a 3-dimensional list:
    the first dimension stores the label sequences of all sentences; its length is the number of sentences,
    the second dimension stores all label sequences for one sentence,
    the third dimension stores each label for one word.
    '''
    labels = []
    with open(props_file) as fin:
        label_seqs_for_one_sentence = []
        one_seg_in_file = []
        for line in fin:
            line = line.strip()
            if line == '':
                for i in xrange(len(one_seg_in_file[0])):
                    a_kind_label = [x[i] for x in one_seg_in_file]
                    label_seqs_for_one_sentence.append(a_kind_label)
                labels.append(label_seqs_for_one_sentence)
                one_seg_in_file = []
                label_seqs_for_one_sentence = []
            else:
                part = line.split()
                one_seg_in_file.append(part)
    return labels


def read_sentences(words_file):
    sentences = []
    with open(words_file) as fin:
        s = ''
        for line in fin:
            line = line.strip()
            if line == '':
                sentences.append(s)
                s = ''
            else:
                s += line + ' '
    return sentences


def transform_labels(sentences, labels):
    sen_lab_pair = []
    for i in xrange(len(sentences)):
        if len(labels[i]) == 1:
            continue
        else:
            verb_list = []
            for x in labels[i][0]:
                if x != '-':
                    verb_list.append(x)

            for j in xrange(1, len(labels[i])):
                label_list = labels[i][j]
                current_tag = 'O'
                is_in_bracket = False
                label_seq = []
                verb_word = ''
                for ll in label_list:
                    if ll == '*' and is_in_bracket == False:
                        label_seq.append('O')
                    elif ll == '*' and is_in_bracket == True:
                        label_seq.append('I-' + current_tag)
                    elif ll == '*)':
                        label_seq.append('I-' + current_tag)
                        is_in_bracket = False
                    elif ll.find('(') != -1 and ll.find(')') != -1:
                        current_tag = ll[1:ll.find('*')]
                        label_seq.append('B-' + current_tag)
                        is_in_bracket = False
                    elif ll.find('(') != -1 and ll.find(')') == -1:
                        current_tag = ll[1:ll.find('*')]
                        label_seq.append('B-' + current_tag)
                        is_in_bracket = True
                    else:
                        print 'error:', ll
                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
    return sen_lab_pair


def write_file(sen_lab_pair, output_file):
    with open(output_file, 'w') as fout:
        for x in sen_lab_pair:
            sentence = x[0]
            label_seq = ' '.join(x[2])
            assert len(sentence.split()) == len(x[2])
            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')


if __name__ == '__main__':

    usage = '-w words_file -p props_file -o output_file'
    parser = OptionParser(usage)
    parser.add_option('-w', dest='words_file', help='the words file')
    parser.add_option('-p', dest='props_file', help='the props file')
    parser.add_option('-o', dest='output_file', help='the output_file')
    (options, args) = parser.parse_args()

    sentences = read_sentences(options.words_file)
    labels = read_labels(options.props_file)
    sen_lab_pair = transform_labels(sentences, labels)

    write_file(sen_lab_pair, options.output_file)
diff --git a/label_semantic_roles/data/get_data.sh b/label_semantic_roles/data/get_data.sh
deleted file mode 100644
index dafa218988e480ed826c69328b5df7cb50be437d..0000000000000000000000000000000000000000
--- a/label_semantic_roles/data/get_data.sh
+++ /dev/null
@@ -1,37 +0,0 @@
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -wget http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt -wget http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb -tar -xzvf conll05st-tests.tar.gz - -mv verbDict.txt predicate_dict -mv targetDict.txt label_dict -mv wordDict.txt word_dict -rm conll05st-tests.tar.gz - -cp ./conll05st-release/test.wsj/words/test.wsj.words.gz . -cp ./conll05st-release/test.wsj/props/test.wsj.props.gz . -gunzip test.wsj.words.gz -gunzip test.wsj.props.gz - -python extract_pairs.py -w test.wsj.words -p test.wsj.props -o test.wsj.seq_pair -python extract_dict_feature.py -p test.wsj.seq_pair -f feature - -echo `pwd`/feature > train.list -echo `pwd`/feature > test.list diff --git a/label_semantic_roles/dataprovider.py b/label_semantic_roles/dataprovider.py deleted file mode 100644 index ca2dcdff61681ec673b7ca97cc8ca3bb628cbb6e..0000000000000000000000000000000000000000 --- a/label_semantic_roles/dataprovider.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 0 - - -def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - settings.predicate_dict = predicate_dict - - #all inputs are integral and sequential type - settings.input_types = { - 'word_data': integer_value_sequence(len(word_dict)), - 'ctx_n2_data': integer_value_sequence(len(word_dict)), - 'ctx_n1_data': integer_value_sequence(len(word_dict)), - 'ctx_0_data': integer_value_sequence(len(word_dict)), - 'ctx_p1_data': integer_value_sequence(len(word_dict)), - 'ctx_p2_data': integer_value_sequence(len(word_dict)), - 'verb_data': integer_value_sequence(len(predicate_dict)), - 'mark_data': integer_value_sequence(2), - 'target': integer_value_sequence(len(label_dict)) - } - - -def get_batch_size(yield_data): - return len(yield_data[0]) - - -@provider( - init_hook=hook, - should_shuffle=True, - calc_batch_size=get_batch_size, - can_over_batch_size=True, - cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield { - 'word_data': word_slot, - 'ctx_n2_data': ctx_n2_slot, - 'ctx_n1_data': ctx_n1_slot, - 'ctx_0_data': ctx_0_slot, - 'ctx_p1_data': ctx_p1_slot, - 'ctx_p2_data': ctx_p2_slot, - 'verb_data': predicate_slot, - 'mark_data': mark_slot, - 'target': label_slot - } diff --git a/label_semantic_roles/db_lstm.py b/label_semantic_roles/db_lstm.py deleted file mode 100755 index 6baaf254b6fdfd3fc273240b017e3ae5cb08a855..0000000000000000000000000000000000000000 --- a/label_semantic_roles/db_lstm.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -import os -import sys -from paddle.trainer_config_helpers import * - -is_test = get_config_arg('is_test', bool, False) -is_predict = get_config_arg('is_predict', bool, False) - -#file paths -word_dict_file = './data/word_dict' -label_dict_file = './data/label_dict' -predicate_file = './data/predicate_dict' -train_list_file = './data/train.list' if not (is_test or is_predict) else None -test_list_file = './data/test.list' - - -def load_dict(dict_file_path): - data_dict = {} - with open(dict_file_path, "r") as fdict: - for idx, line in enumerate(fdict): - data_dict[line.strip()] = idx - return data_dict - - -if not is_predict: - #load dictionaries - word_dict = load_dict(word_dict_file) - label_dict = load_dict(label_dict_file) - predicate_dict = load_dict(predicate_file) - - #define data provider - define_py_data_sources2( - train_list=train_list_file, - test_list=test_list_file, - module='dataprovider', - obj='process', - args={ - 'word_dict': word_dict, - 'label_dict': label_dict, - 'predicate_dict': predicate_dict - }) - -word_dict_len = get_config_arg('dict_len', - int) if is_predict else len(word_dict) -label_dict_len = get_config_arg('label_len', - int) if is_predict else len(label_dict) -pred_len = get_config_arg('pred_len', - int) if is_predict else len(predicate_dict) - -############################## Hyper-parameters ################################## -mark_dict_len = 2 -word_dim = 32 -mark_dim = 5 -hidden_dim = 512 -depth = 8 - -########################### Optimizer ####################################### - -settings( - batch_size=1, - learning_method=MomentumOptimizer(momentum=0), - learning_rate=2e-2, - regularization=L2Regularization(8e-4), - model_average=ModelAverage( - average_window=0.5, max_average_window=10000), ) - -####################################### network ############################## -#8 features and 1 target -word = data_layer(name='word_data', size=word_dict_len) -predicate = data_layer(name='verb_data', size=pred_len) - -ctx_n2 = data_layer(name='ctx_n2_data', size=word_dict_len) -ctx_n1 = data_layer(name='ctx_n1_data', size=word_dict_len) -ctx_0 = data_layer(name='ctx_0_data', size=word_dict_len) -ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len) -ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len) -mark = data_layer(name='mark_data', size=mark_dict_len) - -if not is_predict: - target = data_layer(name='target', size=label_dict_len) - -default_std = 1 / math.sqrt(hidden_dim) / 3.0 - -emb_para = ParameterAttribute(name='emb', initial_std=0., is_static=True) -std_0 = ParameterAttribute(initial_std=0.) 
-std_default = ParameterAttribute(initial_std=default_std) - -predicate_embedding = embedding_layer( - size=word_dim, - input=predicate, - param_attr=ParameterAttribute( - name='vemb', initial_std=default_std)) - -word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] -emb_layers = [ - embedding_layer( - size=word_dim, input=x, param_attr=emb_para) for x in word_input -] -emb_layers.append(predicate_embedding) -mark_embedding = embedding_layer( - name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0) -emb_layers.append(mark_embedding) - -hidden_0 = mixed_layer( - name='hidden0', - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=emb, param_attr=std_default) for emb in emb_layers - ]) - -mix_hidden_lr = 1e-3 -lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0) -hidden_para_attr = ParameterAttribute( - initial_std=default_std, learning_rate=mix_hidden_lr) - -lstm_0 = lstmemory( - name='lstm0', - input=hidden_0, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - bias_attr=std_0, - param_attr=lstm_para_attr) - -#stack L-LSTM and R-LSTM with direct edges -input_tmp = [hidden_0, lstm_0] - -for i in range(1, depth): - - mix_hidden = mixed_layer( - name='hidden' + str(i), - size=hidden_dim, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ]) - - lstm = lstmemory( - name='lstm' + str(i), - input=mix_hidden, - act=ReluActivation(), - gate_act=SigmoidActivation(), - state_act=SigmoidActivation(), - reverse=((i % 2) == 1), - bias_attr=std_0, - param_attr=lstm_para_attr) - - input_tmp = [mix_hidden, lstm] - -feature_out = mixed_layer( - name='output', - size=label_dict_len, - bias_attr=std_default, - input=[ - full_matrix_projection( - input=input_tmp[0], param_attr=hidden_para_attr), - full_matrix_projection( - input=input_tmp[1], param_attr=lstm_para_attr) - ], ) - -if not is_predict: - crf_l = crf_layer( - name='crf', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute( - name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr)) - - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - label=target, - param_attr=ParameterAttribute(name='crfw')) - - eval = sum_evaluator(input=crf_dec_l) - - outputs(crf_l) - -else: - crf_dec_l = crf_decoding_layer( - name='crf_dec_l', - size=label_dict_len, - input=feature_out, - param_attr=ParameterAttribute(name='crfw')) - - outputs(crf_dec_l) diff --git a/label_semantic_roles/image/bio_example.png b/label_semantic_roles/image/bio_example.png deleted file mode 100644 index 9ffebf26e6b5f879849e24061bfcc1a3b36d2f9d..0000000000000000000000000000000000000000 Binary files a/label_semantic_roles/image/bio_example.png and /dev/null differ diff --git a/label_semantic_roles/image/dependency_parsing.png b/label_semantic_roles/image/dependency_parsing.png deleted file mode 100644 index e54df49321d0607b0c3ae3300d38176a21f50d57..0000000000000000000000000000000000000000 Binary files a/label_semantic_roles/image/dependency_parsing.png and /dev/null differ diff --git a/label_semantic_roles/index.en.html b/label_semantic_roles/index.en.html deleted file mode 100644 index b29478eb08e715d00df046da831241e0f3fd8e74..0000000000000000000000000000000000000000 --- a/label_semantic_roles/index.en.html +++ /dev/null @@ -1,563 +0,0 @@ - - - - - 
- - - - - - - diff --git a/label_semantic_roles/predict.py b/label_semantic_roles/predict.py deleted file mode 100644 index 372fd090b6e8f08f5bb34697772c2e4976810595..0000000000000000000000000000000000000000 --- a/label_semantic_roles/predict.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python predict.py -h -""" -UNK_IDX = 0 - - -class Prediction(): - def __init__(self, train_conf, dict_file, model_dir, label_file, - predicate_dict_file): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - - self.dict = {} - self.labels = {} - self.predicate_dict = {} - self.labels_reverse = {} - self.load_dict_label(dict_file, label_file, predicate_dict_file) - - len_dict = len(self.dict) - len_label = len(self.labels) - len_pred = len(self.predicate_dict) - - conf = parse_config( - train_conf, 'dict_len=' + str(len_dict) + ',label_len=' + - str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True') - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(model_dir) - - slots = [ - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_dict), integer_value_sequence(len_dict), - integer_value_sequence(len_pred), integer_value_sequence(2) - ] - self.converter = DataProviderConverter(slots) - - def load_dict_label(self, dict_file, label_file, predicate_dict_file): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(dict_file, 'r')): - self.dict[line.strip()] = line_count - - for line_count, line in enumerate(open(label_file, 'r')): - self.labels[line.strip()] = line_count - self.labels_reverse[line_count] = line.strip() - - for line_count, line in enumerate(open(predicate_dict_file, 'r')): - self.predicate_dict[line.strip()] = line_count - - def get_data(self, data_file): - """ - Get input data of paddle format. 
- """ - with open(data_file, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = line.strip( - ).split('\t') - words = sentence.split() - sen_len = len(words) - - word_slot = [self.dict.get(w, UNK_IDX) for w in words] - predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX) - ] * sen_len - ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [self.dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [self.dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - yield word_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot - - def predict(self, data_file, output_file): - """ - data_file: file name of input data. - """ - input = self.converter(self.get_data(data_file)) - output = self.network.forwardTest(input) - lab = output[0]["id"].tolist() - - with open(data_file, 'r') as fin, open(output_file, 'w') as fout: - index = 0 - for line in fin: - sen = line.split('\t')[0] - len_sen = len(sen.split()) - line_labels = lab[index:index + len_sen] - index += len_sen - fout.write(sen + '\t' + ' '.join( - [self.labels_reverse[i] for i in line_labels]) + '\n') - - -def option_parser(): - usage = ( - "python predict.py -c config -w model_dir " - "-d word dictionary -l label_file -i input_file -p pred_dict_file") - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-c", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-l", - "--label", - action="store", - dest="label_file", - default=None, - help="label file") - parser.add_option( - "-p", - "--predict_dict_file", - action="store", - dest="predict_dict_file", - default=None, - help="predict_dict_file") - parser.add_option( - "-i", - "--data", - action="store", - dest="data_file", - help="data file to predict") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - - parser.add_option( - "-o", - "--output_file", - action="store", - dest="output_file", - default=None, - help="output file") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - data_file = options.data_file - dict_file = options.dict_file - model_path = options.model_path - label_file = options.label_file - predict_dict_file = options.predict_dict_file - output_file = options.output_file - - swig_paddle.initPaddle("--use_gpu=0") - predict = Prediction(train_conf, dict_file, model_path, label_file, - predict_dict_file) - predict.predict(data_file, output_file) - - -if __name__ == '__main__': - main() diff --git a/label_semantic_roles/predict.sh b/label_semantic_roles/predict.sh deleted file mode 100755 index 873aad670d16803ce321ab60baabe9fe29ea64bf..0000000000000000000000000000000000000000 --- a/label_semantic_roles/predict.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -best_model_path="output/pass-${LOG[1]}" - -config_file=db_lstm.py -dict_file=./data/wordDict.txt -label_file=./data/targetDict.txt -predicate_dict_file=./data/verbDict.txt -input_file=./data/feature -output_file=predict.res - -python predict.py \ - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file diff --git a/label_semantic_roles/test.sh b/label_semantic_roles/test.sh deleted file mode 100755 index 11d9d6a19c1b17ad1b7540ee7a03017f85dd821e..0000000000000000000000000000000000000000 --- a/label_semantic_roles/test.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list - -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --config_args=is_test=1 \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'test.log' diff --git a/label_semantic_roles/train.sh b/label_semantic_roles/train.sh deleted file mode 100755 index b0efaf42270cbd807bf40def7a02e90b54c9cf92..0000000000000000000000000000000000000000 --- a/label_semantic_roles/train.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
set -e
paddle train \
  --config=./db_lstm.py \
  --use_gpu=0 \
  --log_period=10 \
  --dot_period=5000 \
  --trainer_count=1 \
  --show_parameter_stats_period=500 \
  --save_dir=./output \
  --num_passes=150 \
  --init_model_path=./data \
  --load_missing_parameter_strategy=rand \
  --test_all_data_in_one_period=1 \
  2>&1 | tee 'train.log'
diff --git a/machine_translation/README.en.md b/machine_translation/README.en.md
deleted file mode 100644
index 0e85dc0f8da18e68e21e28a8445ea827c1e1b1b8..0000000000000000000000000000000000000000
--- a/machine_translation/README.en.md
+++ /dev/null
@@ -1,725 +0,0 @@
# Machine Translation

The source code is located at [book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/machine_translation). Please refer to the PaddlePaddle [installation tutorial](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html) if you are a first-time user.

## Background

Machine translation (MT) leverages computers to translate from one language to another. The language to be translated is referred to as the source language, while the language to be translated into is referred to as the target language. Thus, machine translation is the process of translating from the source language to the target language. It is one of the most important research topics in the field of natural language processing.

Early machine translation systems were mainly rule-based, i.e., they relied on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in a single language, so it is an even greater challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty of obtaining a complete rule set \[[1](#References)\].

To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of being designed by a language expert. While these techniques overcome the bottleneck of knowledge acquisition, quite a few challenges remain, for example:

1. human-designed features cannot cover all possible linguistic variations;

2. it is difficult to use global features;

3. the techniques heavily rely on pre-processing steps such as word alignment, word segmentation and tokenization, rule extraction, and syntactic parsing. Errors introduced in any of these steps can accumulate and degrade translation quality.

The recent development of deep learning provides new solutions to these challenges. The two main categories of deep-learning-based machine translation techniques are:

1. techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., the language model and the reordering model (please refer to the left part of Figure 1);

2. techniques mapping from the source language to the target language directly using a neural network, known as end-to-end neural machine translation (NMT).

Figure 1. Neural Network based Machine Translation
This tutorial will mainly introduce an NMT model and how to use PaddlePaddle to train it.

## Illustrative Results

Let's consider an example of Chinese-to-English translation. The model is given the following segmented sentence in Chinese
```text
这些 是 希望 的 曙光 和 解脱 的 迹象 .
```
After training and with a beam-search size of 3, the generated translations are as follows:
```text
0 -5.36816 these are signs of hope and relief . <e>
1 -6.23177 these are the light of hope and relief . <e>
2 -7.7914 these are the light of hope and the relief of hope . <e>
```
- The first column corresponds to the id of the generated sentence; the second column corresponds to the score of the generated sentence (in descending order), where a larger value indicates better quality; the last column corresponds to the generated sentence.
- There are two special tokens: `<e>` denotes the end of a sentence, while `<unk>` denotes an unknown word, i.e., a word not in the training dictionary.

## Overview of the Model

This section will introduce the Gated Recurrent Unit (GRU), the bi-directional recurrent neural network, the Encoder-Decoder framework used in NMT, the attention mechanism, as well as the beam search algorithm.

### Gated Recurrent Unit (GRU)

We already introduced the RNN and LSTM in the [Sentiment Analysis](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md) chapter. Compared to a simple RNN, the LSTM adds a memory cell, an input gate, a forget gate, and an output gate. These gates combined with the memory cell greatly improve the ability to handle long-term dependencies.

GRU\[[2](#References)\], proposed by Cho et al., is a simplified LSTM and an extension of a simple RNN. It is shown in the figure below. A GRU unit has only two gates:
- reset gate: when this gate is closed, the history information is discarded, i.e., the irrelevant historical information has no effect on the future output.
- update gate: it combines the input gate and the forget gate and is used to control the impact of historical information on the hidden output. The historical information is passed over when the update gate is close to 1.

Figure 2. A GRU Gate
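To make the two gates concrete, here is a minimal NumPy sketch of a single GRU step. This is an illustrative rendering of the standard GRU formulation described above, not PaddlePaddle's implementation, and all weight names (`W_r`, `U_r`, etc.) are hypothetical:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h_prev, W_r, U_r, W_z, U_z, W_h, U_h):
    """One GRU step for input x and previous hidden state h_prev."""
    r = sigmoid(W_r.dot(x) + U_r.dot(h_prev))           # reset gate
    z = sigmoid(W_z.dot(x) + U_z.dot(h_prev))           # update gate
    h_cand = np.tanh(W_h.dot(x) + U_h.dot(r * h_prev))  # candidate state
    return z * h_prev + (1.0 - z) * h_cand              # z close to 1 keeps the history
```

When `r` is near zero, the candidate state ignores the previous hidden state (history discarded); when `z` is near one, the old state is passed over largely unchanged, matching the description above.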
Generally speaking, sequences with short-distance dependencies will have an active reset gate, while sequences with long-distance dependencies will have an active update gate. In addition, Chung et al. \[[3](#References)\] have empirically shown that although the GRU has fewer parameters, it performs similarly to the LSTM on several different tasks.

### Bi-directional Recurrent Neural Network

We already introduced an instance of a bi-directional RNN in the [Semantic Role Labeling](https://github.com/PaddlePaddle/book/blob/develop/label_semantic_roles/README.md) chapter. Here we present another bi-directional RNN model with a different architecture, proposed by Bengio et al. in \[[2](#References),[4](#References)\]. This model takes a sequence as input and outputs a fixed-dimensional feature vector at each step, encoding the context information at the corresponding time step.

Specifically, this bi-directional RNN processes the input sequence in the original and reverse order respectively, and then concatenates the output feature vectors at each time step as the final output. Thus the output node at each time step contains information from the past and the future as context. The figure below shows an unrolled bi-directional RNN. This network contains a forward RNN and a backward RNN with six weight matrices: weight matrices from the input to the forward and backward hidden layers ($W_1, W_3$), weight matrices from the hidden layers to themselves ($W_2, W_5$), and weight matrices from the forward and backward hidden layers to the output layer ($W_4, W_6$). Note that there are no connections between the forward and backward hidden layers.

Figure 3. Temporally unrolled bi-directional RNN
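The idea in code, as an illustrative NumPy sketch with a plain tanh recurrence standing in for whatever recurrent unit is actually used (not the PaddlePaddle API): run the same recurrence in both directions and concatenate the per-step outputs.

```python
import numpy as np

def rnn_states(xs, W, U, h0):
    """Return the hidden state after each step of a simple tanh RNN."""
    h, states = h0, []
    for x in xs:
        h = np.tanh(W.dot(x) + U.dot(h))
        states.append(h)
    return states

def bidirectional_states(xs, W_f, U_f, W_b, U_b, h0):
    forward = rnn_states(xs, W_f, U_f, h0)               # original order
    backward = rnn_states(xs[::-1], W_b, U_b, h0)[::-1]  # reverse order, re-aligned
    # each step's feature now carries past (forward) and future (backward) context
    return [np.concatenate([f, b]) for f, b in zip(forward, backward)]
```

Note that the forward and backward passes use separate weights (`W_f, U_f` vs. `W_b, U_b`) and never feed into each other, mirroring the absence of connections between the two hidden layers in Figure 3.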
### Encoder-Decoder Framework

The Encoder-Decoder\[[2](#References)\] framework addresses the mapping from one sequence to another, for sequences of arbitrary lengths. The source sequence is encoded into a vector via an encoder, which is then decoded into a target sequence via a decoder by maximizing the predictive probability. Both the encoder and the decoder are typically implemented via RNNs.

Figure 4. Encoder-Decoder Framework
#### Encoder

There are three steps for encoding a sentence:

1. One-hot vector representation of a word: Each word $x_i$ in the source sentence $x=\left \{ x_1,x_2,...,x_T \right \}$ is represented as a vector $w_i\in R^{\left | V \right |},i=1,2,...,T$, where $w_i$ has the same dimensionality as the size of the dictionary, i.e., $\left | V \right |$, and has an element of one at the location corresponding to the location of the word in the dictionary and zero elsewhere.

2. Word embedding as a representation in a low-dimensional semantic space: There are two problems with the one-hot vector representation:

    * the dimensionality of the vector is typically large, leading to the curse of dimensionality;

    * it is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\in R^{K\times \left | V \right |}$ as the projection matrix, where $K$ is the dimensionality of the word embedding vector.

3. Encoding of the source sequence via RNN: This can be described mathematically as:

    $$h_i=\phi _\theta \left ( h_{i-1}, s_i \right )$$

    where
    * $h_0$ is a zero vector,
    * $\phi _\theta$ is a non-linear activation function, and
    * $\mathbf{h}=\left \{ h_1,..., h_T \right \}$ is the sequential encoding of the first $T$ words from the source sequence. The vector representation of the whole sentence can be represented as the encoding vector at the last time step $T$ from $\mathbf{h}$, or by temporal pooling over $\mathbf{h}$.

A bi-directional RNN can also be used in step (3) for a more complicated sentence encoding. This can be implemented using a bi-directional GRU. The forward GRU encodes the source sequence in its original order $(x_1,x_2,...,x_T)$ and generates a sequence of hidden states $(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$. The backward GRU encodes the source sequence in reverse order, i.e., $(x_T,x_{T-1},...,x_1)$, and generates $(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$. Then for each word $x_i$, its complete hidden state is the concatenation of the corresponding hidden states from the two GRUs, i.e., $h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$.

Figure 5. Encoder using bi-directional GRU
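A quick sanity check on steps 1 and 2: multiplying the projection matrix $C$ by a one-hot vector simply selects a column of $C$, which is why embedding layers can be implemented as table lookups rather than full matrix products. The following illustrative NumPy snippet uses arbitrary sizes:

```python
import numpy as np

K, V = 4, 10                   # embedding dimension K, dictionary size |V|
C = np.random.randn(K, V)      # projection matrix C from the text

i = 7                          # dictionary index of some word x_i
w = np.zeros(V)
w[i] = 1.0                     # one-hot vector w_i

assert np.allclose(C.dot(w), C[:, i])   # s_i = C w_i is just column i of C
```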
#### Decoder

The goal of the decoder is to maximize the probability of the next correct word in the target language. The main idea is as follows:

1. At each time step $i$, given the encoding vector (or context vector) $c$ of the source sentence, the $i$-th word $u_i$ from the ground-truth target language sentence, and the RNN hidden state $z_i$, the next hidden state $z_{i+1}$ is computed as:

    $$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$

    where $\phi _{\theta '}$ is a non-linear activation function and $c=q(\mathbf{h})$ is the context vector of the source sentence. Without using [attention](#Attention Mechanism), if the output of the [encoder](#Encoder) is the encoding vector at the last time step of the source sentence, then $c$ can be defined as $c=h_T$. $u_i$ denotes the $i$-th word from the target language sentence, and $u_0$ denotes the beginning of the target language sentence (i.e., `<s>`), indicating the beginning of decoding. $z_i$ is the RNN hidden state at time step $i$ and $z_0$ is an all-zero vector.

2. Calculate the probability $p_{i+1}$ for the $i+1$-th word in the target language sequence by normalizing $z_{i+1}$ using `softmax` as follows:

    $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$

    where $W_sz_{i+1}+b_z$ scores each possible word and is then normalized via softmax to produce the probability $p_{i+1}$ for the $i+1$-th word.

3. Compute the cost according to $p_{i+1}$ and $u_{i+1}$.

4. Repeat Steps 1-3, until all the words in the target language sentence have been processed.

The generation process of machine translation is to translate the source sentence into a sentence in the target language according to a pre-trained model. There are some differences between the decoding step in generation and training. Please refer to [Beam Search Algorithm](#Beam Search Algorithm) for details.

### Attention Mechanism

There are a few problems with the fixed-dimensional vector representation from the encoding stage:

* It is very challenging to encode both the semantic and syntactic information of a sentence with a fixed-dimensional vector, regardless of the length of the sentence.
* Intuitively, when translating a sentence, we typically pay more attention to the parts of the source sentence that are more relevant to the current translation, and the focus changes as the translation proceeds. With a fixed-dimensional vector, all the information from the source sentence is treated equally in terms of attention, which is not reasonable. Therefore, Bahdanau et al. \[[4](#References)\] introduced the attention mechanism, which can decode based on different fragments of the context sequence in order to address the difficulty of feature learning for long sentences. The decoder with attention is explained in the following.

Different from the simple decoder, $z_i$ is computed as:

$$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$$

It is observed that for each word $u_i$ in the target language sentence, there is a corresponding context vector $c_i$ as the encoding of the source sentence, which is computed as:

$$c_i=\sum _{j=1}^{T}a_{ij}h_j, a_i=\left[ a_{i1},a_{i2},...,a_{iT}\right ]$$

It is noted that the attention mechanism is achieved by a weighted average over the RNN hidden states $h_j$.
The weight $a_{ij}$ denotes the strength of attention of the $i$-th word in the target language sentence to the $j$-th word in the source sentence, and is calculated as

\begin{align}
a_{ij}&=\frac{exp(e_{ij})}{\sum_{k=1}^{T}exp(e_{ik})}\\\\
e_{ij}&=align(z_i,h_j)\\\\
\end{align}

where $align$ is an alignment model that measures the fitness between the $i$-th word in the target language sentence and the $j$-th word in the source sentence. More concretely, the fitness is computed with the $i$-th hidden state $z_i$ of the decoder RNN and the $j$-th context vector $h_j$ of the source sentence. Hard alignment is used in the conventional alignment model, which means each word in the target language explicitly corresponds to one or more words from the source language sentence. In an attention model, soft alignment is used, where any word in the source sentence is related to any word in the target language sentence, and the strength of the relation is a real number computed via the model; thus it can be incorporated into the NMT framework and trained via back-propagation.

Figure 6. Decoder with Attention Mechanism
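The attention computation itself is a softmax followed by a weighted sum. Below is an illustrative NumPy sketch of computing $a_i$ and $c_i$ for one decoding step; the dot product is a hypothetical stand-in for the learned $align$ model, which in practice is a small trainable network:

```python
import numpy as np

def attention_context(z_i, H):
    """z_i: decoder state, shape (d,); H: encoder states h_1..h_T, shape (T, d)."""
    e = H.dot(z_i)                # e_ij = align(z_i, h_j), dot-product stand-in
    a = np.exp(e - e.max())
    a = a / a.sum()               # a_ij = softmax over source positions
    c_i = a.dot(H)                # c_i = sum_j a_ij * h_j
    return c_i, a
```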
### Beam Search Algorithm

[Beam Search](http://en.wikipedia.org/wiki/Beam_search) is a heuristic search algorithm that explores a graph by expanding the most promising nodes in a limited set. It is typically used when the solution space is huge (e.g., for machine translation and speech recognition) and there is not enough memory for all the possible solutions. For example, if we want to translate "`你好`" into English, even if there are only three words in the dictionary (`<s>`, `<e>`, `hello`), it is still possible to generate an infinite number of sentences, where the word `hello` can appear a different number of times. Beam search could be used to find a good translation among them.

Beam search builds a search tree using breadth-first search and sorts the nodes according to a heuristic cost (the sum of the log probabilities of the generated words) at each level of the tree. Only a fixed number of nodes, according to the pre-specified beam size (or beam width), are considered. Thus, only the nodes with the highest scores are expanded in the next level. This reduces the space and time requirements significantly. However, a globally optimal solution is not guaranteed.

The goal is to maximize the probability of the generated sequence when using beam search in decoding. The procedure is as follows:

1. At each time step $i$, compute the hidden state $z_{i+1}$ of the next time step according to the context vector $c$ of the source sentence, the $i$-th word $u_i$ generated for the target language sentence, and the RNN hidden state $z_i$.
2. Normalize $z_{i+1}$ using `softmax` to get the probability $p_{i+1}$ for the $i+1$-th word of the target language sentence.
3. Sample the word $u_{i+1}$ according to $p_{i+1}$.
4. Repeat Steps 1-3, until the end-of-sentence token `<e>` is generated or the maximum length of the sentence is reached.

Note: $z_{i+1}$ and $p_{i+1}$ are computed the same way as in [Decoder](#Decoder). In generation mode, each step is greedy, so there is no guarantee of a global optimum.

## Data Preparation

### Download and Decompression

This tutorial uses a dataset from [WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), where [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) is used as the training set, and [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) is used as the test and generation sets.

Run the following command in Linux to obtain the data:
```bash
cd data
./wmt14_data.sh
```
There are three folders in the downloaded dataset `data/wmt14`:

| Folder Name | French-English Parallel Corpus | Number of Files | Size of Files |
|---|---|---|---|
| train | ccb2_pc30.src, ccb2_pc30.trg, etc | 12 | 3.55G |
| test | ntst1213.src, ntst1213.trg | 2 | 1636k |
| gen | ntst14.src, ntst14.trg | 2 | 864k |
- `XXX.src` is the source file in French and `XXX.trg` is the target file in English. Each row of the file contains one sentence.
- `XXX.src` and `XXX.trg` have the same number of rows, and there is a one-to-one correspondence between the sentences at any row of the two files.

### User Defined Dataset (Optional)

To use your own dataset, just put it under the `data` folder and organize it as follows
```text
user_dataset
├── train
│   ├── train_file1.src
│   ├── train_file1.trg
│   └── ...
├── test
│   ├── test_file1.src
│   ├── test_file1.trg
│   └── ...
├── gen
│   ├── gen_file1.src
│   ├── gen_file1.trg
│   └── ...
```

Explanation of the directories:
- First level: `user_dataset`: the name of the user defined dataset.
- Second level: `train`, `test` and `gen`: these names should not be changed.
- Third level: parallel corpora in the source language and the target language, with the postfixes `.src` and `.trg` respectively.

### Data Pre-processing

There are two steps for pre-processing:
- Merge the source and target parallel corpus files into one file
  - Merge each `XXX.src` and `XXX.trg` file pair as `XXX`
  - The $i$-th row in `XXX` is the concatenation of the $i$-th row from `XXX.src` with the $i$-th row from `XXX.trg`, separated with '\t'.

- Create a source dictionary and a target dictionary, each containing **DICTSIZE** words, including the (DICTSIZE - 3) most frequent words from the corpus and 3 special tokens: `<s>` (begin of sequence), `<e>` (end of sequence) and `<unk>` (unknown words that are not in the vocabulary).

`preprocess.py` is used for pre-processing:
```bash
python preprocess.py -i INPUT [-d DICTSIZE] [-m]
```
- `-i INPUT`: path to the original dataset.
- `-d DICTSIZE`: number of words in the dictionary. If unspecified, the dictionary will contain all the words that appear in the input dataset.
- `-m --mergeDict`: merge the source dictionary with the target dictionary, making the two dictionaries have the same content.

The specific command to run the script is as follows:
```bash
python preprocess.py -i data/wmt14 -d 30000
```
You will see the following messages after a few minutes:
```text
concat parallel corpora for dataset
build source dictionary for train data
build target dictionary for train data
dictionary size is 30000
```
The pre-processed data is located at `data/pre-wmt14`:
```text
pre-wmt14
├── train
│   └── train
├── test
│   └── test
├── gen
│   └── gen
├── train.list
├── test.list
├── gen.list
├── src.dict
└── trg.dict
```
- `train`, `test` and `gen`: contain the French-English parallel corpora for training, testing and generation. Each row of each file is separated into two columns with a "\t", where the first column is the sequence in French and the second one is in English.
- `train.list`, `test.list` and `gen.list`: record respectively the paths to the `train`, `test` and `gen` folders.
- `src.dict` and `trg.dict`: source (French) and target (English) dictionaries. Each dictionary contains 30000 words (the 29997 most frequent words and the 3 special tokens).

### Providing Data to PaddlePaddle

We use `dataprovider.py` to provide data to PaddlePaddle as follows:

1. Import the PyDataProvider2 package from PaddlePaddle and define the three special tokens:

    ```python
    from paddle.trainer.PyDataProvider2 import *
    UNK_IDX = 2    # out of vocabulary word
    START = "<s>"  # begin of sequence
    END = "<e>"    # end of sequence
    ```
2.
-
-### Providing Data to PaddlePaddle
-
-We use `dataprovider.py` to provide data to PaddlePaddle as follows:
-
-1. Import the PyDataProvider2 package from PaddlePaddle and define three special tokens:
-
-    ```python
-    from paddle.trainer.PyDataProvider2 import *
-    UNK_IDX = 2      #out of vocabulary word
-    START = "<s>"    #begin of sequence
-    END = "<e>"      #end of sequence
-    ```
-2. Use the initialization function `hook` to define the input data types (`input_types`) for training and generation:
-   - Training: there are three input sequences, where the "source language sequence" and the "target language sequence" are the input and the "target language next word sequence" is the label.
-   - Generation: there are two input sequences, where the "source language sequence" is the input and the "source language sequence id" gives the ids of the input data (optional).
-
-   `src_dict_path` in the `hook` function is the path to the source language dictionary, while `trg_dict_path` is the path to the target language dictionary. `is_generating` is passed from the model config file. For more details on the usage of the `hook` function please refer to [Model Config](#Model Config).
-
-    ```python
-    def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
-             **kwargs):
-        # job_mode = 1: training; 0: generation
-        settings.job_mode = not is_generating
-
-        def fun(dict_path):  # load dictionary according to the path
-            out_dict = dict()
-            with open(dict_path, "r") as fin:
-                out_dict = {
-                    line.strip(): line_count
-                    for line_count, line in enumerate(fin)
-                }
-            return out_dict
-
-        settings.src_dict = fun(src_dict_path)
-        settings.trg_dict = fun(trg_dict_path)
-
-        if settings.job_mode:  #training
-            settings.input_types = {
-                'source_language_word':  #source language sequence
-                integer_value_sequence(len(settings.src_dict)),
-                'target_language_word':  #target language sequence
-                integer_value_sequence(len(settings.trg_dict)),
-                'target_language_next_word':  #target language next word sequence
-                integer_value_sequence(len(settings.trg_dict))
-            }
-        else:  #generation
-            settings.input_types = {
-                'source_language_word':  #source language sequence
-                integer_value_sequence(len(settings.src_dict)),
-                'sent_id':  #source language sequence id
-                integer_value_sequence(len(open(file_list[0], "r").readlines()))
-            }
-    ```
-3. Use the `process` function to open the file `file_name`, read each line of the file, convert the data to be compatible with `input_types`, and then use `yield` to return the data to the PaddlePaddle process. More specifically:
-
-   - add `<s>` to the beginning and `<e>` to the end of each source language sequence, producing "source_language_word";
-   - add `<s>` to the beginning of each target language sequence, producing "target_language_word";
-   - add `<e>` to the end of each target language sequence, producing "target_language_next_word".
-
-    ```python
-    def _get_ids(s, dictionary):  # get the position in the dictionary of each word of the sequence
-        words = s.strip().split()
-        return [dictionary[START]] + \
-               [dictionary.get(w, UNK_IDX) for w in words] + \
-               [dictionary[END]]
-
-    @provider(init_hook=hook, pool_size=50000)
-    def process(settings, file_name):
-        with open(file_name, 'r') as f:
-            for line_count, line in enumerate(f):
-                line_split = line.strip().split('\t')
-                if settings.job_mode and len(line_split) != 2:
-                    continue
-                src_seq = line_split[0]
-                src_ids = _get_ids(src_seq, settings.src_dict)
-
-                if settings.job_mode:
-                    trg_seq = line_split[1]
-                    trg_words = trg_seq.split()
-                    trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                    # sequences longer than 80 words will be removed during training to avoid an overly deep RNN.
-                    if len(src_ids) > 80 or len(trg_ids) > 80:
-                        continue
-                    trg_ids_next = trg_ids + [settings.trg_dict[END]]
-                    trg_ids = [settings.trg_dict[START]] + trg_ids
-                    yield {
-                        'source_language_word': src_ids,
-                        'target_language_word': trg_ids,
-                        'target_language_next_word': trg_ids_next
-                    }
-                else:
-                    yield {'source_language_word': src_ids, 'sent_id': [line_count]}
-    ```
-Note: The size of the training data is 3.55G. For machines with limited memory, it is recommended to use `pool_size` to limit the number of data samples buffered in memory.
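To make the conversion concrete, here is a toy walk-through (our own illustration, with made-up two-word dictionaries) of the sample that `process` yields for a single merged training line:

```python
# Toy dictionaries; the real ones come from src.dict / trg.dict (30000 entries).
src_dict = {"<s>": 0, "<e>": 1, "<unk>": 2, "les": 3, "chats": 4}
trg_dict = {"<s>": 0, "<e>": 1, "<unk>": 2, "the": 3, "cats": 4}

line = "les chats\tthe cats"          # one line of the merged corpus file
src_seq, trg_seq = line.split("\t")

src_ids = [0] + [src_dict.get(w, 2) for w in src_seq.split()] + [1]
trg_ids = [trg_dict.get(w, 2) for w in trg_seq.split()]

sample = {
    "source_language_word": src_ids,             # [0, 3, 4, 1], wrapped in <s> ... <e>
    "target_language_word": [0] + trg_ids,       # [0, 3, 4], <s> prepended
    "target_language_next_word": trg_ids + [1],  # [3, 4, 1], <e> appended
}
print(sample)
```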
-
-## Model Config
-
-### Data Definition
-
-1. Specify the paths to the dataset and the source/target dictionaries. `is_generating` accepts the argument passed from the command line and is used to denote whether the current configuration is for training (default) or generation; see [Usage and Results](#Usage and Results), and the sketch after this list for how the flag reaches the config.
-
-    ```python
-    import os
-    from paddle.trainer_config_helpers import *
-
-    data_dir = "./data/pre-wmt14"  # data path
-    src_lang_dict = os.path.join(data_dir, 'src.dict')  # path to the source language dictionary
-    trg_lang_dict = os.path.join(data_dir, 'trg.dict')  # path to the target language dictionary
-    is_generating = get_config_arg("is_generating", bool, False)  # config mode
-    ```
-2. Use `define_py_data_sources2` to get data from `dataprovider.py`, and use the `args` variable to pass in the source/target language dictionary paths and the config mode.
-
-    ```python
-    if not is_generating:
-        train_list = os.path.join(data_dir, 'train.list')
-        test_list = os.path.join(data_dir, 'test.list')
-    else:
-        train_list = None
-        test_list = os.path.join(data_dir, 'gen.list')
-
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={
-            "src_dict_path": src_lang_dict,  # source language dictionary path
-            "trg_dict_path": trg_lang_dict,  # target language dictionary path
-            "is_generating": is_generating  # config mode
-        })
-    ```
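As referenced above, here is a conceptual sketch (ours, deliberately simplified; the real implementation lives inside PaddlePaddle) of how a command-line override such as `--config_args=is_generating=1,gen_trans_file="gen_result"` reaches `get_config_arg`. The key-value parsing below is a hypothetical re-implementation for illustration only; the lookup-with-default behavior is what matters:

```python
# Hypothetical stand-in: --config_args is parsed into a key-value store, and
# get_config_arg(name, type, default) looks the name up there, falling back
# to the default when the flag was not passed.
_config_args = {"is_generating": "1", "gen_trans_file": "gen_result"}

def get_config_arg(name, arg_type, default):
    if name not in _config_args:
        return default
    value = _config_args[name]
    return bool(int(value)) if arg_type is bool else arg_type(value)

print(get_config_arg("is_generating", bool, False))  # True
print(get_config_arg("gen_trans_file", str, None))   # 'gen_result'
```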
-
-### Algorithm Configuration
-
-```python
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-```
-This tutorial uses the default SGD algorithm with the Adam learning method and a learning rate of 5e-4. Note that in generation mode, `batch_size = 50` means that 50 sequences are generated at a time.
-
-### Model Structure
-1. Define some global variables:
-
-    ```python
-    source_dict_dim = len(open(src_lang_dict, "r").readlines())  # size of the source language dictionary
-    target_dict_dim = len(open(trg_lang_dict, "r").readlines())  # size of the target language dictionary
-    word_vector_dim = 512  # dimensionality of the word vectors
-    encoder_size = 512  # dimensionality of the hidden state of the encoder GRU
-    decoder_size = 512  # dimensionality of the hidden state of the decoder GRU
-
-    if is_generating:
-        beam_size=3  # beam size for the beam search algorithm
-        max_length=250  # maximum length of a generated sentence
-        gen_trans_file = get_config_arg("gen_trans_file", str, None)  # file to save the generated results
-    ```
-
-2. Implement the Encoder as follows:
-
-   2.1 Input the one-hot vector representation $\mathbf{w}$ of the source language sentence, converted by `dataprovider.py`:
-
-    ```python
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-    ```
-   2.2 Map the one-hot vector into a word vector $\mathbf{s}$ in a low-dimensional semantic space:
-
-    ```python
-    src_embedding = embedding_layer(
-        input=src_word_id,
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
-    ```
-   2.3 Use a bi-directional GRU to encode the source language sequence, and concatenate the encoding outputs of the two GRUs to get $\mathbf{h}$:
-
-    ```python
-    src_forward = simple_gru(input=src_embedding, size=encoder_size)
-    src_backward = simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
-    ```
-
-3. Implement the Attention-based Decoder as follows:
-
-   3.1 Get a projection of the encoding of the source language sequence (c.f. 2.3) by passing it through a feed-forward neural network:
-
-    ```python
-    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(input=encoded_vector)
-    ```
-   3.2 Use a non-linear transformation of the last hidden state of the backward GRU on the source language sentence as the initial state of the decoder RNN, i.e. $c_0=h_T$:
-
-    ```python
-    backward_first = first_seq(input=src_backward)
-    with mixed_layer(
-            size=decoder_size,
-            act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(input=backward_first)
-    ```
-   3.3 Define the computation of the decoder RNN at each time step, i.e., predict the probability $p_{i+1}$ of the $(i+1)$-th word in the target language from the current context vector $c_i$, the decoder hidden state $z_i$ and the $i$-th target language word $u_i$.
-
-   - decoder_mem records the hidden state $z_i$ from the previous time step, initialized with decoder_boot.
-   - context is computed via `simple_attention` as $c_i=\sum_{j=1}^{T}a_{ij}h_j$, where enc_vec is $h_j$ and enc_proj is the projection of $h_j$ (c.f. 3.1). The weights $a_{ij}$ are calculated within `simple_attention`; a numeric sketch of this weighted sum is given after this section.
-   - decoder_inputs fuses $c_i$ with the representation of the current word current_word (i.e., $u_i$).
-   - gru_step uses the `gru_step_layer` function to compute $z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$.
-   - Softmax normalization is used in the end to compute the probability of words, i.e., $p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$. The output is returned.
-
-    ```python
-    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-        decoder_mem = memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem, )
-
-        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(input=current_word)
-
-        gru_step = gru_step_layer(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with mixed_layer(
-                size=target_dict_dim, bias_attr=True,
-                act=SoftmaxActivation()) as out:
-            out += full_matrix_projection(input=gru_step)
-        return out
-    ```
-4. Decoder differences between training and generation:
-
-   4.1 Define the name of the decoder and the first two inputs of `gru_decoder_with_attention`. Note that `StaticInput` is used for these two inputs. Please refer to [StaticInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for more details.
-
-    ```python
-    decoder_group_name = "decoder_group"
-    group_input1 = StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-    ```
-   4.2 In training mode:
-
-   - the word embedding of the target language, trg_embedding, is passed to `gru_decoder_with_attention` as current_word.
-   - `recurrent_group` calls `gru_decoder_with_attention` in a recurrent way.
-   - the sequence of next words from the target language is used as the label (lbl).
-   - multi-class cross-entropy (`classification_cost`) is used to calculate the cost.
-
-    ```python
-    if not is_generating:
-        trg_embedding = embedding_layer(
-            input=data_layer(
-                name='target_language_word', size=target_dict_dim),
-            size=word_vector_dim,
-            param_attr=ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-
-        decoder = recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs)
-
-        lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
-        cost = classification_cost(input=decoder, label=lbl)
-        outputs(cost)
-    ```
-   4.3 In generation mode:
-
-   - during generation, as the decoder RNN takes the word vector generated at the previous time step as input, `GeneratedInput` is used to implement this automatically. Please refer to [GeneratedInput Document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入) for details.
-   - `beam_search` calls `gru_decoder_with_attention` to generate the word ids of the output sequence.
-   - `seqtext_printer_evaluator` writes the generated sentences to `gen_trans_file`, using `trg_lang_dict` to map the word ids back to words.
-
-    ```python
-    else:
-        trg_embedding = GeneratedInput(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-
-        beam_gen = beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-
-        seqtext_printer_evaluator(
-            input=beam_gen,
-            id_input=data_layer(
-                name="sent_id", size=1),
-            dict_file=trg_lang_dict,
-            result_file=gen_trans_file)
-        outputs(beam_gen)
-    ```
-Note: Our configuration is based on Bahdanau et al. \[[4](#References)\] but with a few simplifications. Please refer to [issue #1133](https://github.com/PaddlePaddle/Paddle/issues/1133) for more details.
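As promised in 3.3, the following numpy snippet (our own illustration, not tutorial code) spells out the arithmetic that `simple_attention` encapsulates, with random stand-ins for the encoder states and alignment scores:

```python
import numpy as np

T, H = 4, 6                      # source length, encoder state size (toy values)
h = np.random.randn(T, H)        # encoder states h_j, one row per source word
e = np.random.randn(T)           # alignment scores e_ij = align(z_i, h_j), fixed i
a = np.exp(e) / np.exp(e).sum()  # attention weights a_ij: softmax over j
c = a @ h                        # context c_i = sum_j a_ij * h_j

print(a.sum())   # ~1.0: the weights form a distribution over source words
print(c.shape)   # (6,): one context vector per decoding step
```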
-
-
-## Model Training
-
-Training can be started with the following command:
-
-```bash
-./train.sh
-```
-where `train.sh` contains:
-
-```bash
-paddle train \
---config='seqToseq_net.py' \
---save_dir='model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'train.log'
-```
-- config: configuration file for the network
-- save_dir: path to save the trained model
-- use_gpu: whether to use GPU for training; CPU is used here
-- num_passes: number of passes for training. In PaddlePaddle, one pass means one complete training pass over all the data in the training set
-- show_parameter_stats_period: here we show the statistics of parameters every 100 batches
-- trainer_count: the number of CPU threads or GPU devices
-- log_period: here we print a log line every 10 batches
-- dot_period: here we print one "." every 5 batches
-
-The training loss will be printed every 10 batches, and you will see messages like those below:
-```text
-I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155
-I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065
-.....
-```
-- AvgCost: the average cost from batch-0 to the current batch.
-- CurrentCost: the cost of the current batch.
-- classification\_error\_evaluator (Eval): the average per-word prediction error rate from the first evaluation to the current one.
-- classification\_error\_evaluator (CurrentEval): the per-word prediction error rate of the current evaluation.
-
-The model training is successful when the classification\_error\_evaluator is lower than 0.35.
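As a quick sanity check on how AvgCost aggregates (our own arithmetic, using only the two log lines above): each log line covers 10 batches of 50 samples, i.e. 500 samples, and AvgCost is the sample-weighted mean of the costs so far:

```python
# Reproduce AvgCost at Batch=20 from the log lines above.
avg_cost_batch10, cur_cost_batch20 = 198.475, 116.483
avg_cost_batch20 = (avg_cost_batch10 * 500 + cur_cost_batch20 * 500) / 1000
print(avg_cost_batch20)  # ~157.479, matching AvgCost in the second log line
```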
-
-## Model Usage
-
-### Download Pre-trained Model
-
-As the training of an NMT model is very time consuming, we provide a pre-trained model (pass-00012, ~205M). The model was trained on a cluster of 50 physical nodes (each node with two 6-core CPUs). We trained 16 passes (taking about 5 days), with each pass taking about 7 hours. The provided model (pass-00012) has the highest [BLEU score](#BLEU Evaluation) of 26.92 among the 16 passes. Run the following command to download the model:
-```bash
-cd pretrained
-./wmt14_model.sh
-```
-
-### Usage and Results
-
-Run the following command to perform translation from French to English:
-
-```bash
-./gen.sh
-```
-where `gen.sh` contains:
-
-```bash
-paddle train \
---job=test \
---config='seqToseq_net.py' \
---save_dir='pretrained/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
---config_args=is_generating=1,gen_trans_file="gen_result" \
-2>&1 | tee 'translation/gen.log'
-```
-Parameters that differ from training are listed as follows:
-- job: set the mode to testing.
-- save_dir: path to the pre-trained model.
-- num_passes and test_pass: load the model parameters of the passes $i\in \left [ test\\_pass, num\\_passes-1 \right ]$. Here we only load `data/wmt14_model/pass-00012`.
-- config_args: pass self-defined command line parameters to the model configuration. `is_generating=1` indicates generation mode and `gen_trans_file="gen_result"` specifies the file for the generated results.
-
-For translation results please refer to [Illustrative Results](#Illustrative Results).
-
-### BLEU Evaluation
-
-BLEU (Bilingual Evaluation Understudy) is a widely used automatic evaluation metric for machine translation, proposed by the IBM Watson Research Center in 2002\[[5](#References)\]. The basic idea is that the closer the translation produced by a machine is to the translation produced by a human expert, the better the performance of the translation system. To measure this closeness, BLEU uses sentence-level precision: it compares the n-grams of the machine translation with those of the reference translation, and more matches lead to a higher BLEU score.
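The following toy snippet (ours, unigram case only) illustrates the matching idea; real BLEU, as computed by multi-bleu.perl below, combines clipped 1- to 4-gram precisions with a brevity penalty:

```python
# Clipped unigram precision: count candidate words that also occur in the
# reference, never crediting a word more often than it appears there.
from collections import Counter

candidate = "these are signs of light and relief .".split()
reference = "these are signs of hope and relief .".split()

cand, ref = Counter(candidate), Counter(reference)
matched = sum(min(n, ref[w]) for w, n in cand.items())
print(matched / len(candidate))  # 7/8 = 0.875 ("light" has no match)
```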
-[Moses](http://www.statmt.org/moses/) is an open-source machine translation system; we use its [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) script for BLEU evaluation. Run the following command to download it:
-```bash
-./moses_bleu.sh
-```
-BLEU evaluation can then be performed with the `eval_bleu` script as follows, where FILE is the name of the file to be evaluated and BEAMSIZE is the beam size; `data/wmt14/gen/ntst14.trg` is used as the reference translation by default.
-```bash
-./eval_bleu.sh FILE BEAMSIZE
-```
-Specifically, the script is run as follows:
-```bash
-./eval_bleu.sh gen_result 3
-```
-You will see the following message as output:
-```text
-BLEU = 26.92
-```
-
-## Summary
-
-End-to-end neural machine translation is a recently developed way to perform machine translation. In this chapter, we introduced the typical "Encoder-Decoder" framework and the "attention" mechanism. Since NMT is a typical Sequence-to-Sequence (Seq2Seq) learning problem, tasks such as query rewriting, summarization and single-turn dialogue can all be solved with the model presented in this chapter.
-
-## References
-
-1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009.
-2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
-3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014.
-4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015.
-5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318.
-
-
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/machine_translation/README.md b/machine_translation/README.md deleted file mode 100644 index 01cf3dc51e04a7c692dbe1935319258b806c65bd..0000000000000000000000000000000000000000 --- a/machine_translation/README.md +++ /dev/null @@ -1,689 +0,0 @@ -# 机器翻译 - -本教程源代码目录在[book/machine_translation](https://github.com/PaddlePaddle/book/tree/develop/machine_translation), 初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 - -## 背景介绍 - -机器翻译(machine translation, MT)是用计算机来实现不同语言之间翻译的技术。被翻译的语言通常称为源语言(source language),翻译成的结果语言称为目标语言(target language)。机器翻译即实现从源语言到目标语言转换的过程,是自然语言处理的重要研究领域之一。 - -早期机器翻译系统多为基于规则的翻译系统,需要由语言学家编写两种语言之间的转换规则,再将这些规则录入计算机。该方法对语言学家的要求非常高,而且我们几乎无法总结一门语言会用到的所有规则,更何况两种甚至更多的语言。因此,传统机器翻译方法面临的主要挑战是无法得到一个完备的规则集合\[[1](#参考文献)\]。 - -为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。 - -近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。 -

-
-图1. 基于神经网络的机器翻译系统 -

- -本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。 - -## 效果展示 - -以中英翻译(中文翻译到英文)的模型为例,当模型训练完毕时,如果输入如下已分词的中文句子: -```text -这些 是 希望 的 曙光 和 解脱 的 迹象 . -``` -如果设定显示翻译结果的条数(即[柱搜索算法](#柱搜索算法)的宽度)为3,生成的英语句子如下: -```text -0 -5.36816 these are signs of hope and relief . -1 -6.23177 these are the light of hope and relief . -2 -7.7914 these are the light of hope and the relief of hope . -``` -- 左起第一列是生成句子的序号;左起第二列是该条句子的得分(从大到小),分值越高越好;左起第三列是生成的英语句子。 -- 另外有两个特殊标志:``表示句子的结尾,``表示未登录词(unknown word),即未在训练字典中出现的词。 - -## 模型概览 - -本节依次介绍GRU(Gated Recurrent Unit,门控循环单元),双向循环神经网络(Bi-directional Recurrent Neural Network),NMT模型中典型的编码器-解码器(Encoder-Decoder)框架和注意力(Attention)机制,以及柱搜索(beam search)算法。 - -### GRU - -我们已经在[情感分析](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md)一章中介绍了循环神经网络(RNN)及长短时间记忆网络(LSTM)。相比于简单的RNN,LSTM增加了记忆单元(memory cell)、输入门(input gate)、遗忘门(forget gate)及输出门(output gate),这些门及记忆单元组合起来大大提升了RNN处理远距离依赖问题的能力。 - -GRU\[[2](#参考文献)\]是Cho等人在LSTM上提出的简化版本,也是RNN的一种扩展,如下图所示。GRU单元只有两个门: -- 重置门(reset gate):如果重置门关闭,会忽略掉历史信息,即历史不相干的信息不会影响未来的输出。 -- 更新门(update gate):将LSTM的输入门和遗忘门合并,用于控制历史信息对当前时刻隐层输出的影响。如果更新门接近1,会把历史信息传递下去。 -

-
-图2. GRU(门控循环单元) -

- -一般来说,具有短距离依赖属性的序列,其重置门比较活跃;相反,具有长距离依赖属性的序列,其更新门比较活跃。另外,Chung等人\[[3](#参考文献)\]通过多组实验表明,GRU虽然参数更少,但是在多个任务上都和LSTM有相近的表现。 - -### 双向循环神经网络 - -我们已经在[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/label_semantic_roles/README.md)一章中介绍了一种双向循环神经网络,这里介绍Bengio团队在论文\[[2](#参考文献),[4](#参考文献)\]中提出的另一种结构。该结构的目的是输入一个序列,得到其在每个时刻的特征表示,即输出的每个时刻都用定长向量表示到该时刻的上下文语义信息。 - -具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵($W_1, W_3$),隐层到隐层自己的权重矩阵($W_2,W_5$),前向隐层和后向隐层到输出层的权重矩阵($W_4, W_6$)。注意,该网络的前向隐层和后向隐层之间没有连接。 - -

-
-图3. 按时间步展开的双向循环神经网络 -

- -### 编码器-解码器框架 - -编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。 -

-
-图4. 编码器-解码器框架
-

- -#### 编码器 - -编码阶段分为三步: - -1. one-hot vector表示:将源语言句子$x=\left \{ x_1,x_2,...,x_T \right \}$的每个词$x_i$表示成一个列向量$w_i\epsilon R^{\left | V \right |},i=1,2,...,T$。这个向量$w_i$的维度与词汇表大小$\left | V \right |$ 相同,并且只有一个维度上有值1(该位置对应该词在词汇表中的位置),其余全是0。 - -2. 映射到低维语义空间的词向量:one-hot vector表示存在两个问题,1)生成的向量维度往往很大,容易造成维数灾难;2)难以刻画词与词之间的关系(如语义相似性,也就是无法很好地表达语义)。因此,需再one-hot vector映射到低维的语义空间,由一个固定维度的稠密向量(称为词向量)表示。记映射矩阵为$C\epsilon R^{K\times \left | V \right |}$,用$s_i=Cw_i$表示第$i$个词的词向量,$K$为向量维度。 - -3. 用RNN编码源语言词序列:这一过程的计算公式为$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$,其中$h_0$是一个全零的向量,$\varnothing _\theta$是一个非线性激活函数,最后得到的$\mathbf{h}=\left \{ h_1,..., h_T \right \}$就是RNN依次读入源语言$T$个词的状态编码序列。整句话的向量表示可以采用$\mathbf{h}$在最后一个时间步$T$的状态编码,或使用时间维上的池化(pooling)结果。 - -第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列$(x_1,x_2,...,x_T)$的顺序依次编码源语言端词,并得到一系列隐层状态$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$。类似的,后向GRU按照$(x_T,x_{T-1},...,x_1)$的顺序依次编码源语言端词,得到$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$。最后对于词$x_i$,通过拼接两个GRU的结果得到它的隐层状态,即$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$。 - -

-
-图5. 使用双向GRU的编码器 -

- -#### 解码器 - -机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是: - -1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)$c$、真实目标语言序列的第$i$个词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。计算公式如下: - - $$z_{i+1}=\phi _{\theta '}\left ( c,u_i,z_i \right )$$ - - 其中$\phi _{\theta '}$是一个非线性激活函数;$c=q\mathbf{h}$是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义$c=h_T$;$u_i$是目标语言序列的第$i$个单词,$u_0$是目标语言序列的开始标记``,表示解码开始;$z_i$是$i$时刻解码RNN的隐层状态,$z_0$是一个全零的向量。 - -2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。概率分布公式如下: - - $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$ - - 其中$W_sz_{i+1}+b_z$是对每个可能的输出单词进行打分,再用softmax归一化就可以得到第$i+1$个词的概率$p_{i+1}$。 - -3. 根据$p_{i+1}$和$u_{i+1}$计算代价。 -4. 重复步骤1~3,直到目标语言序列中的所有词处理完毕。 - -机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。 - -### 注意力机制 - -如果编码阶段的输出是一个固定维度的向量,会带来以下两个问题:1)不论源语言序列的长度是5个词还是50个词,如果都用固定维度的向量去编码其中的语义和句法结构信息,对模型来说是一个非常高的要求,特别是对长句子序列而言;2)直觉上,当人类翻译一句话时,会对与当前译文更相关的源语言片段上给予更多关注,且关注点会随着翻译的进行而改变。而固定维度的向量则相当于,任何时刻都对源语言所有信息给予了同等程度的关注,这是不合理的。因此,Bahdanau等人\[[4](#参考文献)\]引入注意力(attention)机制,可以对编码后的上下文片段进行解码,以此来解决长句子的特征学习问题。下面介绍在注意力机制下的解码器结构。 - -与简单的解码器不同,这里$z_i$的计算公式为: - -$$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$$ - -可见,源语言句子的编码向量表示为第$i$个词的上下文片段$c_i$,即针对每一个目标语言中的词$u_i$,都有一个特定的$c_i$与之对应。$c_i$的计算公式如下: - -$$c_i=\sum _{j=1}^{T}a_{ij}h_j, a_i=\left[ a_{i1},a_{i2},...,a_{iT}\right ]$$ - -从公式中可以看出,注意力机制是通过对编码器中各时刻的RNN状态$h_j$进行加权平均实现的。权重$a_{ij}$表示目标语言中第$i$个词对源语言中第$j$个词的注意力大小,$a_{ij}$的计算公式如下: - -\begin{align} -a_{ij}&=\frac{exp(e_{ij})}{\sum_{k=1}^{T}exp(e_{ik})}\\\\ -e_{ij}&=align(z_i,h_j)\\\\ -\end{align} - -其中,$align$可以看作是一个对齐模型,用来衡量目标语言中第$i$个词和源语言中第$j$个词的匹配程度。具体而言,这个程度是通过解码RNN的第$i$个隐层状态$z_i$和源语言句子的第$j$个上下文片段$h_j$计算得到的。传统的对齐模型中,目标语言的每个词明确对应源语言的一个或多个词(hard alignment);而在注意力模型中采用的是soft alignment,即任何两个目标语言和源语言词间均存在一定的关联,且这个关联强度是由模型计算得到的实数,因此可以融入整个NMT框架,并通过反向传播算法进行训练。 - -

-
-图6. 基于注意力机制的解码器 -

- -### 柱搜索算法 - -柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。 - -柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。 - -使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是: - -1. 每一个时刻,根据源语言句子的编码信息$c$、生成的第$i$个目标语言序列单词$u_i$和$i$时刻RNN的隐层状态$z_i$,计算出下一个隐层状态$z_{i+1}$。 -2. 将$z_{i+1}$通过`softmax`归一化,得到目标语言序列的第$i+1$个单词的概率分布$p_{i+1}$。 -3. 根据$p_{i+1}$采样出单词$u_{i+1}$。 -4. 重复步骤1~3,直到获得句子结束标记``或超过句子的最大生成长度为止。 - -注意:$z_{i+1}$和$p_{i+1}$的计算公式同[解码器](#解码器)中的一样。且由于生成时的每一步都是通过贪心法实现的,因此并不能保证得到全局最优解。 - -## 数据准备 - -### 下载与解压缩 - -本教程使用[WMT-14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/)数据集中的[bitexts(after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练集,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为测试集和生成集。 - -在Linux下,只需简单地运行以下命令: -```bash -cd data -./wmt14_data.sh -``` -得到的数据集`data/wmt14`包含如下三个文件夹: -

-<table>
-<tr><th>文件夹名</th><th>法英平行语料文件</th><th>文件数</th><th>文件大小</th></tr>
-<tr><td>train</td><td>ccb2_pc30.src, ccb2_pc30.trg, etc</td><td>12</td><td>3.55G</td></tr>
-<tr><td>test</td><td>ntst1213.src, ntst1213.trg</td><td>2</td><td>1636k</td></tr>
-<tr><td>gen</td><td>ntst14.src, ntst14.trg</td><td>2</td><td>864k</td></tr>
-</table>
-

- -- `XXX.src`是源法语文件,`XXX.trg`是目标英语文件,文件中的每行存放一个句子 -- `XXX.src`和`XXX.trg`的行数一致,且两者任意第$i$行的句子之间都有着一一对应的关系。 - -### 用户自定义数据集(可选) - -如果您想使用自己的数据集,只需按照如下方式组织,并将它们放在`data`目录下: -```text -user_dataset -├── train -│   ├── train_file1.src -│   ├── train_file1.trg -│   └── ... -├── test -│   ├── test_file1.src -│   ├── test_file1.trg -│   └── ... -├── gen -│   ├── gen_file1.src -│   ├── gen_file1.trg -│   └── ... -``` - -- 一级目录`user_dataset`:用户自定义的数据集名字。 -- 二级目录`train`、`test`和`gen`:必须使用这三个文件夹名字。 -- 三级目录:存放源语言到目标语言的平行语料库文件,后缀名必须使用`.src`和`.trg`。 - -### 数据预处理 - -我们的预处理流程包括两步: -- 将每个源语言到目标语言的平行语料库文件合并为一个文件: - - 合并每个`XXX.src`和`XXX.trg`文件为`XXX`。 - - `XXX`中的第$i$行内容为`XXX.src`中的第$i$行和`XXX.trg`中的第$i$行连接,用'\t'分隔。 -- 创建训练数据的“源字典”和“目标字典”。每个字典都有**DICTSIZE**个单词,包括:语料中词频最高的(DICTSIZE - 3)个单词,和3个特殊符号``(序列的开始)、``(序列的结束)和``(未登录词)。 - -预处理可以使用`preprocess.py`: -```python -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` -- `-i INPUT`:输入的原始数据集路径。 -- `-d DICTSIZE`:指定的字典单词数,如果没有设置,字典会包含输入数据集中的所有单词。 -- `-m --mergeDict`:合并“源字典”和“目标字典”,即这两个字典的内容完全一样。 - -本教程的具体命令如下: -```python -python preprocess.py -i data/wmt14 -d 30000 -``` -请耐心等待几分钟的时间,您会在屏幕上看到: -```text -concat parallel corpora for dataset -build source dictionary for train data -build target dictionary for train data -dictionary size is 30000 -``` -预处理好的数据集存放在`data/pre-wmt14`目录下: -```text -pre-wmt14 -├── train -│   └── train -├── test -│   └── test -├── gen -│   └── gen -├── train.list -├── test.list -├── gen.list -├── src.dict -└── trg.dict -``` -- `train`、`test`和`gen`:分别包含了法英平行语料库的训练、测试和生成数据。其每个文件的每一行以“\t”分为两列,第一列是法语序列,第二列是对应的英语序列。 -- `train.list`、`test.list`和`gen.list`:分别记录了`train`、`test`和`gen`文件夹中的文件路径。 -- `src.dict`和`trg.dict`:源(法语)和目标(英语)字典。每个字典都含有30000个单词,包括29997个最高频单词和3个特殊符号。 - -### 提供数据给PaddlePaddle - -我们通过`dataprovider.py`将数据提供给PaddlePaddle。具体步骤如下: - -1. 首先,引入PaddlePaddle的PyDataProvider2包,并定义三个特殊符号。 - - ```python - from paddle.trainer.PyDataProvider2 import * - UNK_IDX = 2 #未登录词 - START = "" #序列的开始 - END = "" #序列的结束 - ``` -2. 其次,使用初始化函数`hook`,分别定义了训练模式和生成模式下的数据输入格式(`input_types`)。 - - 训练模式:有三个输入序列,其中“源语言序列”和“目标语言序列”作为输入数据,“目标语言的下一个词序列”作为标签数据。 - - 生成模式:有两个输入序列,其中“源语言序列”作为输入数据,“源语言序列编号”作为输入数据的编号(该输入非必须,可以省略)。 - - `hook`函数中的`src_dict_path`是源语言字典路径,`trg_dict_path`是目标语言字典路径,`is_generating`(训练或生成模式)是从模型配置中传入的对象。`hook`函数的具体调用方式请见[模型配置说明](#模型配置说明)。 - - ```python - def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list, - **kwargs): - # job_mode = 1: 训练模式;0: 生成模式 - settings.job_mode = not is_generating - - def fun(dict_path): # 根据字典路径加载字典 - out_dict = dict() - with open(dict_path, "r") as fin: - out_dict = { - line.strip(): line_count - for line_count, line in enumerate(fin) - } - return out_dict - - settings.src_dict = fun(src_dict_path) - settings.trg_dict = fun(trg_dict_path) - - if settings.job_mode: #训练模式 - settings.input_types = { - 'source_language_word': #源语言序列 - integer_value_sequence(len(settings.src_dict)), - 'target_language_word': #目标语言序列 - integer_value_sequence(len(settings.trg_dict)), - 'target_language_next_word': #目标语言的下一个词序列 - integer_value_sequence(len(settings.trg_dict)) - } - else: #生成模式 - settings.input_types = { - 'source_language_word': #源语言序列 - integer_value_sequence(len(settings.src_dict)), - 'sent_id': #源语言序列编号 - integer_value_sequence(len(open(file_list[0], "r").readlines())) - } - ``` -3. 
最后,使用`process`函数打开文本文件`file_name`,读取每一行,将行中的数据转换成与`input_types`一致的格式,再用`yield`关键字返回给PaddlePaddle进程。具体来说, - - - 在源语言序列的每句话前面补上开始符号``、末尾补上结束符号``,得到“source_language_word”; - - 在目标语言序列的每句话前面补上``,得到“target_language_word”; - - 在目标语言序列的每句话末尾补上``,作为目标语言的下一个词序列(“target_language_next_word”)。 - - ```python - def _get_ids(s, dictionary): # 获得源语言序列中的每个单词在字典中的位置 - words = s.strip().split() - return [dictionary[START]] + \ - [dictionary.get(w, UNK_IDX) for w in words] + \ - [dictionary[END]] - - @provider(init_hook=hook, pool_size=50000) - def process(settings, file_name): - with open(file_name, 'r') as f: - for line_count, line in enumerate(f): - line_split = line.strip().split('\t') - if settings.job_mode and len(line_split) != 2: - continue - src_seq = line_split[0] - src_ids = _get_ids(src_seq, settings.src_dict) - - if settings.job_mode: - trg_seq = line_split[1] - trg_words = trg_seq.split() - trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words] - - # 如果任意一个序列长度超过80个单词,在训练模式下会移除这条样本,以防止RNN过深。 - if len(src_ids) > 80 or len(trg_ids) > 80: - continue - trg_ids_next = trg_ids + [settings.trg_dict[END]] - trg_ids = [settings.trg_dict[START]] + trg_ids - yield { - 'source_language_word': src_ids, - 'target_language_word': trg_ids, - 'target_language_next_word': trg_ids_next - } - else: - yield {'source_language_word': src_ids, 'sent_id': [line_count]} - ``` -注意:由于本示例中的训练数据有3.55G,对于内存较小的机器,不能一次性加载进内存,所以推荐使用`pool_size`变量来设置内存中暂存的数据条数。 - -## 模型配置说明 - -### 数据定义 - -1. 首先,定义数据集路径和源/目标语言字典路径,并用`is_generating`变量定义当前配置是训练模式(默认)还是生成模式。该变量接受从命令行传入的参数,使用方法见[应用命令与结果](#应用命令与结果)。 - - ```python - import os - from paddle.trainer_config_helpers import * - - data_dir = "./data/pre-wmt14" # 数据集路径 - src_lang_dict = os.path.join(data_dir, 'src.dict') # 源语言字典路径 - trg_lang_dict = os.path.join(data_dir, 'trg.dict') # 目标语言字典路径 - is_generating = get_config_arg("is_generating", bool, False) # 配置模式 - ``` -2. 其次,通过`define_py_data_sources2`函数从`dataprovider.py`中读取数据,并用`args`变量传入源/目标语言的字典路径以及配置模式。 - - ```python - if not is_generating: - train_list = os.path.join(data_dir, 'train.list') - test_list = os.path.join(data_dir, 'test.list') - else: - train_list = None - test_list = os.path.join(data_dir, 'gen.list') - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={ - "src_dict_path": src_lang_dict, # 源语言字典路径 - "trg_dict_path": trg_lang_dict, # 目标语言字典路径 - "is_generating": is_generating # 配置模式 - }) - ``` - -### 算法配置 - -```python -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) -``` -本教程使用默认的SGD随机梯度下降算法和Adam学习方法,并指定学习率为5e-4。注意:生成模式下的`batch_size = 50`,表示同时生成50条序列。 - -### 模型结构 -1. 首先,定义了一些全局变量。 - - ```python - source_dict_dim = len(open(src_lang_dict, "r").readlines()) # 源语言字典维度 - target_dict_dim = len(open(trg_lang_dict, "r").readlines()) # 目标语言字典维度 - word_vector_dim = 512 # 词向量维度 - encoder_size = 512 # 编码器中的GRU隐层大小 - decoder_size = 512 # 解码器中的GRU隐层大小 - - if is_generating: - beam_size=3 # 柱搜索算法中的宽度 - max_length=250 # 生成句子的最大长度 - gen_trans_file = get_config_arg("gen_trans_file", str, None) # 生成后的文件 - ``` - -2. 
其次,实现编码器框架。分为三步: - - 2.1 传入已经在`dataprovider.py`转换成one-hot vector表示的源语言序列$\mathbf{w}$。 - - ```python - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) - ``` - 2.2 将上述编码映射到低维语言空间的词向量$\mathbf{s}$。 - - ```python - src_embedding = embedding_layer( - input=src_word_id, - size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) - ``` - 2.3 用双向GRU编码源语言序列,拼接两个GRU的编码结果得到$\mathbf{h}$。 - - ```python - src_forward = simple_gru(input=src_embedding, size=encoder_size) - src_backward = simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = concat_layer(input=[src_forward, src_backward]) - ``` - -3. 接着,定义基于注意力机制的解码器框架。分为三步: - - 3.1 对源语言序列编码后的结果(见2.3),过一个前馈神经网络(Feed Forward Neural Network),得到其映射。 - - ```python - with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(input=encoded_vector) - ``` - 3.2 构造解码器RNN的初始状态。由于解码器需要预测时序目标序列,但在0时刻并没有初始值,所以我们希望对其进行初始化。这里采用的是将源语言序列逆序编码后的最后一个状态进行非线性映射,作为该初始值,即$c_0=h_T$。 - - ```python - backward_first = first_seq(input=src_backward) - with mixed_layer( - size=decoder_size, - act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(input=backward_first) - ``` - 3.3 定义解码阶段每一个时间步的RNN行为,即根据当前时刻的源语言上下文向量$c_i$、解码器隐层状态$z_i$和目标语言中第$i$个词$u_i$,来预测第$i+1$个词的概率$p_{i+1}$。 - - - decoder_mem记录了前一个时间步的隐层状态$z_i$,其初始状态是decoder_boot。 - - context通过调用`simple_attention`函数,实现公式$c_i=\sum {j=1}^{T}a_{ij}h_j$。其中,enc_vec是$h_j$,enc_proj是$h_j$的映射(见3.1),权重$a_{ij}$的计算已经封装在`simple_attention`函数中。 - - decoder_inputs融合了$c_i$和当前目标词current_word(即$u_i$)的表示。 - - gru_step通过调用`gru_step_layer`函数,在decoder_inputs和decoder_mem上做了激活操作,即实现公式$z_{i+1}=\phi _{\theta '}\left ( c_i,u_i,z_i \right )$。 - - 最后,使用softmax归一化计算单词的概率,将out结果返回,即实现公式$p\left ( u_i|u_{<i},\mathbf{x} \right )=softmax(W_sz_i+b_z)$。 - - ```python - def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - decoder_mem = memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem, ) - - with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(input=context) - decoder_inputs += full_matrix_projection(input=current_word) - - gru_step = gru_step_layer( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - with mixed_layer( - size=target_dict_dim, bias_attr=True, - act=SoftmaxActivation()) as out: - out += full_matrix_projection(input=gru_step) - return out - ``` -4. 
训练模式与生成模式下的解码器调用区别。 - - 4.1 定义解码器框架名字,和`gru_decoder_with_attention`函数的前两个输入。注意:这两个输入使用`StaticInput`,具体说明可见[StaticInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。 - - ```python - decoder_group_name = "decoder_group" - group_input1 = StaticInput(input=encoded_vector, is_seq=True) - group_input2 = StaticInput(input=encoded_proj, is_seq=True) - group_inputs = [group_input1, group_input2] - ``` - 4.2 训练模式下的解码器调用: - - - 首先,将目标语言序列的词向量trg_embedding,直接作为训练模式下的current_word传给`gru_decoder_with_attention`函数。 - - 其次,使用`recurrent_group`函数循环调用`gru_decoder_with_attention`函数。 - - 接着,使用目标语言的下一个词序列作为标签层lbl,即预测目标词。 - - 最后,用多类交叉熵损失函数`classification_cost`来计算损失值。 - - ```python - if not is_generating: - trg_embedding = embedding_layer( - input=data_layer( - name='target_language_word', size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - decoder = recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl) - outputs(cost) - ``` - 4.3 生成模式下的解码器调用: - - - 首先,在序列生成任务中,由于解码阶段的RNN总是引用上一时刻生成出的词的词向量,作为当前时刻的输入,因此,使用`GeneratedInput`来自动完成这一过程。具体说明可见[GeneratedInput文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/deep_model/rnn/recurrent_group_cn.md#输入)。 - - 其次,使用`beam_search`函数循环调用`gru_decoder_with_attention`函数,生成出序列id。 - - 最后,使用`seqtext_printer_evaluator`函数,根据目标字典`trg_lang_dict`,打印出完整的句子保存在`gen_trans_file`中。 - - ```python - else: - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator( - input=beam_gen, - id_input=data_layer( - name="sent_id", size=1), - dict_file=trg_lang_dict, - result_file=gen_trans_file) - outputs(beam_gen) - ``` -注意:我们提供的配置在Bahdanau的论文\[[4](#参考文献)\]上做了一些简化,可参考[issue #1133](https://github.com/PaddlePaddle/Paddle/issues/1133)。 - - -## 训练模型 - -可以通过以下命令来训练模型: - -```bash -./train.sh -``` -其中`train.sh` 的内容为: - -```bash -paddle train \ ---config='seqToseq_net.py' \ ---save_dir='model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'train.log' -``` -- config: 设置神经网络的配置文件。 -- save_dir: 设置保存模型的输出路径。 -- use_gpu: 是否使用GPU训练,这里使用CPU。 -- num_passes: 设置passes的数量。PaddlePaddle中的一个pass表示对数据集中所有样本的一次完整训练。 -- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息。 -- trainer_count: 设置CPU线程数或者GPU设备数。 -- log_period: 这里每隔10个batch打印一次日志。 -- dot_period: 这里每个5个batch打印一个点"."。 - -训练的损失函数每隔10个batch打印一次,您将会看到如下消息: -```text -I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 -I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 -..... 
-``` -- AvgCost:从第0个batch到当前batch的平均损失值。 -- CurrentCost:当前batch的损失值。 -- classification\_error\_evaluator(Eval):从第0个评估到当前评估中,每个单词的预测错误率。 -- classification\_error\_evaluator(CurrentEval):当前评估中,每个单词的预测错误率。 - -当classification\_error\_evaluator的值低于0.35时,模型就训练成功了。 - -## 应用模型 - -### 下载预训练的模型 - -由于NMT模型的训练非常耗时,我们在50个物理节点(每节点含有2颗6核CPU)的集群中,花了5天时间训练了16个pass,其中每个pass耗时7个小时。因此,我们提供了一个预先训练好的模型(pass-00012)供大家直接下载使用。该模型大小为205MB,在所有16个模型中有最高的[BLEU评估](#BLEU评估)值26.92。下载并解压模型的命令如下: -```bash -cd pretrained -./wmt14_model.sh -``` - -### 应用命令与结果 - -可以通过以下命令来进行法英翻译: - -```bash -./gen.sh -``` -其中`gen.sh` 的内容为: - -```bash -paddle train \ ---job=test \ ---config='seqToseq_net.py' \ ---save_dir='pretrained/wmt14_model' \ ---use_gpu=true \ ---num_passes=13 \ ---test_pass=12 \ ---trainer_count=1 \ ---config_args=is_generating=1,gen_trans_file="gen_result" \ -2>&1 | tee 'translation/gen.log' -``` -与训练命令不同的参数如下: -- job:设置任务的模式为测试。 -- save_dir:设置存放预训练模型的路径。 -- num_passes和test_pass:加载第$i\epsilon \left [ test\_pass,num\_passes-1 \right ]$轮的模型参数,这里只加载 `data/wmt14_model/pass-00012`。 -- config_args:将命令行中的自定义参数传递给模型配置。`is_generating=1`表示当前为生成模式,`gen_trans_file="gen_result"`表示生成结果的存储文件。 - -翻译结果请见[效果展示](#效果展示)。 - -### BLEU评估 - -BLEU(Bilingual Evaluation understudy)是一种广泛使用的机器翻译自动评测指标,由IBM的watson研究中心于2002年提出\[[5](#参考文献)\],基本出发点是:机器译文越接近专业翻译人员的翻译结果,翻译系统的性能越好。其中,机器译文与人工参考译文之间的接近程度,采用句子精确度(precision)的计算方法,即比较两者的n元词组相匹配的个数,匹配的个数越多,BLEU得分越好。 - -[Moses](http://www.statmt.org/moses/) 是一个统计学的开源机器翻译系统,我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。下载脚本的命令如下: -```bash -./moses_bleu.sh -``` -BLEU评估可以使用`eval_bleu`脚本如下,其中FILE为需要评估的文件名,BEAMSIZE为柱宽度,默认使用`data/wmt14/gen/ntst14.trg`作为标准的翻译结果。 -```bash -./eval_bleu.sh FILE BEAMSIZE -``` -本教程的具体命令如下: -```bash -./eval_bleu.sh gen_result 3 -``` -您会在屏幕上看到: -```text -BLEU = 26.92 -``` - -## 总结 - -端到端的神经网络机器翻译是近几年兴起的一种全新的机器翻译方法。本章中,我们介绍了NMT中典型的“编码器-解码器”框架和“注意力”机制。由于NMT是一个典型的Seq2Seq(Sequence to Sequence,序列到序列)学习问题,因此,Seq2Seq中的query改写(query rewriting)、摘要、单轮对话等问题都可以用本教程的模型来解决。 - -## 参考文献 - -1. Koehn P. [Statistical machine translation](https://books.google.com.hk/books?id=4v_Cx1wIMLkC&printsec=frontcover&hl=zh-CN&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false)[M]. Cambridge University Press, 2009. -2. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]//Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734. -3. Chung J, Gulcehre C, Cho K H, et al. [Empirical evaluation of gated recurrent neural networks on sequence modeling](https://arxiv.org/abs/1412.3555)[J]. arXiv preprint arXiv:1412.3555, 2014. -4. Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]//Proceedings of ICLR 2015, 2015. -5. Papineni K, Roukos S, Ward T, et al. [BLEU: a method for automatic evaluation of machine translation](http://dl.acm.org/citation.cfm?id=1073135)[C]//Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 2002: 311-318. - -
-知识共享许可协议
本教程PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/machine_translation/data/wmt14_data.sh b/machine_translation/data/wmt14_data.sh deleted file mode 100755 index 43f67168d2a876ba5401e0f8490a88adac9c5551..0000000000000000000000000000000000000000 --- a/machine_translation/data/wmt14_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -mkdir wmt14 -cd wmt14 - -# download the dataset -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz -wget http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz - -# untar the dataset -tar -zxvf bitexts.tgz -tar -zxvf dev+test.tgz -gunzip bitexts.selected/* -mv bitexts.selected train -rm bitexts.tgz -rm dev+test.tgz - -# separate the dev and test dataset -mkdir test gen -mv dev/ntst1213.* test -mv dev/ntst14.* gen -rm -rf dev - -set +x -# rename the suffix, .fr->.src, .en->.trg -for dir in train test gen -do - filelist=`ls $dir` - cd $dir - for file in $filelist - do - if [ ${file##*.} = "fr" ]; then - mv $file ${file/%fr/src} - elif [ ${file##*.} = 'en' ]; then - mv $file ${file/%en/trg} - fi - done - cd .. -done diff --git a/machine_translation/dataprovider.py b/machine_translation/dataprovider.py deleted file mode 100755 index c2b49804be582d7d0bc3ef6332741be03936eb24..0000000000000000000000000000000000000000 --- a/machine_translation/dataprovider.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer.PyDataProvider2 import * - -UNK_IDX = 2 -START = "" -END = "" - - -def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list, - **kwargs): - # job_mode = 1: training mode - # job_mode = 0: generating mode - settings.job_mode = not is_generating - - def fun(dict_path): - out_dict = dict() - with open(dict_path, "r") as fin: - out_dict = { - line.strip(): line_count - for line_count, line in enumerate(fin) - } - return out_dict - - settings.src_dict = fun(src_dict_path) - settings.trg_dict = fun(trg_dict_path) - - settings.logger.info("src dict len : %d" % (len(settings.src_dict))) - - if settings.job_mode: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'target_language_word': - integer_value_sequence(len(settings.trg_dict)), - 'target_language_next_word': - integer_value_sequence(len(settings.trg_dict)) - } - settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) - else: - settings.slots = { - 'source_language_word': - integer_value_sequence(len(settings.src_dict)), - 'sent_id': - integer_value_sequence(len(open(file_list[0], "r").readlines())) - } - - -def _get_ids(s, dictionary): - words = s.strip().split() - return [dictionary[START]] + \ - [dictionary.get(w, UNK_IDX) for w in words] + \ - [dictionary[END]] - - -@provider(init_hook=hook, pool_size=50000) -def process(settings, file_name): - with open(file_name, 'r') as f: - for line_count, line in enumerate(f): - line_split = line.strip().split('\t') - if settings.job_mode and len(line_split) != 2: - continue - src_seq = line_split[0] # one source sequence - src_ids = _get_ids(src_seq, settings.src_dict) - - if settings.job_mode: - trg_seq = line_split[1] # one target sequence - trg_words = trg_seq.split() - trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words] - - # remove sequence whose length > 80 in training mode - if len(src_ids) > 80 or len(trg_ids) > 80: - continue - trg_ids_next = trg_ids + [settings.trg_dict[END]] - trg_ids = [settings.trg_dict[START]] + trg_ids - yield { - 'source_language_word': src_ids, - 'target_language_word': trg_ids, - 'target_language_next_word': trg_ids_next - } - else: - yield {'source_language_word': src_ids, 'sent_id': [line_count]} diff --git a/machine_translation/eval_bleu.sh b/machine_translation/eval_bleu.sh deleted file mode 100755 index 6b96f9acd6874812925466340d6007faee5d8692..0000000000000000000000000000000000000000 --- a/machine_translation/eval_bleu.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e -gen_file=$1 -beam_size=$2 - -# find top1 generating result -top1=$(printf '%s_top1.txt' `basename $gen_file .txt`) -if [ $beam_size -eq 1 ]; then - awk -F "\t" '{sub(" ","",$2);sub(" ","",$2);print $2}' $gen_file >$top1 -else - awk 'BEGIN{ - FS="\t"; - OFS="\t"; - read_pos = 2} { - if (NR == read_pos){ - sub(" ","",$3); - sub(" ","",$3); - print $3; - read_pos += (2 + res_num); - }}' res_num=$beam_size $gen_file >$top1 -fi - -# evalute bleu value -bleu_script=multi-bleu.perl -standard_res=data/wmt14/gen/ntst14.trg -bleu_res=`perl $bleu_script $standard_res <$top1` - -echo $bleu_res | cut -d, -f 1 -rm $top1 diff --git a/machine_translation/gen.sh b/machine_translation/gen.sh deleted file mode 100755 index 140d5626210d0422dbddd1911c11c3bfa22b9d2c..0000000000000000000000000000000000000000 --- a/machine_translation/gen.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -paddle train \ - --job=test \ - --config='seqToseq_net.py' \ - --save_dir='pretrained/wmt14_model' \ - --use_gpu=false \ - --num_passes=13 \ - --test_pass=12 \ - --trainer_count=1 \ - --config_args=is_generating=1,gen_trans_file="gen_result" \ - 2>&1 | tee 'gen.log' diff --git a/machine_translation/index.en.html b/machine_translation/index.en.html deleted file mode 100644 index 558e368a10c6d64a2ce10afc50a31684eb79ed4e..0000000000000000000000000000000000000000 --- a/machine_translation/index.en.html +++ /dev/null @@ -1,787 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/machine_translation/index.html b/machine_translation/index.html deleted file mode 100644 index 09d4a4e654366a5d030ed954e7400483efdd2093..0000000000000000000000000000000000000000 --- a/machine_translation/index.html +++ /dev/null @@ -1,751 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/machine_translation/moses_bleu.sh b/machine_translation/moses_bleu.sh deleted file mode 100755 index 2f230d7f4c736da003966fbdb277f6b8b1ec952c..0000000000000000000000000000000000000000 --- a/machine_translation/moses_bleu.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x -echo "Downloading multi-bleu.perl" -wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl --no-check-certificate diff --git a/machine_translation/pretrained/wmt14_model.sh b/machine_translation/pretrained/wmt14_model.sh deleted file mode 100755 index c4b55b90a3eb98f94e0eb3be028c6de1ef57326b..0000000000000000000000000000000000000000 --- a/machine_translation/pretrained/wmt14_model.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -set -x - -# download the pretrained model -wget http://paddlepaddle.bj.bcebos.com/model_zoo/wmt14_model.tar.gz - -# untar the model -tar -zxvf wmt14_model.tar.gz -rm wmt14_model.tar.gz diff --git a/machine_translation/seqToseq_net.py b/machine_translation/seqToseq_net.py deleted file mode 100644 index 750d35c0c6b62801d70802ac4dc97f89d09fc612..0000000000000000000000000000000000000000 --- a/machine_translation/seqToseq_net.py +++ /dev/null @@ -1,163 +0,0 @@ -# edit-mode: -*- python -*- - -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from paddle.trainer_config_helpers import * - -### Data Definiation -data_dir = "./data/pre-wmt14" -src_lang_dict = os.path.join(data_dir, 'src.dict') -trg_lang_dict = os.path.join(data_dir, 'trg.dict') -is_generating = get_config_arg("is_generating", bool, False) - -if not is_generating: - train_list = os.path.join(data_dir, 'train.list') - test_list = os.path.join(data_dir, 'test.list') -else: - train_list = None - test_list = os.path.join(data_dir, 'gen.list') - -define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={ - "src_dict_path": src_lang_dict, - "trg_dict_path": trg_lang_dict, - "is_generating": is_generating - }) - -### Algorithm Configuration -settings(learning_method=AdamOptimizer(), batch_size=50, learning_rate=5e-4) - -### Network Architecture -source_dict_dim = len(open(src_lang_dict, "r").readlines()) -target_dict_dim = len(open(trg_lang_dict, "r").readlines()) -word_vector_dim = 512 # dimension of word vector -decoder_size = 512 # dimension of hidden unit in GRU Decoder network -encoder_size = 512 # dimension of hidden unit in GRU Encoder network - -if is_generating: - beam_size = 3 # expand width in beam search - max_length = 250 # a stop condition of sequence generation - gen_trans_file = get_config_arg("gen_trans_file", str, None) - -#### Encoder -src_word_id = data_layer(name='source_language_word', size=source_dict_dim) -src_embedding = embedding_layer( - input=src_word_id, - size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) -src_forward = simple_gru(input=src_embedding, size=encoder_size) -src_backward = simple_gru(input=src_embedding, size=encoder_size, reverse=True) -encoded_vector = concat_layer(input=[src_forward, src_backward]) - -#### Decoder -with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(input=encoded_vector) - -backward_first = first_seq(input=src_backward) -with mixed_layer( - size=decoder_size, - act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(input=backward_first) - - -def gru_decoder_with_attention(enc_vec, enc_proj, current_word): - decoder_mem = memory( - name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) - - context = simple_attention( - encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem, ) - - with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(input=context) - decoder_inputs += full_matrix_projection(input=current_word) - - gru_step = gru_step_layer( - name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - with mixed_layer( - size=target_dict_dim, bias_attr=True, - act=SoftmaxActivation()) as out: - out += full_matrix_projection(input=gru_step) - return out - - -decoder_group_name = "decoder_group" -group_input1 = StaticInput(input=encoded_vector, is_seq=True) -group_input2 = StaticInput(input=encoded_proj, is_seq=True) -group_inputs = [group_input1, group_input2] - -if not is_generating: - trg_embedding = embedding_layer( - input=data_layer( - name='target_language_word', size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - # For decoder equipped with attention mechanism, in training, - # target embeding (the groudtruth) is the data input, - # while encoded source sequence is accessed to as an unbounded memory. 
- # Here, the StaticInput defines a read-only memory - # for the recurrent_group. - decoder = recurrent_group( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) - - lbl = data_layer(name='target_language_next_word', size=target_dict_dim) - cost = classification_cost(input=decoder, label=lbl) - outputs(cost) -else: - # In generation, the decoder predicts the next target word based on - # the encoded source sequence and the last generated target word. - - # The encoded source sequence (encoder's output) must be specified by - # StaticInput, which is a read-only memory. - # The embedding of the last generated word is automatically obtained from - # GeneratedInput, which is initialized by a start mark, such as <s>, - # and must be included in generation. - - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = beam_search( - name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - seqtext_printer_evaluator( - input=beam_gen, - id_input=data_layer( - name="sent_id", size=1), - dict_file=trg_lang_dict, - result_file=gen_trans_file) - outputs(beam_gen) diff --git a/machine_translation/train.sh b/machine_translation/train.sh deleted file mode 100755 index 724f61fab9c04ec198633d8f922e8c1089b3388e..0000000000000000000000000000000000000000 --- a/machine_translation/train.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-set -e - -paddle train \ ---config='seqToseq_net.py' \ ---save_dir='model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'train.log' diff --git a/mnist-client/.eslintrc b/mnist-client/.eslintrc new file mode 100644 index 0000000000000000000000000000000000000000..0a491514e49ebf01a81c2a8f780f8af7d94719c4 --- /dev/null +++ b/mnist-client/.eslintrc @@ -0,0 +1,26 @@ +{ + "rules": { + "indent": [ + 2, + 4 + ], + "quotes": [ + 2, + "single" + ], + "linebreak-style": [ + 2, + "unix" + ], + "semi": [ + 2, + "always" + ] + }, + "env": { + "es6": true, + "node": true, + "browser": true + }, + "extends": "eslint:recommended" +} \ No newline at end of file diff --git a/mnist-client/.gitignore b/mnist-client/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e96aff17c55590f36077a9f22ffde2c0bcf31a30 --- /dev/null +++ b/mnist-client/.gitignore @@ -0,0 +1,5 @@ +venv +*.pyc +node_modules +static/js/main.js +index.html diff --git a/mnist-client/Dockerfile b/mnist-client/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d7ef76940ce1e7360f178dea26b006d74ba22023 --- /dev/null +++ b/mnist-client/Dockerfile @@ -0,0 +1,9 @@ +FROM mhart/alpine-node:6.11.3 + +RUN mkdir /workspace +WORKDIR /workspace/ +ADD * /workspace/ +RUN apk add --no-cache python py-pip +RUN pip install -r /workspace/requirements.txt +RUN cd /workspace && npm install && mkdir templates && mv index.html templates && mkdir static && mv js static && mv css static +CMD ["python", "main.py"] diff --git a/mnist-client/Procfile b/mnist-client/Procfile new file mode 100644 index 0000000000000000000000000000000000000000..5e85b13710c7065efbd629ce897c194ae1ded316 --- /dev/null +++ b/mnist-client/Procfile @@ -0,0 +1 @@ +web: gunicorn main:app --log-file=- diff --git a/mnist-client/README.md b/mnist-client/README.md new file mode 100644 index 0000000000000000000000000000000000000000..118790adb4682a081ff1c0351eb514d0857115f0 --- /dev/null +++ b/mnist-client/README.md @@ -0,0 +1,72 @@ +# MNIST classification by PaddlePaddle + +![screencast](https://cloud.githubusercontent.com/assets/80381/11339453/f04f885e-923c-11e5-8845-33c16978c54d.gif) + +## Usage + +This MNIST classification demo consists of two parts: a PaddlePaddle +inference server and a JavaScript front end. We will start them +separately. + +We will use Docker to run the demo. If you are not familiar with +Docker, please check out +this +[tutorial](https://github.com/PaddlePaddle/Paddle/wiki/TLDR-for-new-docker-user). + +### Start the Inference Server + +The inference server can be used to run inference on any model trained by +PaddlePaddle. Please see [here](../serve/README.md) for more details. + +1. Download the MNIST inference model topology and parameters to the + current working directory. + + ```bash + wget https://s3.us-east-2.amazonaws.com/models.paddlepaddle/end-to-end-mnist/inference_topology.pkl + wget https://s3.us-east-2.amazonaws.com/models.paddlepaddle/end-to-end-mnist/param.tar + ``` + +1. Run the following command to start the inference server: + + ```bash + docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve + ``` + + The above command will mount the current working directory to the + `/data` directory inside the docker container. The inference + server will load the model topology and parameters that we just + downloaded from there.
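For reference, the front end added later in this diff talks to this server with a plain JSON POST of the flattened image and reads the per-digit scores from `data["data"][0]`. A hypothetical Python client along those lines (the endpoint and response schema are inferred from the front-end JS, not from server documentation):

```python
import requests  # third-party HTTP client, assumed installed

img = [-1.0] * 784  # a blank 28x28 input; the front end normalizes pixels to [-1, 1]
resp = requests.post('http://localhost:8000/', json={'img': img})
scores = resp.json()['data'][0]  # ten numbers, one per digit class
print(max(range(10), key=lambda i: scores[i]))  # index of the most likely digit
```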
+ + After you are done with the demo, you can run `docker stop + paddle_serve` to stop this docker container. + +### Start the Front End + +1. Run the following command: + ```bash + docker run -it -p 5000:5000 -e BACKEND_URL=http://localhost:8000/ paddlepaddle/book:mnist + ``` + + `BACKEND_URL` in the above command specifies the inference server + endpoint. If you started the inference server on another machine, + or want to visit the front end remotely, you may want to change its + value. + +1. Visit http://localhost:5000 and you will see the PaddlePaddle MNIST demo. + + +## Build + +We have already prepared the pre-built docker image +`paddlepaddle/book:mnist`. Here is the command if you want to build +the docker image again: + +```bash +docker build -t paddlepaddle/book:mnist . +``` + + +## Acknowledgement + +Thanks to the great project https://github.com/sugyan/tensorflow-mnist. +Most of the code in this project comes from there. diff --git a/mnist-client/app.json b/mnist-client/app.json new file mode 100644 index 0000000000000000000000000000000000000000..94e383a62d71bc7dc93bb5451714e98c3052ea4f --- /dev/null +++ b/mnist-client/app.json @@ -0,0 +1,7 @@ +{ + "name": "paddlepaddle-mnist", + "buildpacks": [ + { "url": "https://github.com/heroku/heroku-buildpack-nodejs" }, + { "url": "https://github.com/heroku/heroku-buildpack-python" } + ] +} diff --git a/mnist-client/gulpfile.js b/mnist-client/gulpfile.js new file mode 100644 index 0000000000000000000000000000000000000000..7758098a1dfe6474eccd51323aa16e33a89f6108 --- /dev/null +++ b/mnist-client/gulpfile.js @@ -0,0 +1,19 @@ +var gulp = require('gulp'); +var babel = require('gulp-babel'); +var sourcemaps = require('gulp-sourcemaps'); +var uglify = require('gulp-uglify'); + +gulp.task('build', function() { + return gulp.src('src/js/*.js') + .pipe(babel({ presets: ['es2015'] })) + .pipe(sourcemaps.init({ loadMaps: true })) + .pipe(uglify()) + .pipe(sourcemaps.write()) + .pipe(gulp.dest('static/js')); +}); + +gulp.task('watch', function() { + gulp.watch('src/js/*.js', ['build']); +}); + +gulp.task('default', ['build']); diff --git a/mnist-client/main.py b/mnist-client/main.py new file mode 100644 index 0000000000000000000000000000000000000000..fc31e3977eba4246fc757a41e6fbf67eaee27f36 --- /dev/null +++ b/mnist-client/main.py @@ -0,0 +1,15 @@ +from flask import Flask, jsonify, render_template, request +import os + +# webapp +app = Flask(__name__) + + +@app.route('/') +def main(): + backend_url = os.getenv('BACKEND_URL', 'http://localhost:8000/') + return render_template('index.html', backend_url=backend_url) + + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=5000, threaded=True) diff --git a/mnist-client/package.json b/mnist-client/package.json new file mode 100644 index 0000000000000000000000000000000000000000..9cab177b46f41492347fb5569bebd42bd2e73e54 --- /dev/null +++ b/mnist-client/package.json @@ -0,0 +1,29 @@ +{ + "name": "paddlepaddle-mnist", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "postinstall": "gulp" + }, + "keywords": [], + "author": "", + "license": "ISC", + "repository": { + "type": "git", + "url": "https://github.com/sugyan/tensorflow-mnist.git" + }, + "engines": { + "node": "6.x" + }, + "dependencies": { + "babel-preset-es2015": "^6.1.18", + "bootstrap": "^3.3.5", + "gulp": "^3.9.0", + "gulp-babel": "^6.1.0", + "gulp-sourcemaps": "^1.6.0", + "gulp-uglify": "^1.5.1", + "jquery": "^2.1.4" + } +} diff --git
a/mnist-client/requirements.txt b/mnist-client/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a5cb4c94070c323e10ba0ecd36d13099a54b055 --- /dev/null +++ b/mnist-client/requirements.txt @@ -0,0 +1 @@ +Flask==0.12 diff --git a/mnist-client/runtime.txt b/mnist-client/runtime.txt new file mode 100644 index 0000000000000000000000000000000000000000..80aea67443c78b91acc7fd40849e1c28daf6ae7c --- /dev/null +++ b/mnist-client/runtime.txt @@ -0,0 +1 @@ +python-3.6.0 \ No newline at end of file diff --git a/mnist-client/src/js/main.js b/mnist-client/src/js/main.js new file mode 100644 index 0000000000000000000000000000000000000000..9f556b215ca14c18cf76c668f748941862fe525f --- /dev/null +++ b/mnist-client/src/js/main.js @@ -0,0 +1,136 @@ +/* global $ */ +class Main { + constructor() { + this.canvas = document.getElementById('main'); + this.input = document.getElementById('input'); + this.canvas.width = 449; // 16 * 28 + 1 + this.canvas.height = 449; // 16 * 28 + 1 + this.ctx = this.canvas.getContext('2d'); + this.canvas.addEventListener('mousedown', this.onMouseDown.bind(this)); + this.canvas.addEventListener('mouseup', this.onMouseUp.bind(this)); + this.canvas.addEventListener('mousemove', this.onMouseMove.bind(this)); + this.initialize(); + } + initialize() { + this.ctx.fillStyle = '#FFFFFF'; + this.ctx.fillRect(0, 0, 449, 449); + this.ctx.lineWidth = 1; + this.ctx.strokeRect(0, 0, 449, 449); + this.ctx.lineWidth = 0.05; + for (var i = 0; i < 27; i++) { + this.ctx.beginPath(); + this.ctx.moveTo((i + 1) * 16, 0); + this.ctx.lineTo((i + 1) * 16, 449); + this.ctx.closePath(); + this.ctx.stroke(); + + this.ctx.beginPath(); + this.ctx.moveTo( 0, (i + 1) * 16); + this.ctx.lineTo(449, (i + 1) * 16); + this.ctx.closePath(); + this.ctx.stroke(); + } + this.drawInput(); + $('#output td').text('').removeClass('success'); + } + onMouseDown(e) { + this.canvas.style.cursor = 'default'; + this.drawing = true; + this.prev = this.getPosition(e.clientX, e.clientY); + } + onMouseUp() { + this.drawing = false; + this.drawInput(); + } + onMouseMove(e) { + if (this.drawing) { + var curr = this.getPosition(e.clientX, e.clientY); + this.ctx.lineWidth = 16; + this.ctx.lineCap = 'round'; + this.ctx.beginPath(); + this.ctx.moveTo(this.prev.x, this.prev.y); + this.ctx.lineTo(curr.x, curr.y); + this.ctx.stroke(); + this.ctx.closePath(); + this.prev = curr; + } + } + getPosition(clientX, clientY) { + var rect = this.canvas.getBoundingClientRect(); + return { + x: clientX - rect.left, + y: clientY - rect.top + }; + } + drawInput() { + var ctx = this.input.getContext('2d'); + var img = new Image(); + img.onload = () => { + var inputs = []; + var small = document.createElement('canvas').getContext('2d'); + small.drawImage(img, 0, 0, img.width, img.height, 0, 0, 28, 28); + var data = small.getImageData(0, 0, 28, 28).data; + for (var i = 0; i < 28; i++) { + for (var j = 0; j < 28; j++) { + var n = 4 * (i * 28 + j); + inputs[i * 28 + j] = (data[n + 0] + data[n + 1] + data[n + 2]) / 3; + ctx.fillStyle = 'rgb(' + [data[n + 0], data[n + 1], data[n + 2]].join(',') + ')'; + ctx.fillRect(j * 5, i * 5, 5, 5); + } + } + if (Math.min(...inputs) === 255) { + return; + } + for (var i = 0; i < 784; i++) { + if (inputs[i] == 255) { + // background + inputs[i] = -1.0 + } else { + inputs[i] = 1.0 + } + } + $.ajax({ + url: BACKEND_URL, + method: 'POST', + contentType: 'application/json', + data: JSON.stringify({"img":inputs}), + success: (data) => { + data = data["data"][0] + var max = 0; + var max_index = 0; + 
for (let j = 0; j < 10; j++) { + var value = Math.round(data[j] * 1000); + if (value > max) { + max = value; + max_index = j; + } + var digits = String(value).length; + for (var k = 0; k < 3 - digits; k++) { + value = '0' + value; + } + var text = '0.' + value; + if (value > 999) { + text = '1.000'; + } + $('#output tr').eq(j + 1).find('td').text(text); + } + for (let j = 0; j < 10; j++) { + if (j === max_index) { + $('#output tr').eq(j + 1).find('td').addClass('success'); + } else { + $('#output tr').eq(j + 1).find('td').removeClass('success'); + } + } + } + }); + }; + img.src = this.canvas.toDataURL(); + } +} + +$(() => { + var main = new Main(); + $('#clear').click(() => { + main.initialize(); + }); +}); diff --git a/mnist-client/static/css/bootstrap.min.css b/mnist-client/static/css/bootstrap.min.css new file mode 120000 index 0000000000000000000000000000000000000000..93c3bac5658591f65ae244a01f8164cf651e785a --- /dev/null +++ b/mnist-client/static/css/bootstrap.min.css @@ -0,0 +1 @@ +../../node_modules/bootstrap/dist/css/bootstrap.min.css \ No newline at end of file diff --git a/mnist-client/static/js/jquery.min.js b/mnist-client/static/js/jquery.min.js new file mode 120000 index 0000000000000000000000000000000000000000..08ac9f2fff70eeaec16f1444a058deddec7e63b6 --- /dev/null +++ b/mnist-client/static/js/jquery.min.js @@ -0,0 +1 @@ +../../node_modules/jquery/dist/jquery.min.js \ No newline at end of file diff --git a/mnist-client/templates/index.html b/mnist-client/templates/index.html new file mode 100644 index 0000000000000000000000000000000000000000..06240bfa95fad434f8c415d42c8ba5a5e04c790a --- /dev/null +++ b/mnist-client/templates/index.html @@ -0,0 +1,79 @@ + + + + MNIST + + + + + + + Fork me on GitHub +
+ [remainder of templates/index.html, markup stripped by extraction: an "MNIST" heading, a "draw a digit here!" drawing canvas, an "input:" preview pane, and an "output:" table with class/confidence rows for digits 0 through 9]
+ + diff --git a/paddle b/paddle new file mode 160000 index 0000000000000000000000000000000000000000..77e65d613d2283a9d7dc2dbce3460afa86cd5739 --- /dev/null +++ b/paddle @@ -0,0 +1 @@ +Subproject commit 77e65d613d2283a9d7dc2dbce3460afa86cd5739 diff --git a/gan/README.md b/pending/gan/README.md similarity index 100% rename from gan/README.md rename to pending/gan/README.md diff --git a/gan/index.html b/pending/gan/index.html similarity index 84% rename from gan/index.html rename to pending/gan/index.html index 1f88c3593d9d0027c58c83fb78543aae95c2b5b5..fb4009c11912a394a88ceafb60c821818a08a109 100644 --- a/gan/index.html +++ b/pending/gan/index.html @@ -1,3 +1,4 @@ + - - + + + + + +
+
+ + + + + + + diff --git a/speech_recognition/README.md b/pending/speech_recognition/README.md similarity index 100% rename from speech_recognition/README.md rename to pending/speech_recognition/README.md diff --git a/pending/speech_recognition/index.html b/pending/speech_recognition/index.html new file mode 100644 index 0000000000000000000000000000000000000000..efbfd8a4e368928d761789c0a475c863ab586e34 --- /dev/null +++ b/pending/speech_recognition/index.html @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + +
+
+ + + + + + + diff --git a/recognize_digits/README.en.md b/recognize_digits/README.en.md deleted file mode 100644 index 3c82ab1efdd2ab9fbb52928a4f8db370270e703a..0000000000000000000000000000000000000000 --- a/recognize_digits/README.en.md +++ /dev/null @@ -1,299 +0,0 @@ -# Recognize Digits - -The source code for this tutorial is under [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/recognize_digits). First-time readers, please refer to PaddlePaddle [installation instructions](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html). - -## Introduction -When we learn a new programming language, the first task is usually to write a program that prints "Hello World." In Machine Learning or Deep Learning, the equivalent task is to train a model to perform handwritten digit recognition with [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. Handwriting recognition is a typical image classification problem. The problem is relatively easy, and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a 28x28 matrix, and the label is one of the digits from 0 to 9. Each image is normalized in size and centered. - -

-Fig. 1. Examples of MNIST images
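The `paddle.dataset.mnist` module used later in this chapter exposes the data in exactly this shape. A quick sketch (assuming the PaddlePaddle v2 dataset API that this README targets; illustrative only):

```python
import paddle.v2 as paddle

# mnist.train() returns a reader creator; calling the reader yields instances
img, label = next(paddle.dataset.mnist.train()())
print(len(img))  # 784, i.e. a flattened 28x28 image
print(label)     # an integer in [0, 9]
```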

- -The MNIST dataset is created from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students in the U.S. Therefore SD-3 is cleaner and easier to recognize than SD-1. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set (60,000 samples) and test set (10,000 samples), where the training set was labeled by 250 different annotators, and it was guaranteed that the annotators of the training set and the test set were not completely the same. - -Yann LeCun, one of the founders of Deep Learning, contributed greatly to handwritten character recognition in its early days and proposed the CNN (Convolutional Neural Network), which drastically improved recognition capability for handwritten characters. CNNs are now a critical concept in Deep Learning. From Yann LeCun's first proposal of LeNet to those winning models in ImageNet, such as VGGNet, GoogLeNet and ResNet (please refer to the [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/image_classification) tutorial), CNNs achieved a series of impressive results in Image Classification tasks. - -Many algorithms are tested on MNIST. In 1998, LeCun experimented with a single-layer linear classifier, an MLP (Multilayer Perceptron) and the multilayer CNN LeNet. These algorithms steadily reduced test error from 12% to 0.7% \[[1](#References)\]. Since then, researchers have worked on many algorithms such as k-NN (K-Nearest Neighbors) \[[2](#References)\], Support Vector Machine (SVM) \[[3](#References)\], Neural Networks \[[4-7](#References)\] and Boosting \[[8](#References)\]. Various preprocessing methods like distortion removal, noise removal, blurring, etc. have also been applied to increase recognition accuracy. - -In this tutorial, we tackle the task of handwritten character recognition. We start with a simple softmax regression model and guide our readers step-by-step to improve this model's performance on the task of recognition. - - -## Model Overview - -Before introducing classification algorithms and the training procedure, we provide some definitions: -- $X$ is the input: Input is a $28\times28$ MNIST image. It is flattened to a $784$ dimensional vector. $X=\left ( x_0, x_1, \dots, x_{783} \right )$. -- $Y$ is the output: Output of the classifier is 1 of the 10 classes (digits from 0 to 9). $Y=\left ( y_0, y_1, \dots, y_9 \right )$. Each dimension $y_i$ represents the probability that the input image belongs to class $i$. -- $L$ is the ground truth label: $L=\left ( l_0, l_1, \dots, l_9 \right )$. It is also 10 dimensional, but only one dimension is 1 and all the others are 0. - -### Softmax Regression - -In a simple softmax regression model, the input is fed to fully connected layers and a softmax function is applied to get probabilities of multiple output classes \[[9](#References)\]. - -Input $X$ is multiplied with weights $W$, and bias $b$ is added to generate activations: - -$$ y_i = softmax(\sum_j W_{i,j}x_j + b_i) $$ - -where $ softmax(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $ - -For an $N$ class classification problem with $N$ output nodes, an $N$ dimensional vector is normalized to $N$ real values in the range [0, 1], each representing the probability that the sample belongs to that class. Here $y_i$ is the prediction probability that an image is digit $i$.
- -In such a classification problem, we usually use the cross entropy loss function: - -$$ crossentropy(label, y) = -\sum_i label_i \log(y_i) $$ - -Fig. 2 shows a softmax regression network, with weights in blue and bias in red. +1 indicates the bias is 1. (A small numeric check of these two formulas follows Fig. 2.) - -

-Fig. 2. Softmax regression network architecture
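The check promised above: a minimal NumPy sketch of the softmax and cross-entropy formulas (illustrative only; not part of the tutorial's code):

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())  # subtract the max for numerical stability
    return e / e.sum()

def cross_entropy(label, y):
    return -np.sum(label * np.log(y))  # -sum_i label_i * log(y_i)

z = np.array([2.0, 1.0, 0.1])      # activations Wx + b for 3 classes
y = softmax(z)                     # approx. [0.659, 0.242, 0.099]
label = np.array([1.0, 0.0, 0.0])  # one-hot ground truth
print(cross_entropy(label, y))     # approx. 0.417
```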

- -### Multilayer Perceptron - -The Softmax regression model described above uses the simplest two-layer neural network, i.e., it contains only an input layer and an output layer, so its expressive power is limited. To achieve better recognition results, we consider adding several hidden layers \[[10](#References)\] between the input layer and the output layer. - -1. After the first hidden layer, we get $ H_1 = \phi(W_1X + b_1) $, where $\phi$ is the activation function. Some common ones are sigmoid, tanh and ReLU. -2. After the second hidden layer, we get $ H_2 = \phi(W_2H_1 + b_2) $. -3. Finally, after the output layer, we get $Y=softmax(W_3H_2 + b_3)$, the final classification result vector. (A small sketch of this forward pass follows Fig. 3.) - -Fig. 3 shows the Multilayer Perceptron network, with weights in blue and bias in red. +1 indicates the bias is 1. - -

-Fig. 3. Multilayer Perceptron network architecture
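The sketch referenced above: one forward pass through the three equations, using the layer sizes this tutorial uses (128, 64, 10). The random weights are stand-ins for trained parameters:

```python
import numpy as np

def relu(x):
    return np.maximum(0.0, x)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.RandomState(0)
X = rng.rand(784)  # a flattened 28x28 input
W1, b1 = 0.01 * rng.randn(128, 784), np.zeros(128)
W2, b2 = 0.01 * rng.randn(64, 128), np.zeros(64)
W3, b3 = 0.01 * rng.randn(10, 64), np.zeros(10)

H1 = relu(np.dot(W1, X) + b1)     # first hidden layer
H2 = relu(np.dot(W2, H1) + b2)    # second hidden layer
Y = softmax(np.dot(W3, H2) + b3)  # 10 class probabilities summing to 1
```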

- -### Convolutional Neural Network - -#### Convolutional Layer - -

-Fig. 4. Convolutional layer

- -The Convolutional layer is the core of a Convolutional Neural Network. The parameters in this layer are composed of a set of filters, or kernels. In the forward step, each kernel slides over the input horizontally and vertically; at each position we compute the dot product of the kernel and the input, add a bias, and apply an activation function. The result is a two-dimensional activation map. For example, some kernel may recognize corners, and some may recognize circles. These convolution kernels may respond strongly to the corresponding features. - -Fig. 4 is a dynamic graph of a convolutional layer, where depths are not shown for simplicity. Input is $W_1=5, H_1=5, D_1=3$. In fact, this is a common representation for colored images. $W_1$ and $H_1$ of a colored image correspond to the width and height respectively. $D_1$ corresponds to the 3 color channels for RGB. The parameters of the convolutional layer are $K=2, F=3, S=2, P=1$. $K$ is the number of kernels; here, $Filter W_0$ and $Filter W_1$ are two kernels. $F$ is the kernel size; $W_0$ and $W_1$ are both $3\times3$ matrices in all depths. $S$ is the stride; kernels move rightwards or downwards by 2 units each time. $P$ is padding, an extension of the input; the gray area in the figure shows zero padding with size 1.
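With these numbers we can check the output size of the layer, using the standard formula for the output width (and, analogously, the height) of a convolution:

$$ W_2 = \frac{W_1 - F + 2P}{S} + 1 = \frac{5 - 3 + 2 \times 1}{2} + 1 = 3 $$

so each of the $K=2$ kernels produces a $3\times3$ activation map, and the output volume is $3\times3\times2$.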

-#### Pooling Layer
-
-Fig. 5. Pooling layer

- -A Pooling layer performs downsampling. The main functionality of this layer is to reduce computation by reducing the network parameters. It also prevents overfitting to some extent. Usually, a pooling layer is added after a convolutional layer. A pooling layer can be of various types, such as max pooling and average pooling. Max pooling uses rectangles to segment the input layer into several parts and computes the maximum value in each part as the output (Fig. 5).
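A NumPy illustration of 2x2 max pooling with stride 2 (illustrative only; the tutorial itself uses PaddlePaddle's pooling through `simple_img_conv_pool` later on):

```python
import numpy as np

def max_pool_2x2(x):
    # 2x2 max pooling with stride 2 over an (H, W) array; H and W must be even
    h, w = x.shape
    return x.reshape(h // 2, 2, w // 2, 2).max(axis=(1, 3))

x = np.array([[1, 3, 2, 4],
              [5, 6, 7, 8],
              [3, 2, 1, 0],
              [1, 2, 3, 4]], dtype=float)
print(max_pool_2x2(x))  # [[6. 8.] [3. 4.]]
```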

-#### LeNet-5 Network
-
-Fig. 6. LeNet-5 Convolutional Neural Network architecture

-[LeNet-5](http://yann.lecun.com/exdb/lenet/) is one of the simplest Convolutional Neural Networks. Fig. 6 shows its architecture: a 2-dimensional input image is fed into two sets of convolutional and pooling layers; this output is then fed to a fully connected layer and a softmax classifier. The following three properties of convolution enable LeNet-5 to better recognize images than multilayer fully connected perceptrons: - -- 3D properties of neurons: a convolutional layer is organized by width, height and depth. Neurons in each layer are connected to only a small region in the previous layer. This region is called the receptive field. -- Local connection: A CNN utilizes the local space correlation by connecting local neurons. This design guarantees that the learned filter has a strong response to local input features. Stacking many such layers generates a non-linear filter that is more global. This enables the network to first obtain good representation for small parts of input and then combine them to represent a larger region. -- Sharing weights: In a CNN, computation is iterated on shared parameters (weights and bias) to form a feature map. This means all neurons in the same depth of the output respond to the same feature. This allows detecting a feature regardless of its position in the input and enables translation equivariance. - -For more details on Convolutional Neural Networks, please refer to [this Stanford open course]( http://cs231n.github.io/convolutional-networks/ ) and [this Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) tutorial. - -### List of Common Activation Functions -- Sigmoid activation function: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $ - -- Tanh activation function: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $ - - In fact, the tanh function is just a rescaled version of the sigmoid function: it is obtained by scaling the output of the sigmoid by 2 and shifting it down by 1, i.e. $ tanh(x) = 2sigmoid(2x) - 1 $. - -- ReLU activation function: $ f(x) = max(0, x) $ - -For more information, please refer to [Activation functions on Wikipedia](https://en.wikipedia.org/wiki/Activation_function). - -## Data Preparation - -PaddlePaddle provides a Python module, `paddle.dataset.mnist`, which downloads and caches the [MNIST dataset](http://yann.lecun.com/exdb/mnist/). The cache is under `/home/username/.cache/paddle/dataset/mnist`: - - -| File name | Description | -|----------------------|-------------------------| -|train-images-idx3-ubyte| Training images, 60,000 | -|train-labels-idx1-ubyte| Training labels, 60,000 | -|t10k-images-idx3-ubyte | Evaluation images, 10,000 | -|t10k-labels-idx1-ubyte | Evaluation labels, 10,000 | - - -## Model Configuration - -A PaddlePaddle program starts from importing the API package: - -```python -import paddle.v2 as paddle -``` - -We want to use this program to demonstrate multiple kinds of models.
Let us define each of them as a Python function: - -- softmax regression: the network has a fully-connected layer with softmax activation: - -```python -def softmax_regression(img): - predict = paddle.layer.fc(input=img, - size=10, - act=paddle.activation.Softmax()) - return predict -``` - -- multi-layer perceptron: this network has two hidden fully-connected layers, both with ReLU activation, followed by a softmax output layer: - -```python -def multilayer_perceptron(img): - hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) - hidden2 = paddle.layer.fc(input=hidden1, - size=64, - act=paddle.activation.Relu()) - predict = paddle.layer.fc(input=hidden2, - size=10, - act=paddle.activation.Softmax()) - return predict -``` - -- convolution network LeNet-5: the input image is fed through two convolution-pooling layers, a fully-connected layer, and the softmax output layer: - -```python -def convolutional_neural_network(img): - - conv_pool_1 = paddle.networks.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - num_channel=1, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - - conv_pool_2 = paddle.networks.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - num_channel=20, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - - fc1 = paddle.layer.fc(input=conv_pool_2, - size=128, - act=paddle.activation.Tanh()) - - predict = paddle.layer.fc(input=fc1, - size=10, - act=paddle.activation.Softmax()) - return predict -``` - -PaddlePaddle provides a special layer `layer.data` for reading data. Let us create a data layer for reading images and connect it to a classification network created using one of the above three functions. We also need a cost layer for training the model. - -```python -paddle.init(use_gpu=False, trainer_count=1) - -images = paddle.layer.data( - name='pixel', type=paddle.data_type.dense_vector(784)) -label = paddle.layer.data( - name='label', type=paddle.data_type.integer_value(10)) - -predict = softmax_regression(images) -#predict = multilayer_perceptron(images) # uncomment for MLP -#predict = convolutional_neural_network(images) # uncomment for LeNet5 - -cost = paddle.layer.classification_cost(input=predict, label=label) -``` - -Now, it is time to specify training parameters. The number 0.9 in the following `Momentum` optimizer means that 90% of the current momentum comes from the momentum of the previous iteration. - -```python -parameters = paddle.parameters.create(cost) - -optimizer = paddle.optimizer.Momentum( - learning_rate=0.1 / 128.0, - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) - -trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) -``` - -Then we specify the training data `paddle.dataset.mnist.train()` and testing data `paddle.dataset.mnist.test()`. These two functions are *reader creators*; once called, they return a *reader*. A reader is a Python function, which, once called, returns a Python generator, which yields instances of data. - -Here `shuffle` is a reader decorator, which takes a reader A as its parameter, and returns a new reader B, where B calls A to read `buffer_size` data instances at a time into a buffer, then shuffles them and yields instances from the buffer. If you want better-shuffled data, try using a larger buffer size. - -`batch` is a special decorator, whose input is a reader and output is a *batch reader*, which doesn't yield an instance at a time, but a minibatch.
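Since the reader protocol is central here, a minimal sketch of it with a plain Python list standing in for MNIST (illustrative only):

```python
def reader_creator(data):
    # a *reader creator*: returns a reader, like paddle.dataset.mnist.train()
    def reader():
        # a *reader*: once called, returns a generator over data instances
        for item in data:
            yield item
    return reader

train = reader_creator([7, 2, 9, 4])
for instance in train():  # prints 7, 2, 9, 4
    print(instance)
```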
- -```python -lists = [] - -def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=paddle.reader.batched( - paddle.dataset.mnist.test(), batch_size=128)) - print "Test with Pass %d, Cost %f, %s\n" % ( - event.pass_id, result.cost, result.metrics) - lists.append((event.pass_id, result.cost, - result.metrics['classification_error_evaluator'])) - -trainer.train( - reader=paddle.reader.batched( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=128), - event_handler=event_handler, - num_passes=100) -``` - -During training, `trainer.train` invokes `event_handler` for certain events. This gives us a chance to print the training progress. - -``` -# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125} -# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375} -# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125} -# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625} -# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125} -# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197} -``` - -After the training, we can check the model's prediction accuracy. - -``` -# find the best pass -best = sorted(lists, key=lambda list: float(list[1]))[0] -print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) -print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) -``` - -Usually, with MNIST data, the softmax regression model can get accuracy around 92.34%, the MLP about 97.66%, and the convolutional network up to around 99.20%. Convolution layers have been widely considered a great invention for image processing. - - -## Conclusion -This tutorial describes a few basic Deep Learning models, viz. softmax regression, Multilayer Perceptron Network and Convolutional Neural Network. The subsequent tutorials will derive more sophisticated models from these, so it is crucial to understand these models for future learning. When our model evolved from a simple softmax regression to a slightly more complex Convolutional Neural Network, the recognition accuracy on the MNIST dataset improved significantly. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve on the results of an old one. Moreover, this tutorial introduced the basic flow of PaddlePaddle model design, from data provider, through model layer construction, to final training and prediction. Readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice. - -## References - -1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324. -2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014). -3.
Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190. -4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003. -5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007. -6. Cireşan, Dan Claudiu, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber. ["Deep, big, simple neural nets for handwritten digit recognition."](http://www.mitpressjournals.org/doi/abs/10.1162/NECO_a_00052) Neural computation 22, no. 12 (2010): 3207-3220. -7. Deng, Li, Michael L. Seltzer, Dong Yu, Alex Acero, Abdel-rahman Mohamed, and Geoffrey E. Hinton. ["Binary coding of speech spectrograms using a deep auto-encoder."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.185.1908&rep=rep1&type=pdf) In Interspeech, pp. 1692-1695. 2010. -8. Kégl, Balázs, and Róbert Busa-Fekete. ["Boosting products of base classifiers."](http://dl.acm.org/citation.cfm?id=1553439) In Proceedings of the 26th Annual International Conference on Machine Learning, pp. 497-504. ACM, 2009. -9. Rosenblatt, Frank. ["The perceptron: A probabilistic model for information storage and organization in the brain."](http://psycnet.apa.org/journals/rev/65/6/386/) Psychological review 65, no. 6 (1958): 386. -10. Bishop, Christopher M. ["Pattern recognition."](http://s3.amazonaws.com/academia.edu.documents/30428242/bg0137.pdf?AWSAccessKeyId=AKIAJ56TQJRTWSMTNPEA&Expires=1484816640&Signature=85Ad6%2Fca8T82pmHzxaSXermovIA%3D&response-content-disposition=inline%3B%20filename%3DPattern_recognition_and_machine_learning.pdf) Machine Learning 128 (2006): 1-58. - -
-Creative Commons License
This book is created by PaddlePaddle and is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. diff --git a/recognize_digits/data/get_mnist_data.sh b/recognize_digits/data/get_mnist_data.sh deleted file mode 100755 index 8d5cf179a940be06288d283e8a783b28d038acad..0000000000000000000000000000000000000000 --- a/recognize_digits/data/get_mnist_data.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env sh -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This script downloads the mnist data and unzips it. -set -e -DIR="$( cd "$(dirname "$0")" ; pwd -P )" -rm -rf "$DIR/raw_data" -mkdir "$DIR/raw_data" -cd "$DIR/raw_data" - -echo "Downloading..." - -for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte -do - if [ ! -e $fname ]; then - wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz - gunzip ${fname}.gz - fi -done - -cd $DIR -rm -f *.list -echo "./data/raw_data/train" > "$DIR/train.list" -echo "./data/raw_data/t10k" > "$DIR/test.list" diff --git a/recognize_digits/evaluate.py b/recognize_digits/evaluate.py deleted file mode 100755 index b91467e242e83fe47b910c44122acc26209dcaec..0000000000000000000000000000000000000000 --- a/recognize_digits/evaluate.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import re -import math - - -def get_best_pass(filename): - with open(filename, 'r') as f: - text = f.read() - pattern = re.compile( - 'Test.*?
cost=([0-9]+\.[0-9]+).*?classification_error_evaluator=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - sorted_results = sorted(results, key=lambda result: float(result[0])) - return sorted_results[0] - - -filename = sys.argv[1] -log = get_best_pass(filename) -classification_accuracy = (1 - float(log[1])) * 100 -print 'Best pass is %s, testing Avgcost is %s' % (log[2], log[0]) -print 'The classification accuracy is %.2f%%' % classification_accuracy diff --git a/recognize_digits/image/conv_layer.png b/recognize_digits/image/conv_layer.png deleted file mode 100644 index 67d2e3337c94dc3effbf210416062401601d3ec8..0000000000000000000000000000000000000000 Binary files a/recognize_digits/image/conv_layer.png and /dev/null differ diff --git a/recognize_digits/image/conv_layer_en.png b/recognize_digits/image/conv_layer_en.png deleted file mode 100755 index 60ee6e183520e5d75bcfa3ba3c7327dd94847f85..0000000000000000000000000000000000000000 Binary files a/recognize_digits/image/conv_layer_en.png and /dev/null differ diff --git a/recognize_digits/index.en.html b/recognize_digits/index.en.html deleted file mode 100644 index bec542ca357adc52da20bcc6a9eba26a2c7d580f..0000000000000000000000000000000000000000 --- a/recognize_digits/index.en.html +++ /dev/null @@ -1,361 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/recognize_digits/load_data.py b/recognize_digits/load_data.py deleted file mode 100644 index a3055a591ee897afccc1e56c4f8abde5b274e93f..0000000000000000000000000000000000000000 --- a/recognize_digits/load_data.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -import matplotlib.pyplot as plt -import random -import struct - - -def read_data(path, filename): - with open(path + filename + "-images-idx3-ubyte", - "rb") as f: # open picture file - magic, n, rows, cols = struct.unpack(">IIII", f.read(16)) - images = np.fromfile( - f, 'ubyte', - count=n * rows * cols).reshape(n, rows, cols).astype('float32') - - with open(path + filename + "-labels-idx1-ubyte", - "rb") as l: # open label file - magic, n = struct.unpack(">II", l.read(8)) - labels = np.fromfile(l, 'ubyte', count=n).astype("int") - - return images, labels - - -if __name__ == "__main__": - train_images, train_labels = read_data("./data/raw_data/", "train") - test_images, test_labels = read_data("./data/raw_data/", "t10k") - label_list = [] - for i in range(10): - index = random.randint(0, train_images.shape[0] - 1) - label_list.append(train_labels[index]) - plt.subplot(1, 10, i + 1) - plt.imshow(train_images[index], cmap="Greys_r") - plt.axis('off') - print('label: %s' % (label_list, )) - plt.show() diff --git a/recognize_digits/mnist_model.py b/recognize_digits/mnist_model.py deleted file mode 100644 index 4bece1bbe5988c0acc6341d3fb1a6b81c3dcbed7..0000000000000000000000000000000000000000 --- a/recognize_digits/mnist_model.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -is_predict = get_config_arg("is_predict", bool, False) - -####################Data Configuration ################## - -if not is_predict: - data_dir = './data/' - define_py_data_sources2( - train_list=data_dir + 'train.list', - test_list=data_dir + 'test.list', - module='mnist_provider', - obj='process') - -######################Algorithm Configuration ############# -settings( - batch_size=128, - learning_rate=0.1 / 128.0, - learning_method=MomentumOptimizer(0.9), - regularization=L2Regularization(0.0005 * 128)) - -#######################Network Configuration ############# - -data_size = 1 * 28 * 28 -label_size = 10 -img = data_layer(name='pixel', size=data_size) - - -def softmax_regression(img): - predict = fc_layer(input=img, size=10, act=SoftmaxActivation()) - return predict - - -def multilayer_perceptron(img): - # The first fully-connected layer - hidden1 = fc_layer(input=img, size=128, act=ReluActivation()) - # The second fully-connected layer and the corresponding activation function - hidden2 = fc_layer(input=hidden1, size=64, act=ReluActivation()) - # The third fully-connected layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) - return predict - - -def convolutional_neural_network(img): - # first conv layer - conv_pool_1 = simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - num_channel=1, - pool_size=2, - pool_stride=2, - act=TanhActivation()) - # second conv layer - conv_pool_2 = simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - num_channel=20, - pool_size=2, - pool_stride=2, - act=TanhActivation()) - # The first fully-connected layer - fc1 = fc_layer(input=conv_pool_2, size=128, act=TanhActivation()) - # The softmax layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = fc_layer(input=fc1, size=10, act=SoftmaxActivation()) - return predict - - -predict = softmax_regression(img) -#predict = multilayer_perceptron(img) -#predict = convolutional_neural_network(img) - -if not is_predict: - lbl = data_layer(name="label", size=label_size) - inputs(img, lbl) - outputs(classification_cost(input=predict, label=lbl)) -else: - outputs(predict) diff --git a/recognize_digits/mnist_provider.py b/recognize_digits/mnist_provider.py deleted file mode 100644 index b6f1d9662ed46aa18030543246b096fdc2e892cc..0000000000000000000000000000000000000000 --- a/recognize_digits/mnist_provider.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.trainer.PyDataProvider2 import * -import numpy as np -import struct - - -# Define a py data provider -@provider( - input_types={'pixel': dense_vector(28 * 28), - 'label': integer_value(10)}) -def process(settings, filename): # settings is not used currently.
- with open(filename + "-images-idx3-ubyte", "rb") as f: # open picture file - magic, n, rows, cols = struct.unpack(">IIII", f.read(16)) - images = np.fromfile( - f, 'ubyte', - count=n * rows * cols).reshape(n, rows, cols).astype('float32') - images = images / 255.0 * 2.0 - 1.0 # normalized to [-1,1] - - with open(filename + "-labels-idx1-ubyte", "rb") as l: # open label file - magic, n = struct.unpack(">II", l.read(8)) - labels = np.fromfile(l, 'ubyte', count=n).astype("int") - - for i in xrange(n): - yield {"pixel": images[i, :], 'label': labels[i]} diff --git a/recognize_digits/plot_cost.py b/recognize_digits/plot_cost.py deleted file mode 100644 index 1f79e835f6ccb282102a97699ae9c08fa5fd3aae..0000000000000000000000000000000000000000 --- a/recognize_digits/plot_cost.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import matplotlib.pyplot as plt -import re -import sys - - -def plot_log(filename): - with open(filename, 'r') as f: - text = f.read() - pattern = re.compile( - 'AvgCost=([0-9]+\.[0-9]+).*?Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - train_cost, test_cost, pass_ = zip(*results) - train_cost_float = map(float, train_cost) - test_cost_float = map(float, test_cost) - pass_int = map(int, pass_) - plt.plot(pass_int, train_cost_float, 'red', label='Train') - plt.plot(pass_int, test_cost_float, 'g--', label='Test') - plt.ylabel('AvgCost') - plt.xlabel('Epoch') - - # Now add the legend with some customizations. - legend = plt.legend(loc='upper right', shadow=False) - - # The frame is matplotlib.patches.Rectangle instance surrounding the legend. - frame = legend.get_frame() - frame.set_facecolor('0.90') - - # Set the fontsize - for label in legend.get_texts(): - label.set_fontsize('large') - - for label in legend.get_lines(): - label.set_linewidth(1.5) # the legend line width - - plt.show() - - -if __name__ == '__main__': - plot_log(sys.argv[1]) diff --git a/recognize_digits/predict.py b/recognize_digits/predict.py deleted file mode 100644 index 0a6c87bf1a1b982141fd9924f33532ba77e37974..0000000000000000000000000000000000000000 --- a/recognize_digits/predict.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Usage: predict.py -c CONF -d DATA -m MODEL - -Arguments: - CONF train conf - DATA MNIST Data - MODEL Model - -Options: - -h --help - -c conf - -d data - -m model -""" - -import os -import sys -from docopt import docopt -import numpy as np - -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import dense_vector -from paddle.trainer.config_parser import parse_config - -from load_data import read_data - - -class Prediction(): - def __init__(self, train_conf, data_dir, model_dir): - - conf = parse_config(train_conf, 'is_predict=1') - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(model_dir) - - self.images, self.labels = read_data(data_dir, "t10k") - self.images = self.images / 255.0 * 2.0 - 1.0 # normalized to [-1,1] - - slots = [dense_vector(28 * 28)] - self.converter = DataProviderConverter(slots) - - def predict(self, index): - input = self.converter([[self.images[index].flatten().tolist()]]) - output = self.network.forwardTest(input) - prob = output[0]["value"] - predict = np.argsort(-prob) - print "Predicted probability of each digit:" - print prob - print "Predict Number: %d" % predict[0][0] - print "Actual Number: %d" % self.labels[index] - - -def main(): - arguments = docopt(__doc__) - train_conf = arguments['CONF'] - data_dir = arguments['DATA'] - model_dir = arguments['MODEL'] - swig_paddle.initPaddle("--use_gpu=0") - predictor = Prediction(train_conf, data_dir, model_dir) - while True: - index = int(raw_input("Input image_id [0~9999]: ")) - predictor.predict(index) - - -if __name__ == '__main__': - main() diff --git a/recognize_digits/train.py b/recognize_digits/train.py deleted file mode 100644 index 7ee1c83ad1bd8ec25b78687493a84d79afe05ac3..0000000000000000000000000000000000000000 --- a/recognize_digits/train.py +++ /dev/null @@ -1,111 +0,0 @@ -import paddle.v2 as paddle - - -def softmax_regression(img): - predict = paddle.layer.fc(input=img, - size=10, - act=paddle.activation.Softmax()) - return predict - - -def multilayer_perceptron(img): - # The first fully-connected layer - hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) - # The second fully-connected layer and the according activation function - hidden2 = paddle.layer.fc(input=hidden1, - size=64, - act=paddle.activation.Relu()) - # The thrid fully-connected layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = paddle.layer.fc(input=hidden2, - size=10, - act=paddle.activation.Softmax()) - return predict - - -def convolutional_neural_network(img): - # first conv layer - conv_pool_1 = paddle.networks.simple_img_conv_pool( - input=img, - filter_size=5, - num_filters=20, - num_channel=1, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - # second conv layer - conv_pool_2 = paddle.networks.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - num_channel=20, - pool_size=2, - pool_stride=2, - act=paddle.activation.Tanh()) - # The first fully-connected layer - fc1 = paddle.layer.fc(input=conv_pool_2, - size=128, - act=paddle.activation.Tanh()) - # The softmax layer, note that the hidden size should be 10, - # which is the number of unique digits - predict = paddle.layer.fc(input=fc1, - size=10, - act=paddle.activation.Softmax()) - return predict - - -paddle.init(use_gpu=False, trainer_count=1) - -# define network topology -images = paddle.layer.data( - name='pixel', 
type=paddle.data_type.dense_vector(784)) -label = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10)) - -# Here we can build the prediction network in different ways. Please -# choose one by uncommenting the corresponding line. -predict = softmax_regression(images) -#predict = multilayer_perceptron(images) -#predict = convolutional_neural_network(images) - -cost = paddle.layer.classification_cost(input=predict, label=label) - -parameters = paddle.parameters.create(cost) - -optimizer = paddle.optimizer.Momentum( - learning_rate=0.1 / 128.0, - momentum=0.9, - regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) - -trainer = paddle.trainer.SGD(cost=cost, - parameters=parameters, - update_equation=optimizer) - -lists = [] - - -def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=paddle.reader.batched( - paddle.dataset.mnist.test(), batch_size=128)) - print "Test with Pass %d, Cost %f, %s\n" % (event.pass_id, result.cost, - result.metrics) - lists.append((event.pass_id, result.cost, - result.metrics['classification_error_evaluator'])) - - -trainer.train( - reader=paddle.reader.batched( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=128), - event_handler=event_handler, - num_passes=100) - -# find the best pass -best = sorted(lists, key=lambda list: float(list[1]))[0] -print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) -print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) diff --git a/recognize_digits/train.sh b/recognize_digits/train.sh deleted file mode 100755 index dfe59b746e10c41b24e9a431cfa092e3dad31b4f..0000000000000000000000000000000000000000 --- a/recognize_digits/train.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -config=mnist_model.py -output=./softmax_mnist_model -log=softmax_train.log - - - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=0 \ ---trainer_count=1 \ ---num_passes=100 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/recommender_system/README.en.md b/recommender_system/README.en.md deleted file mode 100644 index 76ef7b256f6d247f60f0a7b9acac4df89412d915..0000000000000000000000000000000000000000 --- a/recommender_system/README.en.md +++ /dev/null @@ -1,111 +0,0 @@ -# Personalized Recommendation - -The source code of this tutorial is in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system).
- -## Background - -With the fast growth of e-commerce, online videos, and online reading businesses, users have to rely on recommender systems to avoid manually browsing a tremendous volume of choices. Recommender systems understand users' interests by mining user behavior and other properties of users and products. - -Some well-known approaches include: - -- User behavior-based approach. A well-known method is collaborative filtering. The underlying assumption is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person. - -- Content-based recommendation[[1](#reference)]. This approach infers feature vectors that represent products from their descriptions. It also infers feature vectors that represent users' interests. Then it measures the relevance of users and products by some distances between these feature vectors. - -- Hybrid approach[[2](#reference)]: This approach uses the content-based information to help address the cold start problem[[6](#reference)] in the behavior-based approach. - -Among these options, collaborative filtering might be the most studied one. Some of its variants include user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based. - -This tutorial explains a deep learning based approach and how to implement it using PaddlePaddle. We will train a model using a dataset that includes user information, movie information, and ratings. Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs. - - -## Model Overview - -To know more about deep learning based recommendation, let us start by going over the YouTube recommender system[[7](#reference)] before introducing our hybrid model. - - -### YouTube's Deep Learning Recommendation Model - -YouTube is a video-sharing Web site with one of the largest user bases in the world. Its recommender system serves more than a billion users. This system is composed of two major parts: candidate generation and ranking. The former selects a few hundred candidates from millions of videos, and the latter ranks and outputs the top 10. - -

-Figure 1. YouTube recommender system overview. -

-#### Candidate Generation Network - -YouTube models candidate generation as a multiclass classification problem with a huge number of classes, equal to the number of videos. The architecture of the model is as follows: - -

-Figure 2. Deep candidate generation model. -

-The first stage of this model maps watching history and search queries into fixed-length representative features. Then, an MLP (multi-layer perceptron, as described in the [Recognize Digits](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md) tutorial) takes the concatenation of all representative vectors. The output of the MLP represents the user's *intrinsic interests*. At training time, it is used together with a softmax output layer for minimizing the classification error. At serving time, it is used to compute the relevance of the user with all movies. - -For a user $U$, the predicted watching probability of video $i$ is - -$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ - -where $u$ is the representative vector of user $U$, $V$ is the corpus of all videos, and $v_i$ is the representative vector of the $i$-th video. $u$ and $v_i$ are vectors of the same length, so we can compute their dot product using a fully connected layer. - -This model could have a performance issue as the softmax output covers millions of classification labels. To optimize performance, at training time the authors down-sample negative samples, so the actual number of classes is reduced to thousands. At serving time, the authors ignore the normalization of the softmax outputs, because the results are used only for ranking. - - -#### Ranking Network - -The architecture of the ranking network is similar to that of the candidate generation network. Similar to ranking models widely used in online advertising, it uses rich features like video ID, last watching time, etc. The output layer of the ranking network is a weighted logistic regression, which rates all candidate videos. - - -### Hybrid Model - -In this section, let us introduce our movie recommendation system. - -In our network, the input includes features of users and movies. The user feature includes four properties: user ID, gender, occupation, and age. Movie features include their IDs, genres, and titles. - -We use fully-connected layers to map user features into representative feature vectors and concatenate them. The process for movie features is similar, except that for movie titles we feed them into a text convolution network, as described in the [sentiment analysis tutorial](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md), to get a fixed-length representative feature vector. - -Given the feature vectors of users and movies, we compute the relevance using cosine similarity. We minimize the squared error at training time. - -

-Figure 3. A hybrid recommendation model. -
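To make the scoring rule concrete, here is a minimal NumPy sketch of the relevance computation described above. It is an illustration only, not the tutorial's PaddlePaddle code; the vector values are made up, and in the real model `user_feature` and `movie_feature` would come from the fully-connected layers:

```python
import numpy as np

# Stand-ins for the representative vectors produced by the user and
# movie towers (random values, for illustration only).
user_feature = np.random.rand(256)
movie_feature = np.random.rand(256)

# Cosine similarity of the two vectors is the predicted relevance score.
score = np.dot(user_feature, movie_feature) / (
    np.linalg.norm(user_feature) * np.linalg.norm(movie_feature))

# Training minimizes the squared error between this score and the true
# rating (rescaled so the similarity range can cover it).
rating = 0.5
loss = (score - rating) ** 2
```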

-## Dataset - -We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m.zip) dataset to train our model. It includes 1,000,000 ratings of 4,000 movies from 6,000 users. Each rating is an integer in the range 1 to 5. Thanks to GroupLens Research for collecting, processing and publishing the dataset. - -We don't have to download and preprocess the data. Instead, we can use PaddlePaddle's dataset module `paddle.v2.dataset.movielens`. - - -## Model Specification - - - -## Training - - - -## Inference - - - -## Conclusion - -This tutorial goes over traditional approaches in recommender systems and a deep learning based approach. We also show how to train and use the model with PaddlePaddle. Deep learning has already been widely used in computer vision and NLP; we look forward to its new successes in recommender systems. - -## Reference - -1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. -2. Robin Burke, [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Vol. 4321, Springer-Verlag, Berlin, Germany, May 2007, 978-3-540-72078-2. -3. P. Resnick, N. Iacovou, et al. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. -4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th International Conference on World Wide Web*. ACM, 2001. -5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: Combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. -6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). -7. Covington, P., Adams, J., Sargin, E. [Deep neural networks for YouTube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf). Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. - -
-Creative Commons
This tutorial was created by the PaddlePaddle community and published under the Creative Commons 4.0 License. diff --git a/recommender_system/README.md b/recommender_system/README.md deleted file mode 100644 index 766c2d4510ba2fc931ecd97436d6348718f66b1c..0000000000000000000000000000000000000000 --- a/recommender_system/README.md +++ /dev/null @@ -1,382 +0,0 @@ -# 个性化推荐 - -本教程源代码目录在[book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/recommender_system),初次使用请参考PaddlePaddle[安装教程](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html)。 - -## 背景介绍 - -在网络技术不断发展和电子商务规模不断扩大的背景下,商品数量和种类快速增长,用户需要花费大量时间才能找到自己想买的商品,这就是信息超载问题。为了解决这个难题,推荐系统(Recommender System)应运而生。 - -个性化推荐系统是信息过滤系统(Information Filtering System)的子集,它可以用在很多领域,如电影、音乐、电商和 Feed 流推荐等。推荐系统通过分析、挖掘用户行为,发现用户的个性化需求与兴趣特点,将用户可能感兴趣的信息或商品推荐给用户。与搜索引擎不同,推荐系统不需要用户准确地描述出自己的需求,而是根据分析历史行为建模,主动提供满足用户兴趣和需求的信息。 - -传统的推荐系统方法主要有: - -- 协同过滤推荐(Collaborative Filtering Recommendation):该方法收集分析用户历史行为、活动、偏好,计算一个用户与其他用户的相似度,利用目标用户的相似用户对商品评价的加权评价值,来预测目标用户对特定商品的喜好程度。优点是可以给用户推荐未浏览过的新产品;缺点是对于没有任何行为的新用户存在冷启动的问题,同时也存在用户与商品之间的交互数据不够多造成的稀疏问题,会导致模型难以找到相近用户。 -- 基于内容过滤推荐[[1](#参考文献)](Content-based Filtering Recommendation):该方法利用商品的内容描述,抽象出有意义的特征,通过计算用户的兴趣和商品描述之间的相似度,来给用户做推荐。优点是简单直接,不需要依据其他用户对商品的评价,而是通过商品属性进行商品相似度度量,从而推荐给用户所感兴趣商品的相似商品;缺点是对于没有任何行为的新用户同样存在冷启动的问题。 -- 组合推荐[[2](#参考文献)](Hybrid Recommendation):运用不同的输入和技术共同进行推荐,以弥补各自推荐技术的缺点。 - -其中协同过滤是应用最广泛的技术之一,它又可以分为多个子类:基于用户(User-Based)的推荐[[3](#参考文献)]、基于物品(Item-Based)的推荐[[4](#参考文献)]、基于社交网络关系(Social-Based)的推荐[[5](#参考文献)]、基于模型(Model-based)的推荐等。1994年明尼苏达大学推出的GroupLens系统[[3](#参考文献)]一般被认为是推荐系统成为一个相对独立的研究方向的标志。该系统首次提出了基于协同过滤来完成推荐任务的思想,此后,基于该模型的协同过滤推荐引领了推荐系统十几年的发展方向。 - -深度学习具有优秀的自动提取特征的能力,能够学习多层次的抽象特征表示,并对异质或跨域的内容信息进行学习,可以一定程度上处理推荐系统冷启动问题[[6](#参考文献)]。本教程主要介绍个性化推荐的深度学习模型,以及如何使用PaddlePaddle实现模型。 - -## 效果展示 - -我们使用包含用户信息、电影信息与电影评分的数据集作为个性化推荐的应用场景。当我们训练好模型后,只需要输入对应的用户ID和电影ID,就可以得出一个匹配的分数(范围[1,5],分数越高视为兴趣越大),然后根据所有电影的推荐得分排序,推荐给用户可能感兴趣的电影。 - -``` -Input movie_id: 1962 -Input user_id: 1 -Prediction Score is 4.25 -``` - -## 模型概览 - -本章中,我们首先介绍YouTube的视频推荐系统[[7](#参考文献)],然后介绍我们实现的融合推荐模型。 - -### YouTube的深度神经网络推荐系统 - -YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示: - -

-图1. YouTube 推荐系统结构 -

-#### 候选生成网络(Candidate Generation Network) - -候选生成网络将推荐问题建模为一个类别数极大的多类分类问题:对于一个YouTube用户,使用其观看历史(视频ID)、搜索词记录(search tokens)、人口学信息(如地理位置、用户登录设备)、二值特征(如性别,是否登录)和连续特征(如用户年龄)等,对视频库中所有视频进行多分类,得到每一类别的分类结果(即每一个视频的推荐概率),最终输出概率较高的几百个视频。 - -首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线性多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/recognize_digits/README.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。 - -

-图2. 候选生成网络结构 -

-对于一个用户$U$,预测此刻用户要观看的视频$\omega$为视频$i$的概率公式为: - -$$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$ - -其中$u$为用户$U$的特征表示,$V$为视频库集合,$v_i$为视频库中第$i$个视频的特征表示。$u$和$v_i$为长度相等的向量,两者点积可以通过全连接层实现。 - -考虑到softmax分类的类别数非常多,为了保证一定的计算效率:1)训练阶段,使用负样本类别采样将实际计算的类别数缩小至数千;2)推荐(预测)阶段,忽略softmax的归一化计算(不影响结果),将类别打分问题简化为点积(dot product)空间中的最近邻(nearest neighbor)搜索问题,取与$u$最近的$k$个视频作为生成的候选。 - -#### 排序网络(Ranking Network) - -排序网络的结构类似于候选生成网络,但是它的目标是对候选进行更细致的打分排序。和传统广告排序中的特征抽取方法类似,这里也构造了大量的用于视频排序的相关特征(如视频 ID、上次观看时间等)。这些特征的处理方式和候选生成网络类似,不同之处是排序网络的顶部是一个加权逻辑回归(weighted logistic regression),它对所有候选视频进行打分,从高到低排序后将分数较高的一些视频返回给用户。 - -### 融合推荐模型 - -在下文的电影推荐系统中: - -1. 首先,使用用户特征和电影特征作为神经网络的输入,其中: - - - 用户特征融合了四个属性信息,分别是用户ID、性别、职业和年龄。 - - - 电影特征融合了三个属性信息,分别是电影ID、电影类型ID和电影名称。 - -2. 对用户特征,将用户ID映射为维度大小为256的向量表示,输入全连接层,并对其他三个属性也做类似的处理。然后将四个属性的特征表示分别全连接并相加。 - -3. 对电影特征,将电影ID以类似用户ID的方式进行处理,电影类型ID以向量的形式直接输入全连接层,电影名称用文本卷积神经网络(详见[第5章](https://github.com/PaddlePaddle/book/blob/develop/understand_sentiment/README.md))得到其定长向量表示。然后将三个属性的特征表示分别全连接并相加。 - -4. 得到用户和电影的向量表示后,计算二者的余弦相似度作为推荐系统的打分。最后,用该相似度打分和用户真实打分的差异的平方作为该回归模型的损失函数。 - -

-图3. 融合推荐模型 -
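下面用一段极简的 NumPy 示意代码说明上述第4步中“余弦相似度打分 + 平方误差损失”的计算(仅为示意,变量名与数值均为假设,并非教程中的 PaddlePaddle 实现):

```python
import numpy as np

# 假设这是上文全连接层输出的 256 维用户特征与电影特征(此处用随机值代替)
user_feature = np.random.rand(256)
movie_feature = np.random.rand(256)

# 二者的余弦相似度即推荐系统的打分
score = np.dot(user_feature, movie_feature) / (
    np.linalg.norm(user_feature) * np.linalg.norm(movie_feature))

# 与(缩放后的)用户真实打分之差的平方即回归损失
rating = 0.5
loss = (score - rating) ** 2
```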

-## 数据准备 - -### 数据介绍与下载 - -我们以 [MovieLens 百万数据集(ml-1m)](http://files.grouplens.org/datasets/movielens/ml-1m.zip)为例进行介绍。ml-1m 数据集包含了 6,000 位用户对 4,000 部电影的 1,000,000 条评价(评分范围 1~5 分,均为整数),由 GroupLens Research 实验室搜集整理。 - -您可以运行 `data/getdata.sh` 下载数据,如果数据获取成功,您将在目录`data/ml-1m`中看到下面的文件: - -``` -movies.dat ratings.dat users.dat README -``` - -- movies.dat:电影特征数据,格式为`电影ID::电影名称::电影类型` -- ratings.dat:评分数据,格式为`用户ID::电影ID::评分::时间戳` -- users.dat:用户特征数据,格式为`用户ID::性别::年龄::职业::邮编` -- README:数据集的详细描述 - -### 数据预处理 - -首先安装 Python 第三方库(推荐使用 Virtualenv): - -```shell -pip install -r data/requirements.txt -``` - -其次在预处理`./preprocess.sh`过程中,我们将字段配置文件`data/config.json`转化为meta配置文件`meta_config.json`,并生成对应的meta文件`meta.bin`,以完成数据文件的序列化。然后再将`ratings.dat`分为训练集、测试集两部分,把它们的地址写入`train.list`和`test.list`。 - -运行成功后目录`./data` 新增以下文件: - -``` -meta_config.json meta.bin ratings.dat.train ratings.dat.test train.list test.list -``` - -- meta.bin: meta文件是Python的pickle对象, 存储着电影和用户信息。 -- meta_config.json: meta配置文件,用来具体描述如何解析数据集中的每一个字段,由字段配置文件生成。 -- ratings.dat.train和ratings.dat.test: 训练集和测试集,训练集已经随机打乱。 -- train.list和test.list: 训练集和测试集的文件地址列表。 - -### 提供数据给 PaddlePaddle - -我们使用 Python 接口传递数据给系统,下面 `dataprovider.py` 给出了完整示例。 - -```python -from paddle.trainer.PyDataProvider2 import * -from common_utils import meta_to_header - -def __list_to_map__(lst): # 将list转为map - ret_val = dict() - for each in lst: - k, v = each - ret_val[k] = v - return ret_val - -def hook(settings, meta, **kwargs): # 读取meta.bin - # 定义电影特征 - movie_headers = list(meta_to_header(meta, 'movie')) - settings.movie_names = [h[0] for h in movie_headers] - headers = movie_headers - - # 定义用户特征 - user_headers = list(meta_to_header(meta, 'user')) - settings.user_names = [h[0] for h in user_headers] - headers.extend(user_headers) - - # 加载评分信息 - headers.append(("rating", dense_vector(1))) - - settings.input_types = __list_to_map__(headers) - settings.meta = meta - -@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - with open(filename, 'r') as f: - for line in f: - # 从评分文件中读取评分 - user_id, movie_id, score = map(int, line.split('::')[:-1]) - # 将评分平移到[-2, +2]范围内的整数 - score = float(score - 3) - - movie_meta = settings.meta['movie'][movie_id] - user_meta = settings.meta['user'][user_id] - - # 添加电影ID与电影特征 - outputs = [('movie_id', movie_id - 1)] - for i, each_meta in enumerate(movie_meta): - outputs.append((settings.movie_names[i + 1], each_meta)) - - # 添加用户ID与用户特征 - outputs.append(('user_id', user_id - 1)) - for i, each_meta in enumerate(user_meta): - outputs.append((settings.user_names[i + 1], each_meta)) - - # 添加评分 - outputs.append(('rating', [score])) - # 将数据返回给 paddle - yield __list_to_map__(outputs) -``` - -## 模型配置说明 - -### 数据定义 - -加载`meta.bin`文件,并通过`define_py_data_sources2`定义从dataprovider中读入数据: - -```python -from paddle.trainer_config_helpers import * - -try: - import cPickle as pickle -except ImportError: - import pickle - -is_predict = get_config_arg('is_predict', bool, False) - -META_FILE = 'data/meta.bin' - -# 加载 meta 文件 -with open(META_FILE, 'rb') as f: - meta = pickle.load(f) - -if not is_predict: - define_py_data_sources2( - 'data/train.list', - 'data/test.list', - module='dataprovider', - obj='process', - args={'meta': meta}) -``` - -### 算法配置 - -这里我们设置了batch size、网络初始学习率和RMSProp自适应优化方法。 - -```python -settings( - batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) -``` - -### 模型结构 - -1. 
定义数据输入和参数维度。 - - ```python - movie_meta = meta['movie']['__meta__']['raw_meta'] - user_meta = meta['user']['__meta__']['raw_meta'] - - movie_id = data_layer('movie_id', size=movie_meta[0]['max']) # 电影ID - title = data_layer('title', size=len(movie_meta[1]['dict'])) # 电影名称 - genres = data_layer('genres', size=len(movie_meta[2]['dict'])) # 电影类型 - user_id = data_layer('user_id', size=user_meta[0]['max']) # 用户ID - gender = data_layer('gender', size=len(user_meta[1]['dict'])) # 用户性别 - age = data_layer('age', size=len(user_meta[2]['dict'])) # 用户年龄 - occupation = data_layer('occupation', size=len(user_meta[3]['dict'])) # 用户职业 - - embsize = 256 # 向量维度 - ``` - -2. 构造“电影”特征。 - - ```python - # 电影ID和电影类型分别映射到其对应的特征隐层(256维)。 - movie_id_emb = embedding_layer(input=movie_id, size=embsize) - movie_id_hidden = fc_layer(input=movie_id_emb, size=embsize) - - genres_emb = fc_layer(input=genres, size=embsize) - - # 对于电影名称,一个ID序列表示的词语序列,在输入卷积层后, - # 将得到每个时间窗口的特征(序列特征),然后通过在时间维度 - # 降采样得到固定维度的特征,整个过程在text_conv_pool实现 - title_emb = embedding_layer(input=title, size=embsize) - title_hidden = text_conv_pool( - input=title_emb, context_len=5, hidden_size=embsize) - - # 将三个属性的特征表示分别全连接并相加,结果即是电影特征的最终表示 - movie_feature = fc_layer( - input=[movie_id_hidden, title_hidden, genres_emb], size=embsize) - ``` - -3. 构造“用户”特征。 - - ```python - # 将用户ID,性别,职业,年龄四个属性分别映射到其特征隐层。 - user_id_emb = embedding_layer(input=user_id, size=embsize) - user_id_hidden = fc_layer(input=user_id_emb, size=embsize) - - gender_emb = embedding_layer(input=gender, size=embsize) - gender_hidden = fc_layer(input=gender_emb, size=embsize) - - age_emb = embedding_layer(input=age, size=embsize) - age_hidden = fc_layer(input=age_emb, size=embsize) - - occup_emb = embedding_layer(input=occupation, size=embsize) - occup_hidden = fc_layer(input=occup_emb, size=embsize) - - # 同样将这四个属性分别全连接并相加形成用户特征的最终表示。 - user_feature = fc_layer( - input=[user_id_hidden, gender_hidden, age_hidden, occup_hidden], - size=embsize) - ``` - -4. 计算余弦相似度,定义损失函数和网络输出。 - - ```python - similarity = cos_sim(a=movie_feature, b=user_feature, scale=2) - - # 训练时,采用regression_cost作为损失函数计算回归误差代价,并作为网络的输出。 - # 预测时,网络的输出即为余弦相似度。 - if not is_predict: - lbl = data_layer('rating', size=1) - cost = regression_cost(input=similarity, label=lbl) - outputs(cost) - else: - outputs(similarity) - ``` - -## 训练模型 - -执行`sh train.sh` 开始训练模型,将日志写入文件 `log.txt` 并打印在屏幕上。其中指定了总共需要执行 50 个pass,各参数含义见注释。 - -```shell -set -e -# --config: 神经网络配置文件 -# --save_dir: 模型保存路径 -# --use_gpu: 是否使用GPU(默认不使用) -# --trainer_count: 一台机器上面的线程数量 -# --test_all_data_in_one_period: 每个训练周期训练一次所有数据,否则每个训练周期测试batch_size个batch数据 -# --log_period: 训练log_period个batch后打印日志 -# --dot_period: 每训练dot_period个batch后打印一个"." -# --num_passes: 总训练轮数 -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --use_gpu=false \ - --trainer_count=4 \ - --test_all_data_in_one_period=true \ - --log_period=100 \ - --dot_period=1 \ - --num_passes=50 2>&1 | tee 'log.txt' -``` - -成功的输出类似如下: - -```bash -I0117 01:01:48.585651 9998 TrainerInternal.cpp:165] Batch=100 samples=160000 AvgCost=0.600042 CurrentCost=0.600042 Eval: CurrentEval: -................................................................................................... -I0117 01:02:53.821918 9998 TrainerInternal.cpp:165] Batch=200 samples=320000 AvgCost=0.602855 CurrentCost=0.605668 Eval: CurrentEval: -................................................................................................... -I0117 01:03:58.937922 9998 TrainerInternal.cpp:165] Batch=300 samples=480000 AvgCost=0.605199 CurrentCost=0.609887 Eval: CurrentEval: -...................................................................................................
-I0117 01:05:04.083251 9998 TrainerInternal.cpp:165] Batch=400 samples=640000 AvgCost=0.608693 CurrentCost=0.619175 Eval: CurrentEval: -................................................................................................... -I0117 01:06:09.155859 9998 TrainerInternal.cpp:165] Batch=500 samples=800000 AvgCost=0.613273 CurrentCost=0.631591 Eval: CurrentEval: -.................................................................I0117 01:06:51.109654 9998 TrainerInternal.cpp:181] - Pass=49 Batch=565 samples=902826 AvgCost=0.614772 Eval: -I0117 01:07:04.205142 9998 Tester.cpp:115] Test samples=97383 cost=0.721995 Eval: -I0117 01:07:04.205281 9998 GradientMachine.cpp:113] Saving parameters to ./output/pass-00049 -``` - -## 应用模型 - -在训练了几轮以后,您可以对模型进行评估。运行以下命令,可以通过选择最小训练误差的一轮参数得到最好轮次的模型。 - -```shell -./evaluate.py log.txt -``` - -您将看到: - -```shell -Best pass is 00036, error is 0.719281, which means predict get error as 0.424052 -evaluating from pass output/pass-00036 -``` - -预测任何用户对于任何一部电影评价的命令如下: - -```shell -python prediction.py 'output/pass-00036/' -``` - -预测程序将读取用户的输入,然后输出预测分数。您会看到如下命令行界面: - -``` -Input movie_id: 1962 -Input user_id: 1 -Prediction Score is 4.25 -``` - -## 总结 - -本章介绍了传统的推荐系统方法和YouTube的深度神经网络推荐系统,并以电影推荐为例,使用PaddlePaddle训练了一个个性化推荐神经网络模型。推荐系统几乎涵盖了电商系统、社交网络、广告推荐、搜索引擎等领域的方方面面,而在图像处理、自然语言处理等领域已经发挥重要作用的深度学习技术,也将会在推荐系统领域大放异彩。 - -## 参考文献 - -1. [Peter Brusilovsky](https://en.wikipedia.org/wiki/Peter_Brusilovsky) (2007). *The Adaptive Web*. p. 325. -2. Robin Burke , [Hybrid Web Recommender Systems](http://www.dcs.warwick.ac.uk/~acristea/courses/CS411/2010/Book%20-%20The%20Adaptive%20Web/HybridWebRecommenderSystems.pdf), pp. 377-408, The Adaptive Web, Peter Brusilovsky, Alfred Kobsa, Wolfgang Nejdl (Ed.), Lecture Notes in Computer Science, Springer-Verlag, Berlin, Germany, Lecture Notes in Computer Science, Vol. 4321, May 2007, 978-3-540-72078-2. -3. P. Resnick, N. Iacovou, etc. “[GroupLens: An Open Architecture for Collaborative Filtering of Netnews](http://ccs.mit.edu/papers/CCSWP165.html)”, Proceedings of ACM Conference on Computer Supported Cooperative Work, CSCW 1994. pp.175-186. -4. Sarwar, Badrul, et al. "[Item-based collaborative filtering recommendation algorithms.](http://files.grouplens.org/papers/www10_sarwar.pdf)" *Proceedings of the 10th international conference on World Wide Web*. ACM, 2001. -5. Kautz, Henry, Bart Selman, and Mehul Shah. "[Referral Web: combining social networks and collaborative filtering.](http://www.cs.cornell.edu/selman/papers/pdf/97.cacm.refweb.pdf)" Communications of the ACM 40.3 (1997): 63-65. APA -6. Yuan, Jianbo, et al. ["Solving Cold-Start Problem in Large-scale Recommendation Engines: A Deep Learning Approach."](https://arxiv.org/pdf/1611.05480v1.pdf) *arXiv preprint arXiv:1611.05480* (2016). -7. Covington P, Adams J, Sargin E. [Deep neural networks for youtube recommendations](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)[C]//Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016: 191-198. - -
-知识共享许可协议
本教程由 PaddlePaddle 创作,采用 知识共享 署名-非商业性使用-相同方式共享 4.0 国际 许可协议进行许可。 diff --git a/recommender_system/common_utils.py b/recommender_system/common_utils.py deleted file mode 100755 index c20c65286621d701ad58409b539bbe9c813d453a..0000000000000000000000000000000000000000 --- a/recommender_system/common_utils.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from paddle.trainer.PyDataProvider2 import * - - -def meta_to_header(meta, name): - metas = meta[name]['__meta__']['raw_meta'] - for each_meta in metas: - slot_name = each_meta.get('name', '%s_id' % name) - if each_meta['type'] == 'id': - yield slot_name, integer_value(each_meta['max']) - elif each_meta['type'] == 'embedding': - is_seq = each_meta['seq'] == 'sequence' - yield slot_name, integer_value( - len(each_meta['dict']), - seq_type=SequenceType.SEQUENCE - if is_seq else SequenceType.NO_SEQUENCE) - elif each_meta['type'] == 'one_hot_dense': - yield slot_name, dense_vector(len(each_meta['dict'])) diff --git a/recommender_system/data/config.json b/recommender_system/data/config.json deleted file mode 100644 index f26e74ce47bb7843a571e6033f051c046b31f054..0000000000000000000000000000000000000000 --- a/recommender_system/data/config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "user": { - "file": { - "name": "users.dat", - "delimiter": "::" - }, - "fields": ["id", "gender", "age", "occupation"] - }, - "movie": { - "file": { - "name": "movies.dat", - "delimiter": "::" - }, - "fields": ["id", "title", "genres"] - } -} diff --git a/recommender_system/data/config_generator.py b/recommender_system/data/config_generator.py deleted file mode 100644 index 4ca496a252dffc62ed62bb8f2a5ee1661a940580..0000000000000000000000000000000000000000 --- a/recommender_system/data/config_generator.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -config_generator.py - -Usage: - ./config_generator.py [--output_format=] - ./config_generator.py -h | --help - -Options: - -h --help Show this screen. - --output_format= Output config format (json or yaml) [default: json].
-""" - -import json -import docopt -import copy - -DEFAULT_FILE = {"type": "split", "delimiter": ","} - -DEFAULT_FIELD = { - "id": { - "type": "id" - }, - "gender": { - "name": "gender", - "type": "embedding", - "dict": { - "type": "char_based" - } - }, - "age": { - "name": "age", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": True - } - }, - "occupation": { - "name": "occupation", - "type": "embedding", - "dict": { - "type": "whole_content", - "sort": "true" - } - }, - "title": { - "regex": { - "pattern": r"^(.*)\((\d+)\)$", - "group_id": 1, - "strip": True - }, - "name": "title", - "type": { - "name": "embedding", - "seq_type": "sequence", - }, - "dict": { - "type": "char_based" - } - }, - "genres": { - "type": "one_hot_dense", - "dict": { - "type": "split", - "delimiter": "|" - }, - "name": "genres" - } -} - - -def merge_dict(master_dict, slave_dict): - return dict(((k, master_dict.get(k) or slave_dict.get(k)) - for k in set(slave_dict) | set(master_dict))) - - -def main(filename, fmt): - with open(filename, 'r') as f: - conf = json.load(f) - obj = dict() - for k in conf: - val = conf[k] - file_dict = val['file'] - file_dict = merge_dict(file_dict, DEFAULT_FILE) - - fields = [] - for pos, field_key in enumerate(val['fields']): - assert isinstance(field_key, basestring) - field = copy.deepcopy(DEFAULT_FIELD[field_key]) - field['pos'] = pos - fields.append(field) - obj[k] = {"file": file_dict, "fields": fields} - meta = {"meta": obj} - # print meta - if fmt == 'json': - - def formatter(x): - import json - return json.dumps(x, indent=2) - elif fmt == 'yaml': - - def formatter(x): - import yaml - return yaml.safe_dump(x, default_flow_style=False) - else: - raise NotImplementedError("Dump format %s is not implemented" % fmt) - - print formatter(meta) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version="0.1.0") - main(args[""], args["--output_format"]) diff --git a/recommender_system/data/getdata.sh b/recommender_system/data/getdata.sh deleted file mode 100755 index 2268d876389e0bdf5ead405e74d278d276626f82..0000000000000000000000000000000000000000 --- a/recommender_system/data/getdata.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -ex -cd "$(dirname "$0")" -# download the dataset -wget http://files.grouplens.org/datasets/movielens/ml-1m.zip -# unzip the dataset -unzip ml-1m.zip -# remove the unused zip file -rm ml-1m.zip diff --git a/recommender_system/data/meta_generator.py b/recommender_system/data/meta_generator.py deleted file mode 100644 index 38e4679d266c331a751114cd13f0e3453016cf26..0000000000000000000000000000000000000000 --- a/recommender_system/data/meta_generator.py +++ /dev/null @@ -1,430 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Preprocess Movielens dataset, to get movie/user object. - -Usage: - ./preprocess.py [--config=] - ./preprocess.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --config= Get MetaData config file [default: config.json]. -""" -import docopt -import os -import sys -import re -import collections - -try: - import cPickle as pickle -except ImportError: - import pickle - - -class UniqueIDGenerator(object): - def __init__(self): - self.pool = collections.defaultdict(self.__next_id__) - self.next_id = 0 - - def __next_id__(self): - tmp = self.next_id - self.next_id += 1 - return tmp - - def __call__(self, k): - return self.pool[k] - - def to_list(self): - ret_val = [None] * len(self.pool) - for k in self.pool.keys(): - ret_val[self.pool[k]] = k - return ret_val - - -class SortedIDGenerator(object): - def __init__(self): - self.__key_set__ = set() - self.dict = None - - def scan(self, key): - self.__key_set__.add(key) - - def finish_scan(self, compare=None, key=None, reverse=False): - self.__key_set__ = sorted( - list(self.__key_set__), cmp=compare, key=key, reverse=reverse) - self.dict = dict() - for idx, each_key in enumerate(self.__key_set__): - self.dict[each_key] = idx - - def __call__(self, key): - return self.dict[key] - - def to_list(self): - return self.__key_set__ - - -class SplitFileReader(object): - def __init__(self, work_dir, config): - assert isinstance(config, dict) - self.filename = config['name'] - self.delimiter = config.get('delimiter', ',') - self.work_dir = work_dir - - def read(self): - with open(os.path.join(self.work_dir, self.filename), 'r') as f: - for line in f: - line = line.strip() - if isinstance(self.delimiter, unicode): - self.delimiter = str(self.delimiter) - yield line.split(self.delimiter) - - @staticmethod - def create(work_dir, config): - assert isinstance(config, dict) - if config['type'] == 'split': - return SplitFileReader(work_dir, config) - - -class IFileReader(object): - READERS = [SplitFileReader] - - def read(self): - raise NotImplementedError() - - @staticmethod - def create(work_dir, config): - for reader_cls in IFileReader.READERS: - val = reader_cls.create(work_dir, config) - if val is not None: - return val - - -class IDFieldParser(object): - TYPE = 'id' - - def __init__(self, config): - self.__max_id__ = -sys.maxint - 1 - self.__min_id__ = sys.maxint - self.__id_count__ = 0 - - def scan(self, line): - idx = int(line) - self.__max_id__ = max(self.__max_id__, idx) - self.__min_id__ = min(self.__min_id__, idx) - self.__id_count__ += 1 - - def parse(self, line): - return int(line) - - def meta_field(self): - return { - "is_key": True, - 'max': self.__max_id__, - 'min': self.__min_id__, - 'count': self.__id_count__, - 'type': 'id' - } - - -class SplitEmbeddingDict(object): - def __init__(self, delimiter): - self.__id__ = UniqueIDGenerator() - self.delimiter = delimiter - - def scan(self, multi): - for val in multi.split(self.delimiter): - self.__id__(val) - - def parse(self, multi): - return map(self.__id__, multi.split(self.delimiter)) - - def meta_field(self): - return self.__id__.to_list() - - -class 
EmbeddingFieldParser(object): - TYPE = 'embedding' - - NO_SEQUENCE = "no_sequence" - SEQUENCE = "sequence" - - class CharBasedEmbeddingDict(object): - def __init__(self, is_seq=True): - self.__id__ = UniqueIDGenerator() - self.is_seq = is_seq - - def scan(self, s): - for ch in s: - self.__id__(ch) - - def parse(self, s): - return map(self.__id__, s) if self.is_seq else self.__id__(s[0]) - - def meta_field(self): - return self.__id__.to_list() - - class WholeContentDict(object): - def __init__(self, need_sort=True): - assert need_sort - self.__id__ = SortedIDGenerator() - self.__has_finished__ = False - - def scan(self, txt): - self.__id__.scan(txt) - - def meta_field(self): - if not self.__has_finished__: - self.__id__.finish_scan() - self.__has_finished__ = True - return self.__id__.to_list() - - def parse(self, txt): - return self.__id__(txt) - - def __init__(self, config): - try: - self.seq_type = config['type']['seq_type'] - except TypeError: - self.seq_type = EmbeddingFieldParser.NO_SEQUENCE - - if config['dict']['type'] == 'char_based': - self.dict = EmbeddingFieldParser.CharBasedEmbeddingDict( - self.seq_type == EmbeddingFieldParser.SEQUENCE) - elif config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict'].get('delimiter', ',')) - elif config['dict']['type'] == 'whole_content': - self.dict = EmbeddingFieldParser.WholeContentDict(config['dict'][ - 'sort']) - else: - print config - assert False - - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - return { - 'name': self.name, - 'dict': self.dict.meta_field(), - 'type': 'embedding', - 'seq': self.seq_type - } - - def parse(self, s): - return self.dict.parse(s) - - -class OneHotDenseFieldParser(object): - TYPE = 'one_hot_dense' - - def __init__(self, config): - if config['dict']['type'] == 'split': - self.dict = SplitEmbeddingDict(config['dict']['delimiter']) - self.name = config['name'] - - def scan(self, s): - self.dict.scan(s) - - def meta_field(self): - # print self.dict.meta_field() - return { - 'dict': self.dict.meta_field(), - 'name': self.name, - 'type': 'one_hot_dense' - } - - def parse(self, s): - ids = self.dict.parse(s) - retv = [0.0] * len(self.dict.meta_field()) - for idx in ids: - retv[idx] = 1.0 - # print retv - return retv - - -class FieldParserFactory(object): - PARSERS = [IDFieldParser, EmbeddingFieldParser, OneHotDenseFieldParser] - - @staticmethod - def create(config): - if isinstance(config['type'], basestring): - config_type = config['type'] - elif isinstance(config['type'], dict): - config_type = config['type']['name'] - - assert config_type is not None - - for each_parser_cls in FieldParserFactory.PARSERS: - if config_type == each_parser_cls.TYPE: - return each_parser_cls(config) - print config - - -class CompositeFieldParser(object): - def __init__(self, parser, extractor): - self.extractor = extractor - self.parser = parser - - def scan(self, *args, **kwargs): - self.parser.scan(self.extractor.extract(*args, **kwargs)) - - def parse(self, *args, **kwargs): - return self.parser.parse(self.extractor.extract(*args, **kwargs)) - - def meta_field(self): - return self.parser.meta_field() - - -class PositionContentExtractor(object): - def __init__(self, pos): - self.pos = pos - - def extract(self, line): - assert isinstance(line, list) - return line[self.pos] - - -class RegexPositionContentExtractor(PositionContentExtractor): - def __init__(self, pos, pattern, group_id, strip=True): - PositionContentExtractor.__init__(self, pos) - pattern = 
pattern.strip() - self.pattern = re.compile(pattern) - self.group_id = group_id - self.strip = strip - - def extract(self, line): - line = PositionContentExtractor.extract(self, line) - match = self.pattern.match(line) - # print line, self.pattern.pattern, match - assert match is not None - txt = match.group(self.group_id) - if self.strip: - txt.strip() - return txt - - -class ContentExtractorFactory(object): - def extract(self, line): - pass - - @staticmethod - def create(config): - if 'pos' in config: - if 'regex' not in config: - return PositionContentExtractor(config['pos']) - else: - extra_args = config['regex'] - return RegexPositionContentExtractor( - pos=config['pos'], **extra_args) - - -class MetaFile(object): - def __init__(self, work_dir): - self.work_dir = work_dir - self.obj = dict() - - def parse(self, config): - config = config['meta'] - - ret_obj = dict() - for key in config.keys(): - val = config[key] - assert 'file' in val - reader = IFileReader.create(self.work_dir, val['file']) - assert reader is not None - assert 'fields' in val and isinstance(val['fields'], list) - fields_config = val['fields'] - field_parsers = map(MetaFile.__field_config_mapper__, fields_config) - - for each_parser in field_parsers: - assert each_parser is not None - - for each_block in reader.read(): - for each_parser in field_parsers: - each_parser.scan(each_block) - - metas = map(lambda x: x.meta_field(), field_parsers) - # print metas - key_index = filter( - lambda x: x is not None, - map(lambda (idx, meta): idx if 'is_key' in meta and meta['is_key'] else None, - enumerate(metas)))[0] - - key_map = [] - for i in range(min(key_index, len(metas))): - key_map.append(i) - for i in range(key_index + 1, len(metas)): - key_map.append(i) - - obj = {'__meta__': {'raw_meta': metas, 'feature_map': key_map}} - - for each_block in reader.read(): - idx = field_parsers[key_index].parse(each_block) - val = [] - for i, each_parser in enumerate(field_parsers): - if i != key_index: - val.append(each_parser.parse(each_block)) - obj[idx] = val - ret_obj[key] = obj - self.obj = ret_obj - return ret_obj - - @staticmethod - def __field_config_mapper__(conf): - assert isinstance(conf, dict) - extrator = ContentExtractorFactory.create(conf) - field_parser = FieldParserFactory.create(conf) - assert extrator is not None - assert field_parser is not None - return CompositeFieldParser(field_parser, extrator) - - def dump(self, fp): - pickle.dump(self.obj, fp, pickle.HIGHEST_PROTOCOL) - - -def preprocess(binary_filename, dataset_dir, config, **kwargs): - assert isinstance(config, str) - with open(config, 'r') as config_file: - file_loader = None - if config.lower().endswith('.yaml'): - import yaml - file_loader = yaml - elif config.lower().endswith('.json'): - import json - file_loader = json - config = file_loader.load(config_file) - meta = MetaFile(dataset_dir) - meta.parse(config) - with open(binary_filename, 'wb') as outf: - meta.dump(outf) - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - preprocess(**kwargs) diff --git a/recommender_system/data/requirements.txt b/recommender_system/data/requirements.txt deleted file mode 100644 index 
1ea154584a428b6a389309f1f8def502e0aadfce..0000000000000000000000000000000000000000 --- a/recommender_system/data/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -PyYAML -docopt diff --git a/recommender_system/data/split.py b/recommender_system/data/split.py deleted file mode 100644 index be6869c22f04be1db0f8e9c35c73c851e4c490b0..0000000000000000000000000000000000000000 --- a/recommender_system/data/split.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Separate movielens 1m dataset to train/test file. - -Usage: - ./separate.py [--test_ratio=] [--delimiter=] - ./separate.py -h | --help - -Options: - -h --help Show this screen. - --version Show version. - --test_ratio= Test ratio for separate [default: 0.1]. - --delimiter= File delimiter [default: ,]. -""" -import docopt -import collections -import random - - -def process(test_ratio, input_file, delimiter, **kwargs): - test_ratio = float(test_ratio) - rating_dict = collections.defaultdict(list) - with open(input_file, 'r') as f: - for line in f: - user_id = int(line.split(delimiter)[0]) - rating_dict[user_id].append(line.strip()) - - with open(input_file + ".train", 'w') as train_file: - with open(input_file + ".test", 'w') as test_file: - for k in rating_dict.keys(): - lines = rating_dict[k] - assert isinstance(lines, list) - random.shuffle(lines) - test_len = int(len(lines) * test_ratio) - for line in lines[:test_len]: - print >> test_file, line - - for line in lines[test_len:]: - print >> train_file, line - - -if __name__ == '__main__': - args = docopt.docopt(__doc__, version='0.1.0') - kwargs = dict() - for key in args.keys(): - if key != '--help': - param_name = key - assert isinstance(param_name, str) - param_name = param_name.replace('<', '') - param_name = param_name.replace('>', '') - param_name = param_name.replace('--', '') - kwargs[param_name] = args[key] - process(**kwargs) diff --git a/recommender_system/dataprovider.py b/recommender_system/dataprovider.py deleted file mode 100755 index 54a5ea6fb8e59fa559a394a0c2ec7ac07d89c2f8..0000000000000000000000000000000000000000 --- a/recommender_system/dataprovider.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer.PyDataProvider2 import * -from common_utils import meta_to_header - - -def __list_to_map__(lst): - ret_val = dict() - for each in lst: - k, v = each - ret_val[k] = v - return ret_val - - -def hook(settings, meta, **kwargs): - """ - Init hook is invoked before process data. It will set obj.slots and store - data meta. - - :param obj: global object. It will passed to process routine. - :type obj: object - :param meta: the meta file object, which passed from trainer_config. Meta - file record movie/user features. - :param kwargs: unused other arguments. - """ - - # Header define slots that used for paddle. - # first part is movie features. - # second part is user features. - # final part is rating score. - # header is a list of [USE_SEQ_OR_NOT?, SlotType] - movie_headers = list(meta_to_header(meta, 'movie')) - settings.movie_names = [h[0] for h in movie_headers] - headers = movie_headers - user_headers = list(meta_to_header(meta, 'user')) - settings.user_names = [h[0] for h in user_headers] - headers.extend(user_headers) - headers.append(("rating", dense_vector(1))) # Score - - # slot types. - settings.input_types = __list_to_map__(headers) - settings.meta = meta - - -@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, filename): - with open(filename, 'r') as f: - for line in f: - # Get a rating from file. - user_id, movie_id, score = map(int, line.split('::')[:-1]) - - # Scale score to [-2, +2] - score = float(score - 3) - - # Get movie/user features by movie_id, user_id - movie_meta = settings.meta['movie'][movie_id] - user_meta = settings.meta['user'][user_id] - - outputs = [('movie_id', movie_id - 1)] - - # Then add movie features - for i, each_meta in enumerate(movie_meta): - outputs.append((settings.movie_names[i + 1], each_meta)) - - # Then add user id. - outputs.append(('user_id', user_id - 1)) - - # Then add user features. - for i, each_meta in enumerate(user_meta): - outputs.append((settings.user_names[i + 1], each_meta)) - - # Finally, add score - outputs.append(('rating', [score])) - # Return data to paddle - yield __list_to_map__(outputs) diff --git a/recommender_system/evaluate.py b/recommender_system/evaluate.py deleted file mode 100755 index 3afa7a1e9db5fefb1bbf5aaa174b8168afae4058..0000000000000000000000000000000000000000 --- a/recommender_system/evaluate.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/python -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import re -import math - - -def get_best_pass(log_filename): - with open(log_filename, 'r') as f: - text = f.read() - pattern = re.compile('Test.*? 
cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', - re.S) - results = re.findall(pattern, text) - sorted_results = sorted(results, key=lambda result: float(result[0])) - return sorted_results[0] - - -log_filename = sys.argv[1] -log = get_best_pass(log_filename) -predict_error = math.sqrt(float(log[0])) / 2 -print 'Best pass is %s, error is %s, which means predict get error as %f' % ( - log[1], log[0], predict_error) - -evaluate_pass = "output/pass-%s" % log[1] -print "evaluating from pass %s" % evaluate_pass diff --git a/recommender_system/image/rec_regression_network_en.png b/recommender_system/image/rec_regression_network_en.png deleted file mode 100755 index 33f2833851a6167ed5cc09601b7f948c02016b18..0000000000000000000000000000000000000000 Binary files a/recommender_system/image/rec_regression_network_en.png and /dev/null differ diff --git a/recommender_system/index.en.html b/recommender_system/index.en.html deleted file mode 100644 index b5a4cfd0ac1a87fdf4487956cf48c2a5959e197f..0000000000000000000000000000000000000000 --- a/recommender_system/index.en.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - - - - - - - - - - -
-
diff --git a/recommender_system/index.html b/recommender_system/index.html deleted file mode 100644 index 2bcc0d9ab9992a1c4a865538a963577723cd65f7..0000000000000000000000000000000000000000 --- a/recommender_system/index.html +++ /dev/null @@ -1,444 +0,0 @@
-
- - - - - - - diff --git a/recommender_system/prediction.py b/recommender_system/prediction.py deleted file mode 100755 index 9824d132d276dfe4ff4ea336d8c6b483949b9d08..0000000000000000000000000000000000000000 --- a/recommender_system/prediction.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/env python2 -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from py_paddle import swig_paddle, DataProviderConverter - -from common_utils import * -from paddle.trainer.config_parser import parse_config - -try: - import cPickle as pickle -except ImportError: - import pickle -import sys - -if __name__ == '__main__': - model_path = sys.argv[1] - swig_paddle.initPaddle('--use_gpu=0') - conf = parse_config("trainer_config.py", "is_predict=1") - network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - assert isinstance(network, swig_paddle.GradientMachine) - network.loadParameters(model_path) - with open('./data/meta.bin', 'rb') as f: - meta = pickle.load(f) - headers = [h[1] for h in meta_to_header(meta, 'movie')] - headers.extend([h[1] for h in meta_to_header(meta, 'user')]) - cvt = DataProviderConverter(headers) - while True: - movie_id = int(raw_input("Input movie_id: ")) - user_id = int(raw_input("Input user_id: ")) - movie_meta = meta['movie'][movie_id] # Query Data From Meta. - user_meta = meta['user'][user_id] - data = [movie_id - 1] - data.extend(movie_meta) - data.append(user_id - 1) - data.extend(user_meta) - print "Prediction Score is %.2f" % ( - network.forwardTest(cvt.convert([data]))[0]['value'][0][0] + 3) diff --git a/recommender_system/preprocess.sh b/recommender_system/preprocess.sh deleted file mode 100755 index eeb81ce3cb47e65c0aeb303e7571024ba82dad65..0000000000000000000000000000000000000000 --- a/recommender_system/preprocess.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-set -e - -UNAME_STR=`uname` - -if [[ ${UNAME_STR} == 'Linux' ]]; then - SHUF_PROG='shuf' -else - SHUF_PROG='gshuf' -fi - - -cd "$(dirname "$0")" -delimiter='::' -dir=ml-1m -cd data -echo 'generate meta config file' -python config_generator.py config.json > meta_config.json -echo 'generate meta file' -python meta_generator.py $dir meta.bin --config=meta_config.json -echo 'split train/test file' -python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1 -echo 'shuffle train file' -${SHUF_PROG} $dir/ratings.dat.train > ratings.dat.train -cp $dir/ratings.dat.test . -echo "./data/ratings.dat.train" > train.list -echo "./data/ratings.dat.test" > test.list diff --git a/recommender_system/train.sh b/recommender_system/train.sh deleted file mode 100755 index e341d1cc7a3267bef9db916719b2e4b1981e31bc..0000000000000000000000000000000000000000 --- a/recommender_system/train.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e -paddle train \ - --config=trainer_config.py \ - --save_dir=./output \ - --use_gpu=false \ - --trainer_count=4\ - --test_all_data_in_one_period=true \ - --log_period=100 \ - --dot_period=1 \ - --num_passes=50 2>&1 | tee 'log.txt' diff --git a/recommender_system/trainer_config.py b/recommender_system/trainer_config.py deleted file mode 100755 index c2eeb7b874c7667809a401347f43b873b8dea92a..0000000000000000000000000000000000000000 --- a/recommender_system/trainer_config.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle.trainer_config_helpers import * - -try: - import cPickle as pickle -except ImportError: - import pickle - -is_predict = get_config_arg('is_predict', bool, False) - -META_FILE = 'data/meta.bin' - -with open(META_FILE, 'rb') as f: - # load meta file - meta = pickle.load(f) - -if not is_predict: - define_py_data_sources2( - 'data/train.list', - 'data/test.list', - module='dataprovider', - obj='process', - args={'meta': meta}) - -settings( - batch_size=1600, learning_rate=1e-3, learning_method=RMSPropOptimizer()) - -movie_meta = meta['movie']['__meta__']['raw_meta'] -user_meta = meta['user']['__meta__']['raw_meta'] - -movie_id = data_layer('movie_id', size=movie_meta[0]['max']) -title = data_layer('title', size=len(movie_meta[1]['dict'])) -genres = data_layer('genres', size=len(movie_meta[2]['dict'])) -user_id = data_layer('user_id', size=user_meta[0]['max']) -gender = data_layer('gender', size=len(user_meta[1]['dict'])) -age = data_layer('age', size=len(user_meta[2]['dict'])) -occupation = data_layer('occupation', size=len(user_meta[3]['dict'])) - -embsize = 256 - -# construct movie feature -movie_id_emb = embedding_layer(input=movie_id, size=embsize) -movie_id_hidden = fc_layer(input=movie_id_emb, size=embsize) - -genres_emb = fc_layer(input=genres, size=embsize) - -title_emb = embedding_layer(input=title, size=embsize) -title_hidden = text_conv_pool( - input=title_emb, context_len=5, hidden_size=embsize) - -movie_feature = fc_layer( - input=[movie_id_hidden, title_hidden, genres_emb], size=embsize) - -# construct user feature -user_id_emb = embedding_layer(input=user_id, size=embsize) -user_id_hidden = fc_layer(input=user_id_emb, size=embsize) - -gender_emb = embedding_layer(input=gender, size=embsize) -gender_hidden = fc_layer(input=gender_emb, size=embsize) - -age_emb = embedding_layer(input=age, size=embsize) -age_hidden = fc_layer(input=age_emb, size=embsize) - -occup_emb = embedding_layer(input=occupation, size=embsize) -occup_hidden = fc_layer(input=occup_emb, size=embsize) - -user_feature = fc_layer( - input=[user_id_hidden, gender_hidden, age_hidden, occup_hidden], - size=embsize) - -similarity = cos_sim(a=movie_feature, b=user_feature, scale=2) - -if not is_predict: - lbl = data_layer('rating', size=1) - cost = regression_cost(input=similarity, label=lbl) - outputs(cost) - -else: - outputs(similarity) diff --git a/serve/.gitignore b/serve/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..284f7c45c85dd82bb0f8b32e877ae714d436c354 --- /dev/null +++ b/serve/.gitignore @@ -0,0 +1,3 @@ +*~ +.idea +index.html diff --git a/serve/Dockerfile b/serve/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dfcd94da4b91898d8aa3983c8e0234e663d9a0f5 --- /dev/null +++ b/serve/Dockerfile @@ -0,0 +1,8 @@ +FROM paddlepaddle/paddle + +ENV PARAMETER_TAR_PATH=/data/param.tar \ + TOPOLOGY_FILE_PATH=/data/inference_topology.pkl +ADD requirements.txt /root +ADD main.py /root +RUN pip install -r /root/requirements.txt +CMD ["python", "/root/main.py"] diff --git a/serve/Dockerfile.gpu b/serve/Dockerfile.gpu new file mode 100644 index 0000000000000000000000000000000000000000..7ec79dca05f0d1e0431b39e97fa78cad9165126a --- /dev/null +++ b/serve/Dockerfile.gpu @@ -0,0 +1,8 @@ +FROM paddlepaddle/paddle:latest-gpu + +ENV PARAMETER_TAR_PATH=/data/param.tar \ + TOPOLOGY_FILE_PATH=/data/inference_topology.pkl +ADD requirements.txt /root +ADD main.py /root +RUN pip install -r /root/requirements.txt +CMD ["python", "/root/main.py"] diff --git 
a/serve/README.md b/serve/README.md new file mode 100644 index 0000000000000000000000000000000000000000..05316d1c7d6424a6dd29304e947dee74370de91e --- /dev/null +++ b/serve/README.md @@ -0,0 +1,219 @@ +# Inference Server Example + +The inference server can be used to perform inference on any model trained on +PaddlePaddle. It provides an HTTP endpoint. + +## Run + +The inference server reads a trained model (a topology file and a +parameter file) and serves HTTP requests on port `8000`. Because models +differ in the numbers and types of inputs, **the HTTP API will differ +slightly for each model.** Please see [HTTP API](#http-api) for the +API spec, +and +[here](https://github.com/PaddlePaddle/book/wiki/Using-Pre-trained-Models) for +the request examples of different models that illustrate the +difference. + +We will first show how to obtain the PaddlePaddle model, and then how +to start the server. + +We will use Docker to run the demo. If you are not familiar with +Docker, please check out +this +[TLDR](https://github.com/PaddlePaddle/Paddle/wiki/Docker-for-Beginners). + +### Obtain the PaddlePaddle Model + +A neural network model in PaddlePaddle contains two parts: the +**parameter** and the **topology**. + +A PaddlePaddle training script contains the neural network topology, +which is represented by layers. For example, + +```python +img = paddle.layer.data(name="img", type=paddle.data_type.dense_vector(784)) +hidden = paddle.layer.fc(input=img, size=200) +prediction = paddle.layer.fc(input=hidden, size=10, act=paddle.activation.Softmax()) +``` + +The parameter instance is created by the topology and updated by the +`train` method. + +```python +... +params = paddle.parameters.create(cost) +... +trainer = paddle.trainer.SGD(cost=cost, parameters=params) +... +``` + +PaddlePaddle stores the topology and parameter separately. + +1. To serialize a topology, we need to create a topology instance + explicitly from the outputs of the neural network. Then, invoke the + `serialize_for_inference` method. + + ```python + # Save the inference topology to protobuf. + inference_topology = paddle.topology.Topology(layers=prediction) + with open("inference_topology.pkl", 'wb') as f: + inference_topology.serialize_for_inference(f) + ``` + +2. To save a parameter, we need to invoke the `save_parameter_to_tar` method of + the `trainer`. + + ```python + with open('param.tar', 'w') as f: + trainer.save_parameter_to_tar(f) + ``` + + After serializing the parameter and topology into two files, we can + use them to set up an inference server. + + For a working example, please see [train.py](https://github.com/reyoung/paddle_mnist_v2_demo/blob/master/train.py). + + +### Start the Server + +Make sure the `inference_topology.pkl` and `param.tar` mentioned in +the last section are in your current working directory, and run the +command: + +```bash +docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve +``` + +The above command will mount the current working directory to the +`/data/` directory inside the docker container. The inference server +will load the model topology and parameters that we just created from +there. + +To run the inference server with GPU support, please make sure you have +[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) +installed first, and run: + +```bash +nvidia-docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=1 paddlepaddle/book:serve-gpu +``` + +This command will start a server on port `8000`.
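To verify that the server has come up, you can send it a trivial request and check that it answers at all. The snippet below is a hypothetical smoke test, not part of the server code; it only assumes the port mapping from the `docker run` command above. An empty input will produce an error payload from the server, which is fine for a liveness probe:

```python
import requests

# Any POST to `/` should yield an HTTP response once the server is up.
resp = requests.post("http://localhost:8000/", json={}, timeout=5)
print(resp.status_code, resp.json().get("code"))
```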
+
+After you are done with the demo, you can run `docker stop
+paddle_serve` to stop this docker container.
+
+## HTTP API
+
+The inference server handles HTTP POST requests on path `/`. The
+content type of the request and response is JSON. You need to set the
+`Content-Type` request header to `Content-Type: application/json`.
+
+The request body is a single JSON dictionary whose keys
+are the names of the input data layers. The type of each corresponding
+value is determined by the layer's data type. In most cases the
+value will be a list of floats. For completeness, we list all data
+types below.
+
+There are twelve data types supported by PaddlePaddle:
+
+| | plain | a sequence | a sequence of sequences |
+| --- | --- | --- | ---|
+| dense | [ f, f, f, f, ... ] | [ [f, f, f, ...], [f, f, f, ...]] | [[[f, f, ...], [f, f, ...]], [[f, f, ...], [f, f, ...]], ...] |
+| integer | i | [i, i, ...] | [[i, i, ...], [i, i, ...], ...] |
+| sparse (binary) | [i, i, ...] | [[i, i, ...], [i, i, ...], ...] | [[[i, i, ...], [i, i, ...], ...], [[i, i, ...], [i, i, ...], ...], ...] |
+| sparse (value) | [[i, f], [i, f], ... ] | [[[i, f], [i, f], ... ], ...] | [[[[i, f], [i, f], ... ], ...], ...] |
+
+In the table, `i` stands for an `int` value and `f` stands for a
+`float` value; the binary sparse type carries only the indices of the
+non-zero entries, while the value sparse type carries index-value
+pairs.
+
+Which `data_type` to use is determined by the training
+topology. For example,
+
+* For image data, the input is usually a plain dense vector: we flatten
+  the image into such a vector. The pixel values of the image are usually
+  normalized to `[-1.0, 1.0]` or `[0.0, 1.0]` (depending on the neural
+  network).
+
+  ```text
+  +-------+
+  |243 241|
+  |139 211| +---->[0.95, 0.95, 0.54, 0.82]
+  +-------+
+  ```
+
+* For text data, each word of the text is represented by an
+  integer. The mapping between words and integers is determined by
+  the training process. A sentence is represented by a list of
+  integers.
+
+  ```text
+  I am good .
+      +
+      |
+      v
+  23 942 402 19 +-----> [23, 942, 402, 19]
+  ```
+
+A sample request for a `2x2` image and a sentence could be
+
+```json
+{
+    "img": [
+        0.95,
+        0.95,
+        0.54,
+        0.82
+    ],
+    "sentence": [
+        23,
+        942,
+        402,
+        19
+    ]
+}
+```
+
+The response is a JSON object, too. An example of the returned data:
+
+```json
+{
+    "code": 0,
+    "data": [
+        [
+            0.10060056298971176,
+            0.057179879397153854,
+            0.1453431099653244,
+            0.15825574100017548,
+            0.04464773088693619,
+            0.1566203236579895,
+            0.05657859891653061,
+            0.12077419459819794,
+            0.08073269575834274,
+            0.07926714420318604
+        ]
+    ],
+    "message": "success"
+}
+```
+
+Here, `code` and `message` represent the status of the request.
+`data` corresponds to the outputs of the neural network; it could be
+the probability of each class, the IDs of an output sentence, and so
+on.
+
+## MNIST Demo Client
+
+If you have trained a model with [train.py](https://github.com/reyoung/paddle_mnist_v2_demo/blob/master/train.py) and
+started an inference server, you can use this [client](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits/client/client.py) to test that it works correctly.
+
+## Build
+
+We have already prepared the pre-built Docker image
+`paddlepaddle/book:serve`; here are the commands if you want to build
+the Docker images again:
+
+```bash
+docker build -t paddlepaddle/book:serve .
+docker build -t paddlepaddle/book:serve-gpu -f Dockerfile.gpu .
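+
+# The commands above are assumed to be run from the `serve/` directory,
+# where both Dockerfiles live. Afterwards you can verify the images:
+docker images paddlepaddle/book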
+``` diff --git a/serve/main.py b/serve/main.py new file mode 100644 index 0000000000000000000000000000000000000000..ee1de9313793455c12ba457b004527ef11b2b3f6 --- /dev/null +++ b/serve/main.py @@ -0,0 +1,89 @@ +import os +import traceback + +import paddle.v2 as paddle +from flask import Flask, jsonify, request +from flask_cors import CORS +from Queue import Queue +import threading + +tarfn = os.getenv('PARAMETER_TAR_PATH', None) + +if tarfn is None: + raise ValueError( + "please specify parameter tar file path with environment variable PARAMETER_TAR_PATH" + ) + +topology_filepath = os.getenv('TOPOLOGY_FILE_PATH', None) + +if topology_filepath is None: + raise ValueError( + "please specify topology file path with environment variable TOPOLOGY_FILE_PATH" + ) + +with_gpu = os.getenv('WITH_GPU', '0') != '0' +output_field = os.getenv('OUTPUT_FIELD', 'value') +port = int(os.getenv('PORT', '80')) + +app = Flask(__name__) +CORS(app) + + +def errorResp(msg): + return jsonify(code=-1, message=msg) + + +def successResp(data): + return jsonify(code=0, message="success", data=data) + + +sendQ = Queue() + + +@app.route('/', methods=['POST']) +def infer(): + recv_queue = Queue() + sendQ.put((request.json, recv_queue)) + success, resp = recv_queue.get() + if success: + return successResp(resp) + else: + return errorResp(resp) + + +# PaddlePaddle v0.10.0 does not support inference from different +# threads, so we create a single worker thread. +def worker(): + paddle.init(use_gpu=with_gpu) + + fields = filter(lambda x: len(x) != 0, output_field.split(",")) + + with open(tarfn) as param_f, open(topology_filepath) as topo_f: + params = paddle.parameters.Parameters.from_tar(param_f) + inferer = paddle.inference.Inference(parameters=params, fileobj=topo_f) + + while True: + j, recv_queue = sendQ.get() + try: + feeding = {} + d = [] + for i, key in enumerate(j): + d.append(j[key]) + feeding[key] = i + r = inferer.infer([d], feeding=feeding, field=fields) + except: + trace = traceback.format_exc() + recv_queue.put((False, trace)) + continue + if isinstance(r, list): + recv_queue.put((True, [elem.tolist() for elem in r])) + else: + recv_queue.put((True, r.tolist())) + + +if __name__ == '__main__': + t = threading.Thread(target=worker) + t.daemon = True + t.start() + print 'serving on port', port + app.run(host='0.0.0.0', port=port, threaded=True) diff --git a/serve/requirements.txt b/serve/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8efb58a12a9bcc01e6844b036ab249b4b9ff5a79 --- /dev/null +++ b/serve/requirements.txt @@ -0,0 +1,2 @@ +Flask==0.12.2 +Flask-CORS==3.0.3 diff --git a/skip_thought/index.html b/skip_thought/index.html deleted file mode 100644 index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..0000000000000000000000000000000000000000 --- a/skip_thought/index.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/speech_recognition/index.html b/speech_recognition/index.html deleted file mode 100644 index bd5f85aff99e291b01dc091f7a3d4ac622bce4a6..0000000000000000000000000000000000000000 --- a/speech_recognition/index.html +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - - - - - - - - - - - -
-
-
-
-
-
-
-
-
diff --git a/understand_sentiment/README.en.md b/understand_sentiment/README.en.md
deleted file mode 100644
index efdb5f6d36301b237f16a5efc90b1724e6601dcc..0000000000000000000000000000000000000000
--- a/understand_sentiment/README.en.md
+++ /dev/null
@@ -1,493 +0,0 @@
-# Sentiment Analysis
-
-The source code of this section is located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/understand_sentiment). First-time users may refer to the PaddlePaddle [installation guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html).
-
-## Background Introduction
-In natural language processing, sentiment analysis refers to determining the emotional status expressed in a text. The text may be a sentence, a paragraph, or a document. Sentiment analysis can be framed as a binary classification problem (positive/negative or happy/sad) or a three-class problem (positive/neutral/negative). It is widely applicable, for example in online shopping (Amazon, Taobao) and on travel and movie websites, where it can be used to learn from reviews how customers feel about a product. Table 1 shows an example of sentiment analysis of movie reviews:
-
-| Movie Review | Category |
-| -------- | ----- |
-| Best movie of Xiaogang Feng in recent years!| Positive |
-| Pretty bad. Feels like a tv-series from a local TV-channel | Negative |
-| Politically correct version of Taken ... and boring as Heck| Negative|
-|delightful, mesmerizing, and completely unexpected. The plot is nicely designed.|Positive|
-
-

Table 1 Sentiment Analysis in Movie Reviews

-
-In natural language processing, sentiment analysis can be categorized as a **text classification problem**, i.e., categorizing a piece of text into a specific class. It involves two related tasks: text representation and classification. Before deep learning became popular, the mainstream methods for the former included BOW (bag of words) and topic modeling, while the latter included SVM (support vector machine) and LR (logistic regression).
-
-For a piece of text, the BOW model ignores its word order, grammar, and syntax, and regards it as merely a set of words, so the BOW representation does not capture all the information in the text. For example, "this movie is extremely bad" and "boring, dull and empty work" express very similar semantics, yet their BOW representations have low similarity. Conversely, "the movie is bad" and "the movie is not bad" have high BOW similarity, but they express completely opposite semantics.
-
-
-In this chapter, we introduce deep learning models that address these issues in BOW. Our models embed texts into a low-dimensional space and take word order into consideration. They are end-to-end frameworks and achieve large performance improvements over traditional methods \[[1](#Reference)\].
-
-## Model Overview
-The models used in this chapter are CNNs (Convolutional Neural Networks) and RNNs (Recurrent Neural Networks), with some specific extensions.
-
-
-### Convolutional Neural Networks for Texts (CNN)
-Convolutional neural networks are commonly applied to data with a grid-like topology, such as 2-d images and 1-d texts. A CNN combines multiple extracted local features to produce higher-level abstract semantics. Empirically, CNNs are very effective for image and text modeling.
-
-CNN mainly consists of convolution and pooling operations, with various extensions. We briefly describe CNN here with an example \[[1](#Reference)\]. As shown in Figure 1:
-
-
-

-
-Figure 1. CNN for text modeling. -

-
-Assume the length of the sentence is $n$, and the $i$-th word has embedding $x_i\in\mathbb{R}^k$, where $k$ is the embedding dimensionality.
-
-First, we concatenate the words: we take every $h$ consecutive words as a window of length $h$: $x_{i:i+h-1}$. It refers to $x_{i},x_{i+1},\ldots,x_{i+h-1}$, where $i$ is the index of the first word in the window, ranging from $1$ to $n-h+1$: $x_{i:i+h-1}\in\mathbb{R}^{hk}$.
-
-Next, we apply the convolution operation: we apply the kernel $w\in\mathbb{R}^{hk}$ to each window, extracting features $c_i=f(w\cdot x_{i:i+h-1}+b)$,
-where $b\in\mathbb{R}$ is the bias and $f$ is a non-linear activation function such as the $sigmoid$. Applying the kernel to every window ${x_{1:h},x_{2:h+1},\ldots,x_{n-h+1:n}}$ produces a feature map:
-
-$$c=[c_1,c_2,\ldots,c_{n-h+1}], c \in \mathbb{R}^{n-h+1}$$
-
-Next, we apply max pooling over time to obtain a representation $\hat c$ of the whole sentence, which is the maximum element across the feature map:
-
-$$\hat c=max(c)$$
-
-In real applications, we apply multiple CNN kernels to the sentences. This can be implemented efficiently by concatenating the kernels together as a matrix. Also, we can use CNN kernels with different sizes (shown in Figure 1 in different colors).
-
-Finally, the CNN features are concatenated together to produce a fixed-length representation, which can be combined with a softmax layer for the sentiment classification task.
-
-For short texts, the above CNN model can achieve high accuracy \[[1](#Reference)\]. If we want to extract more abstract representations, we may apply a deeper CNN model \[[2](#Reference),[3](#Reference)\].
-
-### Recurrent Neural Network (RNN)
-RNN is an effective model for sequential data. Theoretically, the computational ability of an RNN is Turing-complete \[[4](#Reference)\]. Natural language is a classic example of sequential data, and RNN (especially its variant LSTM \[[5](#Reference)\]) achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS tagging, image captioning, dialog, machine translation, and so forth.
-

-
-Figure 2. An illustration of an unrolled RNN across “time”. -

-As shown in Figure 2, we unroll an RNN: at the $t$-th time step, the network takes the $t$-th input vector and the latent state from the last time step $h_{t-1}$ as inputs, and computes the latent state of the current step. The whole process is repeated until all inputs are consumed. If we regard the RNN as a function $f$, it can be formulated as:
-
-$$h_t=f(x_t,h_{t-1})=\sigma(W_{xh}x_t+W_{hh}h_{t-1}+b_h)$$
-
-where $W_{xh}$ is the input-to-latent weight matrix; $W_{hh}$ is the latent-to-latent matrix; $b_h$ is the latent bias; and $\sigma$ refers to the $sigmoid$ function.
-
-In NLP, words are first represented as one-hot vectors and then mapped to embeddings. The embedded feature goes through the RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of the RNN, e.g., a deep or stacked RNN. Also, the last latent state can be used as a feature for sentence classification.
-
-### Long Short-Term Memory
-For long sequences, training an RNN suffers from gradient vanishing and explosion problems \[[6](#Reference)\]. To solve this problem, Hochreiter and Schmidhuber (1997) proposed LSTM (long short-term memory \[[5](#Reference)\]).
-
-Compared with a simple RNN, the structure of LSTM includes a memory cell $c$, an input gate $i$, a forget gate $f$ and an output gate $o$. These gates and the memory cell largely improve the ability to handle long sequences. We can formulate the LSTM-RNN as a function $F$:
-
-$$ h_t=F(x_t,h_{t-1})$$
-
-$F$ contains the following formulations \[[7](#Reference)\]:
-\begin{align}
-i_t & = \sigma(W_{xi}x_t+W_{hi}h_{t-1}+W_{ci}c_{t-1}+b_i)\\\\
-f_t & = \sigma(W_{xf}x_t+W_{hf}h_{t-1}+W_{cf}c_{t-1}+b_f)\\\\
-c_t & = f_t\odot c_{t-1}+i_t\odot tanh(W_{xc}x_t+W_{hc}h_{t-1}+b_c)\\\\
-o_t & = \sigma(W_{xo}x_t+W_{ho}h_{t-1}+W_{co}c_{t}+b_o)\\\\
-h_t & = o_t\odot tanh(c_t)\\\\
-\end{align}
-
-In these equations, $i_t, f_t, c_t, o_t$ stand for the input gate, forget gate, memory cell and output gate, respectively; $W$ and $b$ are model parameters; $tanh$ is the hyperbolic tangent; and $\odot$ denotes an element-wise product. The input gate controls the magnitude of new input into the memory cell $c$; the forget gate controls the memory propagated from the last time step; the output gate controls the output magnitude. The three gates are computed similarly but with different parameters, and they each influence the memory cell $c$ separately, as shown in Figure 3.
-
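-
-As a concrete illustration of these equations, the following minimal numpy sketch computes a few LSTM steps (Figure 3 below illustrates the gates at time step $t$). The shapes, random initialization, and the diagonal (element-wise) treatment of the peephole weights $W_{ci}, W_{cf}, W_{co}$ are assumptions for demonstration only.
-
-```python
-import numpy as np
-
-def sigmoid(z):
-    return 1.0 / (1.0 + np.exp(-z))
-
-def lstm_step(x_t, h_prev, c_prev, p):
-    # The input, forget and output gates peek at the cell state through
-    # element-wise (diagonal) peephole weights w_ci, w_cf, w_co.
-    i = sigmoid(np.dot(p['W_xi'], x_t) + np.dot(p['W_hi'], h_prev) + p['w_ci'] * c_prev + p['b_i'])
-    f = sigmoid(np.dot(p['W_xf'], x_t) + np.dot(p['W_hf'], h_prev) + p['w_cf'] * c_prev + p['b_f'])
-    c = f * c_prev + i * np.tanh(np.dot(p['W_xc'], x_t) + np.dot(p['W_hc'], h_prev) + p['b_c'])
-    o = sigmoid(np.dot(p['W_xo'], x_t) + np.dot(p['W_ho'], h_prev) + p['w_co'] * c + p['b_o'])
-    h = o * np.tanh(c)
-    return h, c
-
-# Toy dimensions and random parameters, for illustration only.
-rng = np.random.RandomState(0)
-k, d = 8, 16  # input (embedding) size and hidden size
-p = {n: 0.1 * rng.randn(d, k) for n in ['W_xi', 'W_xf', 'W_xc', 'W_xo']}
-p.update({n: 0.1 * rng.randn(d, d) for n in ['W_hi', 'W_hf', 'W_hc', 'W_ho']})
-p.update({n: 0.1 * rng.randn(d) for n in ['w_ci', 'w_cf', 'w_co']})
-p.update({n: np.zeros(d) for n in ['b_i', 'b_f', 'b_c', 'b_o']})
-
-h, c = np.zeros(d), np.zeros(d)
-for t in range(5):  # run five steps on random "word embeddings"
-    h, c = lstm_step(rng.randn(k), h, c, p)
-```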

-
-Figure 3. LSTM at time step $t$ [7]. -

-
-LSTM enhances the ability to model long-term dependence with the help of its memory cell and gates. A similar structure with a simpler design is proposed in the Gated Recurrent Unit (GRU) \[[8](#Reference)\]. **These structures are still similar to a plain RNN (as shown in Figure 2): the latent state depends on the input as well as the latent state of the last time step, and the process goes on recurrently until all inputs are consumed:**
-
-$$ h_t=Recurrent(x_t,h_{t-1})$$
-where $Recurrent$ can be a simple RNN, a GRU or an LSTM.
-
-### Stacked Bidirectional LSTM
-For a vanilla LSTM, $h_t$ contains input information only from the preceding context $1, \ldots, t-1$. We can also apply an RNN in the reverse direction to take the succeeding context $t+1, \ldots, n$ into consideration. Combined with deep RNNs (a deeper RNN can capture more abstract, higher-level semantics), we can design deep stacked bidirectional LSTM structures to model sequential data \[[9](#Reference)\].
-
-As shown in Figure 4 (a 3-layer RNN), the odd and even layers are forward and reverse LSTMs, respectively. Each higher LSTM layer takes the lower LSTM layers as input, and the top-layer LSTM produces a fixed-length vector by max pooling over time (this representation considers the contexts of both preceding and succeeding words for higher-level abstractions). Finally, we feed the concatenated output to a softmax layer for classification.
-
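-
-The sketch below illustrates only this stacking pattern (Figure 4 below shows the full structure); a plain $tanh$ recurrence stands in for the LSTM cell to keep it short, and the dimensions and parameters are assumptions for demonstration.
-
-```python
-import numpy as np
-
-rng = np.random.RandomState(1)
-d = 16  # hidden size (also used as the embedding size here)
-
-def recurrent_layer(xs, W_x, W_h, b, reverse=False):
-    # One recurrent layer; a plain tanh cell stands in for the LSTM.
-    h, out = np.zeros(d), []
-    for x in (xs[::-1] if reverse else xs):
-        h = np.tanh(np.dot(W_x, x) + np.dot(W_h, h) + b)
-        out.append(h)
-    return out[::-1] if reverse else out  # keep outputs in sentence order
-
-xs = [rng.randn(d) for _ in range(6)]  # an embedded six-word sentence
-for layer in range(1, 4):  # stacked_num = 3, cf. Figure 4
-    W_x, W_h = 0.1 * rng.randn(d, d), 0.1 * rng.randn(d, d)
-    # Odd layers run forward, even layers run in reverse.
-    xs = recurrent_layer(xs, W_x, W_h, np.zeros(d), reverse=(layer % 2 == 0))
-sentence_vec = np.max(np.array(xs), axis=0)  # max pooling over time
-```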

-
-Figure 4. Stacked Bidirectional LSTM for NLP modeling. -

-
-## Data Preparation
-### Data Introduction and Download
-We take the [IMDB sentiment analysis dataset](http://ai.stanford.edu/%7Eamaas/data/sentiment/) as an example. The IMDB dataset contains a training set and a testing set, each with 25000 movie reviews rated on a 1-10 scale. Negative reviews are those with a score <= 4, while positive reviews are those with a score >= 7. You may use the following script to download the IMDB dataset and the [Moses](http://www.statmt.org/moses/) toolbox:
-
-
-```bash
-./data/get_imdb.sh
-```
-If successful, you should see the directory ```data``` with the following files:
-
-```
-aclImdb get_imdb.sh imdb mosesdecoder-master
-```
-
-* aclImdb: the original data downloaded from the website;
-* imdb: containing only the training and testing data;
-* mosesdecoder-master: the Moses tool.
-
-### Data Preprocessing
-We use the script `preprocess.py` to preprocess the data. It calls `tokenizer.perl` in the Moses toolbox to split words and punctuation, randomly shuffles the training set, and constructs the dictionary. Notice: we only use the labeled training and testing sets. Executing the following commands will preprocess the data:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-```
-
-If it runs successfully, `./data/pre-imdb` will contain:
-
-```
-dict.txt labels.list test.list test_part_000 train.list train_part_000
-```
-
-* test\_part\_000 and train\_part\_000: all the labeled testing and training data, with the training data shuffled.
-* train.list and test.list: the training and testing file lists (containing lists of file names).
-* dict.txt: the dictionary generated from the training set.
-* labels.list: the class labels; 0 stands for negative and 1 for positive.
-
-### Data Provider for PaddlePaddle
-PaddlePaddle can read Python-style scripts for configuration. The following `dataprovider.py` provides a detailed example, consisting of two parts:
-
-* hook: defines the text information and the class ID. Texts are defined as `integer_value_sequence` while class IDs are defined as `integer_value`.
-* process: reads the file line by line for the label and the text, split by `'\t\t'`, and yields the data as a generator.
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-
-def hook(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {
-        'word': integer_value_sequence(len(settings.word_dict)),
-        'label': integer_value(2)
-    }
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-@provider(init_hook=hook)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line_count, line in enumerate(fdata):
-            label, comment = line.strip().split('\t\t')
-            label = int(label)
-            words = comment.split()
-            word_slot = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            yield {
-                'word': word_slot,
-                'label': label
-            }
-```
-
-## Model Setup
-`trainer_config.py` is an example of a model configuration file.
-### Data Definition
-```python
-from os.path import join as join_path
-from paddle.trainer_config_helpers import *
-# whether this config is used in "test" mode
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used in "predict" mode
-is_predict = get_config_arg('is_predict', bool, False)
-
-# Data path
-data_dir = "./data/pre-imdb"
-# File names
-train_list = "train.list"
-test_list = "test.list"
-dict_file = "dict.txt"
-
-# Dictionary size
-dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines())
-# Number of classes
-class_dim = len(open(join_path(data_dir, 'labels.list')).readlines())
-
-if not is_predict:
-    train_list = join_path(data_dir, train_list)
-    test_list = join_path(data_dir, test_list)
-    dict_file = join_path(data_dir, dict_file)
-    train_list = train_list if not is_test else None
-    # construct the dictionary
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(open(dict_file, 'r')):
-            word_dict[line.split('\t')[0]] = i
-    # Call define_py_data_sources2 so that dataprovider.py extracts the features
-    define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",  # the function that generates the data
-        args={'dictionary': word_dict})  # extra arguments; here, the dictionary
-```
-
-### Algorithm Setup
-
-```python
-settings(
-    batch_size=128,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-```
-
-* Set the batch size to 128;
-* Set the global learning rate;
-* Apply the Adam algorithm for optimization;
-* Set up L2 regularization;
-* Set up the gradient clipping threshold.
-
-### Model Structure
-We use PaddlePaddle to implement two text classification algorithms, based on the models introduced above: [Text-CNN](#convolutional-neural-networks-for-texts-cnn) and [Stacked Bidirectional LSTM](#stacked-bidirectional-lstm).
-#### Implementation of Text CNN
-```python
-def convolution_net(input_dim,
-                    class_dim=2,
-                    emb_dim=128,
-                    hid_dim=128,
-                    is_predict=False):
-    # network input: ids denoting the word sequence, with dictionary size input_dim
-    data = data_layer("word", input_dim)
-    # embed the one-hot ids into the embedding space
-    emb = embedding_layer(input=data, size=emb_dim)
-    # convolution and max-pooling operation, with a convolution window of 3 words
-    conv_3 = sequence_conv_pool(input=emb, context_len=3, hidden_size=hid_dim)
-    # convolution and max-pooling operation, with a convolution window of 4 words
-    conv_4 = sequence_conv_pool(input=emb, context_len=4, hidden_size=hid_dim)
-    # concatenate conv_3 and conv_4 as input of the softmax classification layer, with class_dim classes
-    output = fc_layer(
-        input=[conv_3, conv_4], size=class_dim, act=SoftmaxActivation())
-
-    if not is_predict:
-        lbl = data_layer("label", 1)  # network input: the class label
-        outputs(classification_cost(input=output, label=lbl))
-    else:
-        outputs(output)
-```
-
-In our implementation, a single [`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) layer performs both the convolution and the pooling operations; the convolution window is set by `context_len` and the output feature dimension by `hidden_size`.
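-
-The following toy numpy sketch, with assumed dimensions and a $tanh$ activation, shows what one convolution kernel plus max pooling over time computes (cf. the formulas in the Model Overview):
-
-```python
-import numpy as np
-
-def conv_max_pool(x, w, b, h):
-    # x: (n, k) embedded sentence; w: (h*k,) one convolution kernel.
-    n, k = x.shape
-    c = np.empty(n - h + 1)
-    for i in range(n - h + 1):
-        window = x[i:i + h].reshape(-1)        # x_{i:i+h-1}, words concatenated
-        c[i] = np.tanh(np.dot(w, window) + b)  # c_i = f(w . x_{i:i+h-1} + b)
-    return c.max()                             # max pooling over time
-
-rng = np.random.RandomState(0)
-x = rng.randn(7, 5)           # a seven-word sentence with 5-d embeddings
-w, b = rng.randn(3 * 5), 0.0  # one kernel with window size h = 3
-feature = conv_max_pool(x, w, b, h=3)
-```
-
-In `sequence_conv_pool`, `hidden_size` such kernels run in parallel, so the layer outputs a `hidden_size`-dimensional vector per sentence.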
-
-#### Implementation of Stacked Bidirectional LSTM
-
-```python
-def stacked_lstm_net(input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
-
-    # the number of stacked LSTM layers, stacked_num, is odd so that the top-layer LSTM is a forward one
-    assert stacked_num % 2 == 1
-    # set up network attributes
-    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-    # set up parameter attributes
-    fc_para_attr = ParameterAttribute(learning_rate=1e-3)
-    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
-    # activation functions
-    relu = ReluActivation()
-    linear = LinearActivation()
-
-
-    # network input: ids denoting the word sequence, with dictionary size input_dim
-    data = data_layer("word", input_dim)
-    # map the word ids into the embedding space
-    emb = embedding_layer(input=data, size=emb_dim)
-
-    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    # LSTM-based RNN
-    lstm1 = lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-
-    # construct the stacked bidirectional LSTM from fc_layer and lstmemory, with depth stacked_num:
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = fc_layer(
-            input=inputs,
-            size=hid_dim,
-            act=linear,
-            param_attr=para_attr,
-            bias_attr=bias_attr)
-        lstm = lstmemory(
-            input=fc,
-            # odd-numbered layers run forward; even-numbered layers run in reverse
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-
-    # apply max pooling along the temporal dimension of the last fc_layer to produce a fixed-length vector
-    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
-    # apply max pooling along the temporal dimension of lstmemory to obtain a fixed-length feature vector
-    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
-    # concatenate fc_last and lstm_last as input of the softmax classification layer, with class_dim classes
-    output = fc_layer(
-        input=[fc_last, lstm_last],
-        size=class_dim,
-        act=SoftmaxActivation(),
-        bias_attr=bias_attr,
-        param_attr=para_attr)
-
-    if is_predict:
-        outputs(output)
-    else:
-        outputs(classification_cost(input=output, label=data_layer('label', 1)))
-```
-
-Our model defined in `trainer_config.py` uses the `stacked_lstm_net` structure by default. If you want to use `convolution_net` instead, comment out the `stacked_lstm_net` call and uncomment `convolution_net`:
-
-```python
-stacked_lstm_net(
-    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
-# convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-## Model Training
-Use the `train.sh` script to run local training:
-
-```
-./train.sh
-```
-
-train.sh is as follows:
-
-```bash
-paddle train --config=trainer_config.py \
---save_dir=./model_output \
---job=train \
---use_gpu=false \
---trainer_count=4 \
---num_passes=10 \
---log_period=20 \
---dot_period=20 \
---show_parameter_stats_period=100 \
---test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
-* \--config=trainer_config.py: sets the model configuration.
-* \--save\_dir=./model_output: sets the output folder in which model parameters are saved.
-* \--job=train: sets the job mode to training.
-* \--use\_gpu=false: uses the CPU for training. If you have installed the GPU version of PaddlePaddle and want to try GPU training, set this to true.
-* \--trainer\_count=4: sets the number of threads (or GPUs).
-* \--num\_passes=10: sets the number of passes. In PaddlePaddle, a pass means one training epoch over all samples.
-* \--log\_period=20: prints a log every 20 batches.
-* \--show\_parameter\_stats\_period=100: prints parameter statistics every 100 batches.
-* \--test\_all_data\_in\_one\_period=1: tests on all of the test data in every testing period.
-
-If it runs successfully, the output log will be saved in `train.log` and the model parameters will be saved in the directory `model_output/`. The output log looks like the following:
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-
-* Batch=xx: xx batches have been trained so far.
-* samples=xx: xx samples have been processed during training.
-* AvgCost=xx: the average loss from the 0th batch to the current batch.
-* CurrentCost=xx: the average loss of the latest log\_period batches.
-* Eval: classification\_error\_evaluator=xx: the average classification error from the 0th batch to the current batch.
-* CurrentEval: classification\_error\_evaluator: the classification error of the latest log\_period batches.
-* Pass=0: a run over all the data in the training set is called a pass. Pass "0" denotes the first pass.
-
-
-## Applying the Model
-### Testing
-
-Testing refers to using a trained model to evaluate a labeled dataset.
-
-```
-./test.sh
-```
-
-The testing script `test.sh` is as follows, where the function `get_best_pass` sorts the passes by classification error to obtain the best model:
-
-```bash
-function get_best_pass() {
-  cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort -n | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
---model_list=$model_list \
---job=test \
---use_gpu=false \
---trainer_count=4 \
---config_args=is_test=1 \
-2>&1 | tee 'test.log'
-```
-
-Different from training, testing requires specifying `--job=test` and the model path `--model_list=$model_list`. If successful, the log will be saved in `test.log`. In our test, the best model is `model_output/pass-00002`, with a classification error rate of 0.115645:
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-### Prediction
-The `predict.py` script provides a prediction API. To predict unlabeled IMDB data, run:
-
-```
-./predict.sh
-```
-predict.sh is as follows (make sure the default model path `model_output/pass-00002` exists, or modify it accordingly):
-
-```bash
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
---tconf=$config \
---model=$model \
---label=$label \
---dict=./data/pre-imdb/dict.txt \
---batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample for prediction.
-* `predict.py` : the prediction script.
-* `--tconf=$config` : sets the network configuration.
-* `--model=$model` : sets the model path.
-* `--label=$label` : sets the label dictionary, mapping integer IDs to string labels.
-* `--dict=data/pre-imdb/dict.txt` : sets the dictionary file.
-* `--batch_size=1` : sets the batch size to 1 during prediction.
-
-
-The prediction result of our example:
-
-```
-Loading parameters from model_output/pass-00002/
-predicting label is pos
-```
-
-The file `10007_10.txt` is in the folder `./data/aclImdb/test/pos`, so its true label is positive; the predicted label is also pos, so the prediction is correct.
-## Summary
-In this chapter, we used sentiment analysis as an example to introduce applying end-to-end deep learning models to short-text classification, and showed how to implement the models with PaddlePaddle. Meanwhile, we briefly introduced two models for text processing: CNN and RNN. In the following chapters, we will see how these models can be applied to other tasks.
-## Reference
-1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
-2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
-3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J]. arXiv preprint arXiv:1612.08083, 2016.
-4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449.
-5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780.
-6. Bengio Y, Simard P, Frasconi P. [Learning long-term dependencies with gradient descent is difficult](http://www-dsi.ing.unifi.it/~paolo/ps/tnn-94-gradient.pdf)[J]. IEEE transactions on neural networks, 1994, 5(2): 157-166.
-7. Graves A. [Generating sequences with recurrent neural networks](http://arxiv.org/pdf/1308.0850)[J]. arXiv preprint arXiv:1308.0850, 2013.
-8. Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://arxiv.org/pdf/1406.1078)[J]. arXiv preprint arXiv:1406.1078, 2014.
-9. Zhou J, Xu W. [End-to-end learning of semantic role labeling using recurrent neural networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf)[C]//Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
-
-
-Creative Commons License
-This tutorial is contributed by PaddlePaddle, and licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
diff --git a/understand_sentiment/data/get_imdb.sh b/understand_sentiment/data/get_imdb.sh
deleted file mode 100755
index 7600af6fbb900ee845702f1297779c1f0ed9bf84..0000000000000000000000000000000000000000
--- a/understand_sentiment/data/get_imdb.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-set -x
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-#download the dataset
-echo "Downloading aclImdb..."
-#http://ai.stanford.edu/%7Eamaas/data/sentiment/
-wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
-
-echo "Downloading mosesdecoder..."
-#https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
-
-#extract package
-echo "Unzipping..."
-tar -zxvf aclImdb_v1.tar.gz
-unzip master.zip
-
-#move train and test set to imdb_data directory
-#in order to process them when training
-mkdir -p imdb/train
-mkdir -p imdb/test
-
-cp -r aclImdb/train/pos/ imdb/train/pos
-cp -r aclImdb/train/neg/ imdb/train/neg
-
-cp -r aclImdb/test/pos/ imdb/test/pos
-cp -r aclImdb/test/neg/ imdb/test/neg
-
-#remove compressed package
-rm aclImdb_v1.tar.gz
-rm master.zip
-
-echo "Done."
diff --git a/understand_sentiment/dataprovider.py b/understand_sentiment/dataprovider.py
deleted file mode 100755
index 976351ab7015fe136d270e97b0c767ac7fc63112..0000000000000000000000000000000000000000
--- a/understand_sentiment/dataprovider.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer.PyDataProvider2 import * - - -def hook(settings, dictionary, **kwargs): - settings.word_dict = dictionary - settings.input_types = { - 'word': integer_value_sequence(len(settings.word_dict)), - 'label': integer_value(2) - } - settings.logger.info('dict len : %d' % (len(settings.word_dict))) - - -@provider(init_hook=hook) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line_count, line in enumerate(fdata): - label, comment = line.strip().split('\t\t') - label = int(label) - words = comment.split() - word_slot = [ - settings.word_dict[w] for w in words if w in settings.word_dict - ] - yield {'word': word_slot, 'label': label} diff --git a/understand_sentiment/image/text_cnn.png b/understand_sentiment/image/text_cnn.png deleted file mode 100644 index d548fc8b98ec5a41d0dec6cf65f85fb70f366d41..0000000000000000000000000000000000000000 Binary files a/understand_sentiment/image/text_cnn.png and /dev/null differ diff --git a/understand_sentiment/image/text_cnn_en.png b/understand_sentiment/image/text_cnn_en.png deleted file mode 100755 index 5601a902af7ed86bbea96fe329ab56b160992b2d..0000000000000000000000000000000000000000 Binary files a/understand_sentiment/image/text_cnn_en.png and /dev/null differ diff --git a/understand_sentiment/index.en.html b/understand_sentiment/index.en.html deleted file mode 100644 index 43f14c10bcf4c77a71ea337940eb89e899daa916..0000000000000000000000000000000000000000 --- a/understand_sentiment/index.en.html +++ /dev/null @@ -1,555 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/understand_sentiment/index.html b/understand_sentiment/index.html deleted file mode 100644 index e1fc47435f3f745359cd9134b29bf2203b9b6476..0000000000000000000000000000000000000000 --- a/understand_sentiment/index.html +++ /dev/null @@ -1,538 +0,0 @@ - - - - - - - - - - - - - - - - -
-
- - - - - - - diff --git a/understand_sentiment/predict.py b/understand_sentiment/predict.py deleted file mode 100755 index 8ec490f64691924013200a3d0038d39aa834b038..0000000000000000000000000000000000000000 --- a/understand_sentiment/predict.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os, sys -import numpy as np -from optparse import OptionParser -from py_paddle import swig_paddle, DataProviderConverter -from paddle.trainer.PyDataProvider2 import integer_value_sequence -from paddle.trainer.config_parser import parse_config -""" -Usage: run following command to show help message. - python predict.py -h -""" - - -class SentimentPrediction(): - def __init__(self, train_conf, dict_file, model_dir=None, label_file=None): - """ - train_conf: trainer configure. - dict_file: word dictionary file name. - model_dir: directory of model. - """ - self.train_conf = train_conf - self.dict_file = dict_file - self.word_dict = {} - self.dict_dim = self.load_dict() - self.model_dir = model_dir - if model_dir is None: - self.model_dir = os.path.dirname(train_conf) - - self.label = None - if label_file is not None: - self.load_label(label_file) - - conf = parse_config(train_conf, "is_predict=1") - self.network = swig_paddle.GradientMachine.createFromConfigProto( - conf.model_config) - self.network.loadParameters(self.model_dir) - input_types = [integer_value_sequence(self.dict_dim)] - self.converter = DataProviderConverter(input_types) - - def load_dict(self): - """ - Load dictionary from self.dict_file. - """ - for line_count, line in enumerate(open(self.dict_file, 'r')): - self.word_dict[line.strip().split('\t')[0]] = line_count - return len(self.word_dict) - - def load_label(self, label_file): - """ - Load label. - """ - self.label = {} - for v in open(label_file, 'r'): - self.label[int(v.split('\t')[1])] = v.split('\t')[0] - - def get_index(self, data): - """ - transform word into integer index according to the dictionary. 
- """ - words = data.strip().split() - word_slot = [self.word_dict[w] for w in words if w in self.word_dict] - return word_slot - - def batch_predict(self, data_batch): - input = self.converter(data_batch) - output = self.network.forwardTest(input) - prob = output[0]["value"] - labs = np.argsort(-prob) - for idx, lab in enumerate(labs): - if self.label is None: - print("predicting label is %d" % (lab[0])) - else: - print("predicting label is %s" % (self.label[lab[0]])) - - -def option_parser(): - usage = "python predict.py -n config -w model_dir -d dictionary -i input_file " - parser = OptionParser(usage="usage: %s [options]" % usage) - parser.add_option( - "-n", - "--tconf", - action="store", - dest="train_conf", - help="network config") - parser.add_option( - "-d", - "--dict", - action="store", - dest="dict_file", - help="dictionary file") - parser.add_option( - "-b", - "--label", - action="store", - dest="label", - default=None, - help="dictionary file") - parser.add_option( - "-c", - "--batch_size", - type="int", - action="store", - dest="batch_size", - default=1, - help="the batch size for prediction") - parser.add_option( - "-w", - "--model", - action="store", - dest="model_path", - default=None, - help="model path") - return parser.parse_args() - - -def main(): - options, args = option_parser() - train_conf = options.train_conf - batch_size = options.batch_size - dict_file = options.dict_file - model_path = options.model_path - label = options.label - swig_paddle.initPaddle("--use_gpu=0") - predict = SentimentPrediction(train_conf, dict_file, model_path, label) - - batch = [] - for line in sys.stdin: - batch.append([predict.get_index(line)]) - if len(batch) == batch_size: - predict.batch_predict(batch) - batch = [] - if len(batch) > 0: - predict.batch_predict(batch) - - -if __name__ == '__main__': - main() diff --git a/understand_sentiment/predict.sh b/understand_sentiment/predict.sh deleted file mode 100755 index 20adee8a465ad2b78066dccd9efac2743f583350..0000000000000000000000000000000000000000 --- a/understand_sentiment/predict.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -#Note the default model is pass-00002, you shold make sure the model path -#exists or change the mode path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config \ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 diff --git a/understand_sentiment/preprocess.py b/understand_sentiment/preprocess.py deleted file mode 100755 index 29b3682b747c66574590de5ea70574981cc536bb..0000000000000000000000000000000000000000 --- a/understand_sentiment/preprocess.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import random -import operator -import numpy as np -from subprocess import Popen, PIPE -from os.path import join as join_path -from optparse import OptionParser - -from paddle.utils.preprocess_util import * -""" -Usage: run following command to show help message. - python preprocess.py -h -""" - - -def save_dict(dict, filename, is_reverse=True): - """ - Save dictionary into file. - dict: input dictionary. - filename: output file name, string. - is_reverse: True, descending order by value. - False, ascending order by value. - """ - f = open(filename, 'w') - for k, v in sorted(dict.items(), key=operator.itemgetter(1),\ - reverse=is_reverse): - f.write('%s\t%s\n' % (k, v)) - f.close() - - -def tokenize(sentences): - """ - Use tokenizer.perl to tokenize input sentences. - tokenizer.perl is tool of Moses. - sentences : a list of input sentences. - return: a list of processed text. - """ - dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl' - tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] - assert isinstance(sentences, list) - text = "\n".join(sentences) - tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) - tok_text, _ = tokenizer.communicate(text) - toks = tok_text.split('\n')[:-1] - return toks - - -def read_lines(path): - """ - path: String, file path. - return a list of sequence. - """ - seqs = [] - with open(path, 'r') as f: - for line in f.readlines(): - line = line.strip() - if len(line): - seqs.append(line) - return seqs - - -class SentimentDataSetCreate(): - """ - A class to process data for sentiment analysis task. - """ - - def __init__(self, - data_path, - output_path, - use_okenizer=True, - multi_lines=False): - """ - data_path: string, traing and testing dataset path - output_path: string, output path, store processed dataset - multi_lines: whether a file has multi lines. - In order to shuffle fully, it needs to read all files into - memory, then shuffle them if one file has multi lines. 
- """ - self.output_path = output_path - self.data_path = data_path - - self.train_dir = 'train' - self.test_dir = 'test' - - self.train_list = "train.list" - self.test_list = "test.list" - - self.label_list = "labels.list" - self.classes_num = 0 - - self.batch_size = 50000 - self.batch_dir = 'batches' - - self.dict_file = "dict.txt" - self.dict_with_test = False - self.dict_size = 0 - self.word_count = {} - - self.tokenizer = use_okenizer - self.overwrite = False - - self.multi_lines = multi_lines - - self.train_dir = join_path(data_path, self.train_dir) - self.test_dir = join_path(data_path, self.test_dir) - self.train_list = join_path(output_path, self.train_list) - self.test_list = join_path(output_path, self.test_list) - self.label_list = join_path(output_path, self.label_list) - self.dict_file = join_path(output_path, self.dict_file) - - def data_list(self, path): - """ - create dataset from path - path: data path - return: data list - """ - label_set = get_label_set_from_dir(path) - data = [] - for lab_name in label_set.keys(): - file_paths = list_files(join_path(path, lab_name)) - for p in file_paths: - data.append({"label" : label_set[lab_name],\ - "seq_path": p}) - return data, label_set - - def create_dict(self, data): - """ - create dict for input data. - data: list, [sequence, sequnce, ...] - """ - for seq in data: - for w in seq.strip().lower().split(): - if w not in self.word_count: - self.word_count[w] = 1 - else: - self.word_count[w] += 1 - - def create_dataset(self): - """ - create file batches and dictionary of train data set. - If the self.overwrite is false and train.list already exists in - self.output_path, this function will not create and save file - batches from the data set path. - return: dictionary size, class number. - """ - out_path = self.output_path - if out_path and not os.path.exists(out_path): - os.makedirs(out_path) - - # If self.overwrite is false or self.train_list has existed, - # it will not process dataset. - if not (self.overwrite or not os.path.exists(self.train_list)): - print "%s already exists." % self.train_list - return - - # Preprocess train data. - train_data, train_lab_set = self.data_list(self.train_dir) - print "processing train set..." - file_lists = self.save_data(train_data, "train", self.batch_size, True, - True) - save_list(file_lists, self.train_list) - - # If have test data path, preprocess test data. - if os.path.exists(self.test_dir): - test_data, test_lab_set = self.data_list(self.test_dir) - assert (train_lab_set == test_lab_set) - print "processing test set..." - file_lists = self.save_data(test_data, "test", self.batch_size, - False, self.dict_with_test) - save_list(file_lists, self.test_list) - - # save labels set. - save_dict(train_lab_set, self.label_list, False) - self.classes_num = len(train_lab_set.keys()) - - # save dictionary. - save_dict(self.word_count, self.dict_file, True) - self.dict_size = len(self.word_count) - - def save_data(self, - data, - prefix="", - batch_size=50000, - is_shuffle=False, - build_dict=False): - """ - Create batches for a Dataset object. - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. 
- build_dict: whether to build dictionary for data - - return: list of batch names - """ - if is_shuffle and self.multi_lines: - return self.save_data_multi_lines(data, prefix, batch_size, - build_dict) - - if is_shuffle: - random.shuffle(data) - num_batches = int(math.ceil(len(data) / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, len(data)) - # read a batch of data - label_list, data_list = self.get_data_list(begin, end, data) - if build_dict: - self.create_dict(data_list) - self.save_file(label_list, data_list, batch_name) - batch_names.append(batch_name) - - return batch_names - - def get_data_list(self, begin, end, data): - """ - begin: int, begining index of data. - end: int, ending index of data. - data: a list of {"seq_path": seqquence path, "label": label index} - - return a list of label and a list of sequence. - """ - label_list = [] - data_list = [] - for j in range(begin, end): - seqs = read_lines(data[j]["seq_path"]) - lab = int(data[j]["label"]) - #File may have multiple lines. - for seq in seqs: - data_list.append(seq) - label_list.append(lab) - if self.tokenizer: - data_list = tokenize(data_list) - return label_list, data_list - - def save_data_multi_lines(self, - data, - prefix="", - batch_size=50000, - build_dict=False): - """ - In order to shuffle fully, there is no need to load all data if - each file only contains one sample, it only needs to shuffle list - of file name. But one file contains multi lines, each line is one - sample. It needs to read all data into memory to shuffle fully. - This interface is mainly for data containning multi lines in each - file, which consumes more memory if there is a great mount of data. - - data: the Dataset object to process. - prefix: the prefix of each batch. - batch_size: number of data in each batch. - build_dict: whether to build dictionary for data - - return: list of batch names - """ - assert self.multi_lines - label_list = [] - data_list = [] - - # read all data - label_list, data_list = self.get_data_list(0, len(data), data) - if build_dict: - self.create_dict(data_list) - - length = len(label_list) - perm_list = np.array([i for i in xrange(length)]) - random.shuffle(perm_list) - - num_batches = int(math.ceil(length / float(batch_size))) - batch_names = [] - for i in range(num_batches): - batch_name = join_path(self.output_path, - "%s_part_%03d" % (prefix, i)) - begin = i * batch_size - end = min((i + 1) * batch_size, length) - sub_label = [label_list[perm_list[i]] for i in range(begin, end)] - sub_data = [data_list[perm_list[i]] for i in range(begin, end)] - self.save_file(sub_label, sub_data, batch_name) - batch_names.append(batch_name) - - return batch_names - - def save_file(self, label_list, data_list, filename): - """ - Save data into file. - label_list: a list of int value. - data_list: a list of sequnece. - filename: output file name. 
- """ - f = open(filename, 'w') - print "saving file: %s" % filename - for lab, seq in zip(label_list, data_list): - f.write('%s\t\t%s\n' % (lab, seq)) - f.close() - - -def option_parser(): - parser = OptionParser(usage="usage: python preprcoess.py "\ - "-i data_dir [options]") - parser.add_option( - "-i", - "--data", - action="store", - dest="input", - help="Input data directory.") - parser.add_option( - "-o", - "--output", - action="store", - dest="output", - default=None, - help="Output directory.") - parser.add_option( - "-t", - "--tokenizer", - action="store", - dest="use_tokenizer", - default=True, - help="Whether to use tokenizer.") - parser.add_option("-m", "--multi_lines", action="store", - dest="multi_lines", default=False, - help="If input text files have multi lines and they "\ - "need to be shuffled, you should set -m True,") - return parser.parse_args() - - -def main(): - options, args = option_parser() - data_dir = options.input - output_dir = options.output - use_tokenizer = options.use_tokenizer - multi_lines = options.multi_lines - if output_dir is None: - outname = os.path.basename(options.input) - output_dir = join_path(os.path.dirname(data_dir), 'pre-' + outname) - data_creator = SentimentDataSetCreate(data_dir, output_dir, use_tokenizer, - multi_lines) - data_creator.create_dataset() - - -if __name__ == '__main__': - main() diff --git a/understand_sentiment/test.sh b/understand_sentiment/test.sh deleted file mode 100755 index 8af827c3388c8df88a872bd87d121a4f9631c3ff..0000000000000000000000000000000000000000 --- a/understand_sentiment/test.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ - sort -n | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' diff --git a/understand_sentiment/train.sh b/understand_sentiment/train.sh deleted file mode 100755 index df8d464d557edbc2f538cb492bd29e8d32c77635..0000000000000000000000000000000000000000 --- a/understand_sentiment/train.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -set -e - -paddle train --config=trainer_config.py \ - --save_dir=./model_output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=10 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' diff --git a/understand_sentiment/trainer_config.py b/understand_sentiment/trainer_config.py deleted file mode 100644 index 9b9b98634bda18e4659c9aeaa8eeffcc52c13e1c..0000000000000000000000000000000000000000 --- a/understand_sentiment/trainer_config.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from os.path import join as join_path -from paddle.trainer_config_helpers import * -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) - -data_dir = "./data/pre-imdb" -train_list = "train.list" -test_list = "test.list" -dict_file = "dict.txt" - -dict_dim = len(open(join_path(data_dir, "dict.txt")).readlines()) -class_dim = len(open(join_path(data_dir, 'labels.list')).readlines()) - -if not is_predict: - train_list = join_path(data_dir, train_list) - test_list = join_path(data_dir, test_list) - dict_file = join_path(data_dir, dict_file) - train_list = train_list if not is_test else None - word_dict = dict() - with open(dict_file, 'r') as f: - for i, line in enumerate(open(dict_file, 'r')): - word_dict[line.split('\t')[0]] = i - - define_py_data_sources2( - train_list, - test_list, - module="dataprovider", - obj="process", - args={'dictionary': word_dict}) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - average_window=0.5, - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25) - -#################### Network Config ###################### - - -def convolution_net(input_dim, - class_dim=2, - emb_dim=128, - hid_dim=128, - is_predict=False): - data = data_layer("word", input_dim) - emb = embedding_layer(input=data, size=emb_dim) - conv_3 = sequence_conv_pool(input=emb, context_len=3, hidden_size=hid_dim) - conv_4 = sequence_conv_pool(input=emb, context_len=4, hidden_size=hid_dim) - output = fc_layer( - input=[conv_3, conv_4], size=class_dim, act=SoftmaxActivation()) - - if not is_predict: - lbl = data_layer("label", 1) - outputs(classification_cost(input=output, label=lbl)) - else: - outputs(output) - - -def stacked_lstm_net(input_dim, - 
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3,
-                     is_predict=False):
-    """
-    A wrapper for the sentiment classification task.
-    This network uses a bi-directional recurrent network
-    consisting of three LSTM layers. The configuration follows
-    the paper at the URL below, but uses fewer layers.
-        http://www.aclweb.org/anthology/P15-1109
-
-    input_dim: dimension of the word dictionary.
-    class_dim: number of categories.
-    emb_dim: dimension of the word embedding.
-    hid_dim: dimension of the hidden layer.
-    stacked_num: number of stacked lstm-hidden layers.
-    is_predict: whether the network is used for prediction.
-                Some layers are not needed in the network when predicting.
-    """
-    assert stacked_num % 2 == 1
-
-    layer_attr = ExtraLayerAttribute(drop_rate=0.5)
-    fc_para_attr = ParameterAttribute(learning_rate=1e-3)
-    lstm_para_attr = ParameterAttribute(initial_std=0., learning_rate=1.)
-    para_attr = [fc_para_attr, lstm_para_attr]
-    bias_attr = ParameterAttribute(initial_std=0., l2_rate=0.)
-    relu = ReluActivation()
-    linear = LinearActivation()
-
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-
-    fc1 = fc_layer(input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    lstm1 = lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
-
-    inputs = [fc1, lstm1]
-    for i in range(2, stacked_num + 1):
-        fc = fc_layer(
-            input=inputs,
-            size=hid_dim,
-            act=linear,
-            param_attr=para_attr,
-            bias_attr=bias_attr)
-        lstm = lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
-        inputs = [fc, lstm]
-
-    fc_last = pooling_layer(input=inputs[0], pooling_type=MaxPooling())
-    lstm_last = pooling_layer(input=inputs[1], pooling_type=MaxPooling())
-    output = fc_layer(
-        input=[fc_last, lstm_last],
-        size=class_dim,
-        act=SoftmaxActivation(),
-        bias_attr=bias_attr,
-        param_attr=para_attr)
-
-    if is_predict:
-        outputs(output)
-    else:
-        outputs(classification_cost(input=output, label=data_layer('label', 1)))
-
-
-stacked_lstm_net(
-    dict_dim, class_dim=class_dim, stacked_num=3, is_predict=is_predict)
-# convolution_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
diff --git a/word2vec/README.en.md b/word2vec/README.en.md
deleted file mode 100644
index 654025329250e614d72a110673b4f9054a4f68ce..0000000000000000000000000000000000000000
--- a/word2vec/README.en.md
+++ /dev/null
@@ -1,180 +0,0 @@
-# Word2Vec
-
-This is intended as a reference tutorial. The source code of this tutorial lives on [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/word2vec).
-
-For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](http://www.paddlepaddle.org/doc_cn/build_and_install/index.html).
-
-## Background Introduction
-
-This section introduces the concept of **word embedding**, which is a vector representation of words. Word embeddings are a popular technique in natural language processing and support many Internet services, including search engines, advertising systems, and recommendation systems.
-
-### One-Hot Vectors
-
-Building these services requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words that makes them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as a **one-hot vector**.
-
-For each word, its vector representation has the corresponding entry in the vector as 1, and all other entries as 0.
The length of a one-hot vector matches the size of the dictionary, and each entry in the vector corresponds to the presence (or absence) of a word in the dictionary.
-
-One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: suppose a customer enters the query "Mother's Day", while an ad bids for the keyword "carnations". Because the one-hot vectors of these two words are perpendicular, neither Euclidean distance nor cosine similarity between them would indicate any relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity of each vector: comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which can be learned from large amounts of data through machine learning methods.
-
-Like many machine learning models, word embeddings can represent knowledge in various ways. Another model may project a one-hot vector to an embedding vector of lower dimension, e.g. $embedding(\text{Mother's Day}) = [0.3, 4.2, -1.5, ...]$ and $embedding(\text{carnations}) = [0.2, 5.6, -2.3, ...]$. Mapping one-hot vectors onto an embedded vector space has the potential to bring the embedding vectors of similar words (either semantically or usage-wise) closer to each other, so that the cosine similarity between the vectors for words like "Mother's Day" and "carnations" is no longer zero.
-
-A word embedding model could be a probabilistic model, a co-occurrence matrix model, or a neural network. Before people started using neural networks to generate word embeddings, the traditional method was to compute a co-occurrence matrix $X$ of words. Here, $X$ is a $|V| \times |V|$ matrix, where $X_{ij}$ is the number of co-occurrences of the $i$-th and $j$-th words in the vocabulary $V$ across the whole corpus, and $|V|$ is the size of the vocabulary. By performing matrix decomposition on $X$, e.g. Singular Value Decomposition \[[5](#references)\]
-
-$$X = USV^T$$
-
-the resulting $U$ can be seen as the word embeddings of all the words.
-
-However, this method suffers from several drawbacks:
-1) Since many pairs of words don't co-occur, the co-occurrence matrix is sparse. To achieve good performance of matrix factorization, further treatment of word frequencies is needed;
-2) The matrix is large, frequently on the order of $10^6 \times 10^6$;
-3) We need to manually filter out stop words (like "although", "a", ...), otherwise these frequent words will distort the matrix factorization.
-
-A neural-network-based model does not require storing huge statistics tables over the whole corpus. It obtains the word embeddings by learning from semantic information, and hence avoids the aforementioned problems of the traditional method. In this chapter, we will introduce the details of neural network word embedding models and how to train such models in PaddlePaddle.
-
-## Results Demonstration
-
-In this section, after training the word embedding model, we use the data visualization algorithm $t$-SNE \[[4](#references)\] to draw the word embedding vectors after projecting them onto a two-dimensional space (see the figure below).
From the figure we can see that semantically related words -- *a*, *the*, and *these*, or *big* and *huge* -- are close to each other in the projected space, while unrelated words -- *say* and *business*, or *decision* and *japan* -- are far from each other.
-

-    Figure 1. Two dimension projection of word embeddings
-
-### Cosine Similarity
-
-On the other hand, we know that the cosine similarity between two vectors falls in $[-1,1]$. Specifically, the cosine similarity is 1 when the vectors point in the same direction, 0 when the vectors are perpendicular, and -1 when they point in opposite directions. That is, the cosine similarity between two vectors scales with their relevance, so we can calculate the cosine similarity of two word embedding vectors to represent the relevance of the corresponding words:
-
-```
-please input two words: big huge
-similarity: 0.899180685161
-
-please input two words: from company
-similarity: -0.0997506977351
-```
-
-The above results can be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instructions, see section [Model Application](#model-application).
-
-
-## Model Overview
-
-In this section, we will introduce three word embedding models: the N-gram model, CBOW, and Skip-gram, each of which models the probability of a word given its immediate context.
-
-For the N-gram model, we will first introduce the concept of a language model, and implement it using PaddlePaddle in section [Model Training](#model-training).
-
-The latter two models, which became popular more recently, are neural word embedding models developed by Tomas Mikolov and colleagues at Google \[[3](#references)\]. Despite their apparent simplicity, these models train very well.
-
-### Language Model
-
-Before diving into word embedding models, we will first introduce the concept of a **language model**. A language model builds the joint probability function $P(w_1, ..., w_T)$ of a sentence, where $w_i$ is the $i$-th word in the sentence. The goal is to assign higher probabilities to meaningful sentences and lower probabilities to meaningless constructions.
-
-In general, models that generate the probability of a sequence can be applied to many fields, like machine translation, speech recognition, information retrieval, part-of-speech tagging, and handwriting recognition. Take information retrieval, for example. If you were to search for "how long is a football bame" (where "bame" is a typo for "game"), the search engine would ask whether you meant "how long is a football game" instead. This is because the probability of "how long is a football bame" is very low according to the language model; in addition, among all of the words easily confused with "bame", "game" would build the most probable sentence.
-
-#### Target Probability
-For a language model's target probability $P(w_1, ..., w_T)$, if the words in the sentence were independent, the joint probability of the whole sentence would be the product of each word's probability:
-
-$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t)$$
-
-However, the probability of each word in a sentence typically depends on the words before it, so canonical language models use conditional probabilities in their target probability:
-
-$$P(w_1, ..., w_T) = \prod_{t=1}^TP(w_t | w_1, ... , w_{t-1})$$
-
-
-### N-gram neural model
-
-In computational linguistics, the n-gram is an important method to represent text. An n-gram is a contiguous sequence of n consecutive items within a text. Based on the desired application scenario, each item could be a letter, a syllable, or a word. The N-gram model is also an important method in statistical language modeling. When training language models with n-grams, the first (n-1) words of an n-gram are used to predict the *n*-th word.
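To make this concrete, here is a minimal sketch (ours, not part of the original tutorial) of how a tokenized sentence can be turned into n-gram training samples, where the first n-1 words form the context and the n-th word is the prediction target:

```python
def ngram_samples(words, n=5):
    # Yield ((w_{t-n+1}, ..., w_{t-1}), w_t) pairs from a token list.
    for t in range(n - 1, len(words)):
        yield tuple(words[t - n + 1:t]), words[t]

sentence = "we love working on deep learning".split()
for context, target in ngram_samples(sentence):
    print("%s -> %s" % (context, target))
# ('we', 'love', 'working', 'on') -> deep
# ('love', 'working', 'on', 'deep') -> learning
```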
-
-Yoshua Bengio and colleagues described how to train a word embedding model using a neural network in the well-known 2003 paper *A Neural Probabilistic Language Model* \[[1](#references)\]. The Neural Network Language Model (NNLM) described in the paper learns the language model and the word embeddings simultaneously through a linear transformation and a non-linear hidden connection. That is, after training on a large corpus, the model learns the word embeddings, and then it computes the probability of the whole sentence using those embeddings. This type of language model can overcome the **curse of dimensionality**, i.e. the difficulty of accurately modeling test-time word sequences that were never seen during training. Note that the term *neural network language model* is ill-defined, so we will not use the name NNLM but only refer to it as the *N-gram neural model* in this section.
-
-We have previously described the language model using conditional probabilities, where the probability of the $t$-th word in a sentence depends on all $t-1$ words before it. Furthermore, since words further back have less influence on the current word, and every word within an n-gram is only affected by its previous n-1 words, we have:
-
-$$P(w_1, ..., w_T) = \prod_{t=n}^TP(w_t|w_{t-1}, w_{t-2}, ..., w_{t-n+1})$$
-
-Given some real corpus in which all sentences are meaningful, the n-gram model should maximize the following objective function:
-
-$$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
-
-where $f(w_t, w_{t-1}, ..., w_{t-n+1};\theta)$ represents the conditional probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ is a parameter regularization term.
-
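Before walking through Figure 2 below, the conditional probability $f$ can be made concrete with a small NumPy sketch of the forward computation; all dimensions, initial values, and parameter names here are illustrative assumptions, not values from this tutorial:

```python
import numpy as np

# Illustrative sizes: vocabulary, embedding, hidden layer, n-gram order.
V, emb_dim, hid_dim, n = 1000, 32, 64, 5

C = np.random.randn(V, emb_dim) * 0.01                      # embedding table
theta = np.random.randn((n - 1) * emb_dim, hid_dim) * 0.01  # input-to-hidden
U = np.random.randn(hid_dim, V) * 0.01                      # hidden-to-output
W = np.random.randn((n - 1) * emb_dim, V) * 0.01            # direct connection
b1, b2 = np.zeros(hid_dim), np.zeros(V)

def ngram_probabilities(context_ids):
    """P(w_t | w_{t-n+1}, ..., w_{t-1}) over the whole vocabulary."""
    x = C[context_ids].reshape(-1)        # concatenated context embeddings
    # g = U tanh(theta^T x + b_1) + W x + b_2
    g = np.dot(np.tanh(np.dot(x, theta) + b1), U) + np.dot(x, W) + b2
    e = np.exp(g - g.max())               # numerically stable softmax
    return e / e.sum()

probs = ngram_probabilities([11, 7, 42, 5])   # four context words for n = 5
assert abs(probs.sum() - 1.0) < 1e-6
```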

-    Figure 2. N-gram neural network model
-
-
-Figure 2 shows the N-gram neural network model. From the bottom up, the model has the following components:
-
-   - For each sample, the model takes $w_{t-n+1},...w_{t-1}$ as input, and outputs the probability distribution of the $t$-th word over the $|V|$ words in the dictionary.
-
-   - Every input word $w_{t-n+1},...w_{t-1}$ is first transformed into a word embedding $C(w_{t-n+1}),...C(w_{t-1})$ through a transformation matrix.
-
-   - All the word embeddings are concatenated into a single vector, which is mapped (non-linearly) into a hidden representation for the $t$-th word:
-
-	$$g=U\tanh(\theta^Tx + b_1) + Wx + b_2$$
-
-   where $x$ is the large vector concatenating all the word embeddings that represent the context; $\theta$, $U$, $b_1$, $b_2$ and $W$ are parameters connecting the word embedding layer to the hidden layers; and $g_i$ represents the unnormalized probability that the output word is the $i$-th word in the dictionary.
-
-   - Based on the definition of softmax, normalizing $g$, the probability that the output word is $w_t$ is:
-
-  $$P(w_t | w_1, ..., w_{t-n+1}) = \frac{e^{g_{w_t}}}{\sum_{i=1}^{|V|} e^{g_i}}$$
-
-   - The cost of the entire network is the multi-class cross-entropy, given by the following loss function:
-
-   $$J(\theta) = -\sum_{i=1}^N\sum_{k=1}^{|V|}y_k^{i}\log\big(softmax(g_k^i)\big)$$
-
-   where $y_k^i$ is the true label ($0$ or $1$) for the $k$-th class of the $i$-th sample, and $softmax(g_k^i)$ is the softmax probability for the $k$-th class of the $i$-th sample.
-
-### Continuous Bag-of-Words model (CBOW)
-
-The CBOW model predicts the current word based on the $N$ words both before and after it. When $N=2$, the model is as shown in the figure below:
-

-    Figure 3. CBOW model
-
-Specifically, ignoring the order of words in the sequence, CBOW uses the average of the word embeddings of the context to predict the current word:
-
-$$\text{context} = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
-
-where $x_t$ is the word embedding of the $t$-th word. The classification score vector is $z = U \cdot \text{context}$, the final classification $y$ uses softmax, and the loss function is the multi-class cross-entropy.
-
-### Skip-gram model
-
-The advantage of CBOW is that it smooths over the word embeddings of the context and reduces noise, which makes it very effective on small datasets. Skip-gram uses a word to predict its context, obtaining multiple context words for one given word, so it can be used on larger datasets.
-
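As a small illustration (ours, not from the original tutorial) of how the two models differ in the training samples they consume, with window size $n=2$:

```python
def cbow_pairs(words, n=2):
    # CBOW: the 2n surrounding words (averaged later) predict the middle word.
    for t in range(n, len(words) - n):
        yield words[t - n:t] + words[t + 1:t + n + 1], words[t]

def skipgram_pairs(words, n=2):
    # Skip-gram: the middle word predicts each of its 2n context words.
    for t in range(n, len(words) - n):
        for c in words[t - n:t] + words[t + 1:t + n + 1]:
            yield words[t], c

words = "we love working on deep learning".split()
print(list(cbow_pairs(words))[0])       # (['we', 'love', 'on', 'deep'], 'working')
print(list(skipgram_pairs(words))[:2])  # [('working', 'we'), ('working', 'love')]
```

One CBOW sample averages a whole window into a single prediction, while skip-gram turns the same window into $2n$ separate samples, which is why skip-gram extracts more training signal from large corpora.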

-    Figure 4. Skip-gram model
-
-
-As illustrated in the figure above, the skip-gram model maps the word embedding of the given word to the embeddings of its $2n$ context words ($n$ words before and $n$ words after the given word), and then combines the softmax classification losses of all those $2n$ words.
-
-## Data Preparation
-
-## Model Configuration
-

-    Figure 5. N-gram neural network model in model configuration
-
-
-## Model Training
-
-## Model Application
-
-## Conclusion
-
-This chapter introduced word embeddings, the relationship between language models and word embeddings, and how to train neural networks to learn word embeddings.
-
-In information retrieval, the relevance between a query and the keywords of a document can be computed through the cosine similarity of their word embeddings. In grammar analysis and semantic analysis, previously trained word embeddings can initialize models for better performance. In document classification, clustering word embeddings can group synonyms in the documents. We hope that readers can use word embedding models in their work after reading this chapter.
-
-
-## References
-1. Bengio Y, Ducharme R, Vincent P, et al. [A neural probabilistic language model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)[J]. Journal of Machine Learning Research, 2003, 3(Feb): 1137-1155.
-2. Mikolov T, Kombrink S, Deoras A, et al. [RNNLM - recurrent neural network language modeling toolkit](http://www.fit.vutbr.cz/~imikolov/rnnlm/rnnlm-demo.pdf)[C]//Proc. of the 2011 ASRU Workshop. 2011: 196-201.
-3. Mikolov T, Chen K, Corrado G, et al. [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf)[J]. arXiv preprint arXiv:1301.3781, 2013.
-4. Maaten L, Hinton G. [Visualizing data using t-SNE](https://lvdmaaten.github.io/publications/papers/JMLR_2008.pdf)[J]. Journal of Machine Learning Research, 2008, 9(Nov): 2579-2605.
-5. [Singular value decomposition](https://en.wikipedia.org/wiki/Singular_value_decomposition), Wikipedia.
-
-
Creative Commons License
This tutorial is contributed by PaddlePaddle, and licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
diff --git a/word2vec/index.en.html b/word2vec/index.en.html
deleted file mode 100644
index f20c2b3122645cfc42d0248c11a74df18e1a7d1e..0000000000000000000000000000000000000000
--- a/word2vec/index.en.html
+++ /dev/null
@@ -1,242 +0,0 @@
- - - - - - - diff --git a/word2vec/train.py b/word2vec/train.py deleted file mode 100644 index 15ad6a01cc2230ad1c8a6a44c1d3d828331a0d1d..0000000000000000000000000000000000000000 --- a/word2vec/train.py +++ /dev/null @@ -1,79 +0,0 @@ -import math - -import paddle.v2 as paddle - -embsize = 32 -hiddensize = 256 -N = 5 - - -def wordemb(inlayer): - wordemb = paddle.layer.table_projection( - input=inlayer, - size=embsize, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, )) - return wordemb - - -def main(): - paddle.init(use_gpu=False, trainer_count=1) - word_dict = paddle.dataset.imikolov.build_dict() - dict_size = len(word_dict) - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(dict_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(dict_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(dict_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(dict_size)) - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(dict_size)) - - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - hidden1 = paddle.layer.fc(input=contextemb, - size=hiddensize, - act=paddle.activation.Sigmoid(), - layer_attr=paddle.attr.Extra(drop_rate=0.5), - bias_attr=paddle.attr.Param(learning_rate=2), - param_attr=paddle.attr.Param( - initial_std=1. / math.sqrt(embsize * 8), - learning_rate=1)) - predictword = paddle.layer.fc(input=hidden1, - size=dict_size, - bias_attr=paddle.attr.Param(learning_rate=2), - act=paddle.activation.Softmax()) - - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - result = trainer.test( - paddle.batch( - paddle.dataset.imikolov.test(word_dict, N), 32)) - print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics, - result.metrics) - - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - parameters = paddle.parameters.create(cost) - adam_optimizer = paddle.optimizer.Adam( - learning_rate=3e-3, - regularization=paddle.optimizer.L2Regularization(8e-4)) - trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer) - trainer.train( - paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), - num_passes=30, - event_handler=event_handler) - - -if __name__ == '__main__': - main()
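Once training has produced the embedding table registered above under the parameter name "_proj", the cosine-similarity demo from the README can be reproduced along the following lines; this is an illustrative sketch, and the helper name, example words, and use of the v2 `Parameters.get` accessor are our assumptions:

```python
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def word_similarity(parameters, word_dict, word_a, word_b, embsize=32):
    # Fetch the trained embedding table: one row per dictionary word.
    emb = parameters.get("_proj").reshape(len(word_dict), embsize)
    return cosine_similarity(emb[word_dict[word_a]], emb[word_dict[word_b]])

# e.g., with `parameters` and `word_dict` from main() after training:
#   word_similarity(parameters, word_dict, "big", "huge")     -> high (~0.9)
#   word_similarity(parameters, word_dict, "from", "company") -> near 0
```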