diff --git a/.gitignore b/.gitignore
index fab52d7877497804092cf72d52bd64dc2f4f1747..e7f8501f2c04d0ddb9a27202b3e91d33c47d9de8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,3 @@ pandoc.template
 py_env*
 *.ipynb
 build
-Dockerfile
diff --git a/.tools/build_docker.sh b/.tools/build_docker.sh
index 60474b1c6c56d7f84c8e5ee19132d93dbf19df9c..242f8de16639f6c333d99f2ff4c7e24f86788c98 100755
--- a/.tools/build_docker.sh
+++ b/.tools/build_docker.sh
@@ -64,7 +64,7 @@ RUN pip install -U nltk \
 RUN ${update_mirror_cmd}
     apt-get update && \
     apt-get install -y locales patch && \
-    apt-get -y install gcc curl git && \
+    apt-get -y install gcc curl git vim && \
     apt-get -y clean && \
     localedef -f UTF-8 -i en_US en_US.UTF-8 && \
     pip install --upgrade pip && \
diff --git a/01.fit_a_line/README.cn.md b/01.fit_a_line/README.cn.md
index 30e78a42ddb5c36072485c5dbecd63433dc91065..5aa1b1d5bc524a6d570130471d17733c894f0a0e 100644
--- a/01.fit_a_line/README.cn.md
+++ b/01.fit_a_line/README.cn.md
@@ -126,8 +126,18 @@ y_predict = paddle.layer.fc(input=x,
                                 size=1,
                                 act=paddle.activation.Linear())
 y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.mse_cost(input=y_predict, label=y)
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 ```
+
+### 保存网络拓扑
+
+```python
+# Save the inference topology to protobuf.
+inference_topology = paddle.topology.Topology(layers=y_predict)
+with open("inference_topology.pkl", 'wb') as f:
+    inference_topology.serialize_for_inference(f)
+```
+
 ### 创建参数
 
 ```python
diff --git a/01.fit_a_line/README.md b/01.fit_a_line/README.md
index ce4b5334402c52548791fab636593b613cb74096..363f9d06bd37d14d9865332e540396ce9640600d 100644
--- a/01.fit_a_line/README.md
+++ b/01.fit_a_line/README.md
@@ -4,9 +4,9 @@ Let us begin the tutorial with a classical problem called Linear Regression \[[1
 The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Problem Setup
-Suppose we have a dataset of $n$ real estate properties. These real estate properties will be referred to as *homes* in this chapter for clarity.
+Suppose we have a dataset of $n$ real estate properties. Each real estate property will be referred to as a **home** in this chapter for clarity.
 
-Each home is associated with $d$ attributes. The attributes describe characteristics such the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
+Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
 
 In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely,
 
@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b,  i=1,\
 where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
 
 ## Results Demonstration
-We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of simlilar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the more precise the model predicts, the closer the point is to the dotted line.
+We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the closer the point is to the dotted line, the better the model's prediction.
 <p align="center">
     <img src = "image/predictions_en.png" width=400><br/>
     Figure 1. Predicted Value V.S. Actual Value
@@ -45,7 +45,7 @@ After setting up our model, there are several major steps to go through to train
 1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s.
 2. Feedforward. Evaluate the network output and compute the corresponding loss.
 3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
-4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of repeats is reached.
+4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of epochs is reached.
 
 ## Dataset
 
@@ -60,8 +60,8 @@ import paddle.v2.dataset.uci_housing as uci_housing
 
 We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`.  This module can
 
-1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if not yet, and
-2.  [preprocesses](#preprocessing) the dataset.
+1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if you haven't yet, and
+2.  [preprocess](#preprocessing) the dataset.
 
 ### An Introduction of the Dataset
 
@@ -93,7 +93,7 @@ We define a feature vector of length 13 for each home, where each entry correspo
 Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically.  The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
 
 #### Feature Normalization
-We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* has a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale te values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we substract the mean value from the feature value and divide the result by the width of the original range.
+We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* have a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range.
 
 There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling):
 - A value range that is too large or too small might cause floating number overflow or underflow during computation.
@@ -106,7 +106,7 @@ There are at least three reasons for [Feature Normalization](https://en.wikipedi
 </p>
 
 #### Prepare Training and Test Sets
-We split the dataset in two, one for adjusting the model parameters, namely, for model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
+We split the dataset in two, one for adjusting the model parameters, namely, for training the model, and the other for testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
 
 
 When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process.
@@ -124,7 +124,7 @@ paddle.init(use_gpu=False, trainer_count=1)
 
 ### Model Configuration
 
-Logistic regression is essentially a fully-connected layer with linear activation:
+Linear regression is essentially a fully-connected layer with linear activation:
 
 ```python
 x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
@@ -132,8 +132,19 @@ y_predict = paddle.layer.fc(input=x,
                             size=1,
                             act=paddle.activation.Linear())
 y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.mse_cost(input=y_predict, label=y)
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 ```
+
+### Save Topology
+
+```python
+# Save the inference topology to protobuf.
+inference_topology = paddle.topology.Topology(layers=y_predict)
+with open("inference_topology.pkl", 'wb') as f:
+    inference_topology.serialize_for_inference(f)
+```
+
+
 ### Create Parameters
 
 ```python
@@ -154,7 +165,7 @@ trainer = paddle.trainer.SGD(cost=cost,
 
 PaddlePaddle provides the
 [reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
-for loadinng training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
+for loading the training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
 
 ```python
 feeding={'x': 0, 'y': 1}
@@ -179,7 +190,7 @@ def event_handler(event):
 ```
 
 ```python
-# event_handler to print training and testing info
+# event_handler to plot training and testing info
 from paddle.v2.plot import Ploter
 
 train_title = "Train cost"
diff --git a/01.fit_a_line/index.cn.html b/01.fit_a_line/index.cn.html
index 933e5a4d8bfb53cb50a9675ea907d939203742da..c69a8dc1006cad4e2b051cc25fd2fed8a4e25706 100644
--- a/01.fit_a_line/index.cn.html
+++ b/01.fit_a_line/index.cn.html
@@ -168,8 +168,18 @@ y_predict = paddle.layer.fc(input=x,
                                 size=1,
                                 act=paddle.activation.Linear())
 y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.mse_cost(input=y_predict, label=y)
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 ```
+
+### 保存网络拓扑
+
+```python
+# Save the inference topology to protobuf.
+inference_topology = paddle.topology.Topology(layers=y_predict)
+with open("inference_topology.pkl", 'wb') as f:
+    inference_topology.serialize_for_inference(f)
+```
+
 ### 创建参数
 
 ```python
diff --git a/01.fit_a_line/index.html b/01.fit_a_line/index.html
index 22afb004ad3701c4f7d6b00bb8d638ff029c7edc..28f72cace59bbfa80bebf965527ed44e3853f47d 100644
--- a/01.fit_a_line/index.html
+++ b/01.fit_a_line/index.html
@@ -46,9 +46,9 @@ Let us begin the tutorial with a classical problem called Linear Regression \[[1
 The source code for this tutorial lives on [book/fit_a_line](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line). For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Problem Setup
-Suppose we have a dataset of $n$ real estate properties. These real estate properties will be referred to as *homes* in this chapter for clarity.
+Suppose we have a dataset of $n$ real estate properties. Each real estate property will be referred to as a **home** in this chapter for clarity.
 
-Each home is associated with $d$ attributes. The attributes describe characteristics such the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
+Each home is associated with $d$ attributes. The attributes describe characteristics such as the number of rooms in the home, the number of schools or hospitals in the neighborhood, and the traffic condition nearby.
 
 In our problem setup, the attribute $x_{i,j}$ denotes the $j$th characteristic of the $i$th home. In addition, $y_i$ denotes the price of the $i$th home. Our task is to predict $y_i$ given a set of attributes $\{x_{i,1}, ..., x_{i,d}\}$. We assume that the price of a home is a linear combination of all of its attributes, namely,
 
@@ -57,7 +57,7 @@ $$y_i = \omega_1x_{i,1} + \omega_2x_{i,2} + \ldots + \omega_dx_{i,d} + b,  i=1,\
 where $\vec{\omega}$ and $b$ are the model parameters we want to estimate. Once they are learned, we will be able to predict the price of a home, given the attributes associated with it. We call this model **Linear Regression**. In other words, we want to regress a value against several values linearly. In practice, a linear model is often too simplistic to capture the real relationships between the variables. Yet, because Linear Regression is easy to train and analyze, it has been applied to a large number of real problems. As a result, it is an important topic in many classic Statistical Learning and Machine Learning textbooks \[[2,3,4](#References)\].
 
 ## Results Demonstration
-We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of simlilar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the more precise the model predicts, the closer the point is to the dotted line.
+We first show the result of our model. The dataset [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) is used to train a linear model to predict the home prices in Boston. The figure below shows the predictions the model makes for some home prices. The $X$-axis represents the median value of the prices of similar homes within a bin, while the $Y$-axis represents the home value our linear model predicts. The dotted line represents points where $X=Y$. When reading the diagram, the closer the point is to the dotted line, the better the model's prediction.
 <p align="center">
     <img src = "image/predictions_en.png" width=400><br/>
     Figure 1. Predicted Value V.S. Actual Value
@@ -87,7 +87,7 @@ After setting up our model, there are several major steps to go through to train
 1. Initialize the parameters including the weights $\vec{\omega}$ and the bias $b$. For example, we can set their mean values as $0$s, and their standard deviations as $1$s.
 2. Feedforward. Evaluate the network output and compute the corresponding loss.
 3. [Backpropagate](https://en.wikipedia.org/wiki/Backpropagation) the errors. The errors will be propagated from the output layer back to the input layer, during which the model parameters will be updated with the corresponding errors.
-4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of repeats is reached.
+4. Repeat steps 2~3, until the loss is below a predefined threshold or the maximum number of epochs is reached.
 
 ## Dataset
 
@@ -102,8 +102,8 @@ import paddle.v2.dataset.uci_housing as uci_housing
 
 We encapsulated the [UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing) in our Python module `uci_housing`.  This module can
 
-1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if not yet, and
-2.  [preprocesses](#preprocessing) the dataset.
+1. download the dataset to `~/.cache/paddle/dataset/uci_housing/housing.data`, if you haven't yet, and
+2.  [preprocess](#preprocessing) the dataset.
 
 ### An Introduction of the Dataset
 
@@ -135,7 +135,7 @@ We define a feature vector of length 13 for each home, where each entry correspo
 Note that although a discrete value is also written as numeric values such as 0, 1, or 2, its meaning differs from a continuous value drastically.  The linear difference between two discrete values has no meaning. For example, suppose $0$, $1$, and $2$ are used to represent colors *Red*, *Green*, and *Blue* respectively. Judging from the numeric representation of these colors, *Red* differs more from *Blue* than it does from *Green*. Yet in actuality, it is not true that extent to which the color *Blue* is different from *Red* is greater than the extent to which *Green* is different from *Red*. Therefore, when handling a discrete feature that has $d$ possible values, we usually convert it to $d$ new features where each feature takes a binary value, $0$ or $1$, indicating whether the original value is absent or present. Alternatively, the discrete features can be mapped onto a continuous multi-dimensional vector through an embedding table. For our problem here, because CHAS itself is a binary discrete value, we do not need to do any preprocessing.
 
 #### Feature Normalization
-We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* has a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale te values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we substract the mean value from the feature value and divide the result by the width of the original range.
+We also observe a huge difference among the value ranges of the 13 features (Figure 2). For instance, the values of feature *B* fall in $[0.32, 396.90]$, whereas those of feature *NOX* have a range of $[0.3850, 0.8170]$. An effective optimization would require data normalization. The goal of data normalization is to scale the values of each feature into roughly the same range, perhaps $[-0.5, 0.5]$. Here, we adopt a popular normalization technique where we subtract the mean value from the feature value and divide the result by the width of the original range.
 
 There are at least three reasons for [Feature Normalization](https://en.wikipedia.org/wiki/Feature_scaling) (Feature Scaling):
 - A value range that is too large or too small might cause floating number overflow or underflow during computation.
@@ -148,7 +148,7 @@ There are at least three reasons for [Feature Normalization](https://en.wikipedi
 </p>
 
 #### Prepare Training and Test Sets
-We split the dataset in two, one for adjusting the model parameters, namely, for model training, and the other for model testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict new outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
+We split the dataset in two, one for adjusting the model parameters, namely, for training the model, and the other for testing. The model error on the former is called the **training error**, and the error on the latter is called the **test error**. Our goal in training a model is to find the statistical dependency between the outputs and the inputs, so that we can predict outputs given new inputs. As a result, the test error reflects the performance of the model better than the training error does. We consider two things when deciding the ratio of the training set to the test set: 1) More training data will decrease the variance of the parameter estimation, yielding more reliable models; 2) More test data will decrease the variance of the test error, yielding more reliable test errors. One standard split ratio is $8:2$.
 
 
 When training complex models, we usually have one more split: the validation set. Complex models usually have [Hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_optimization) that need to be set before the training process, such as the number of layers in the network. Because hyperparameters are not part of the model parameters, they cannot be trained using the same loss function. Thus we will try several sets of hyperparameters to train several models and cross-validate them on the validation set to pick the best one; finally, the selected trained model is tested on the test set. Because our model is relatively simple, we will omit this validation process.
@@ -166,7 +166,7 @@ paddle.init(use_gpu=False, trainer_count=1)
 
 ### Model Configuration
 
-Logistic regression is essentially a fully-connected layer with linear activation:
+Linear regression is essentially a fully-connected layer with linear activation:
 
 ```python
 x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
@@ -174,8 +174,19 @@ y_predict = paddle.layer.fc(input=x,
                             size=1,
                             act=paddle.activation.Linear())
 y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.mse_cost(input=y_predict, label=y)
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 ```
+
+### Save Topology
+
+```python
+# Save the inference topology to protobuf.
+inference_topology = paddle.topology.Topology(layers=y_predict)
+with open("inference_topology.pkl", 'wb') as f:
+    inference_topology.serialize_for_inference(f)
+```
+
+
 ### Create Parameters
 
 ```python
@@ -196,7 +207,7 @@ trainer = paddle.trainer.SGD(cost=cost,
 
 PaddlePaddle provides the
 [reader mechanism](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader)
-for loadinng training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
+for loading the training data. A reader may return multiple columns, and we need a Python dictionary to specify the mapping from column index to data layers.
 
 ```python
 feeding={'x': 0, 'y': 1}
@@ -221,7 +232,7 @@ def event_handler(event):
 ```
 
 ```python
-# event_handler to print training and testing info
+# event_handler to plot training and testing info
 from paddle.v2.plot import Ploter
 
 train_title = "Train cost"
diff --git a/01.fit_a_line/train.py b/01.fit_a_line/train.py
index 255180d3c4322e8dd201e96917e288e3ee209d61..79a320fcb1d7fdef53a2254dfdb5d0317227cb2b 100644
--- a/01.fit_a_line/train.py
+++ b/01.fit_a_line/train.py
@@ -1,10 +1,13 @@
+import os
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 
 def main():
     # init
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
 
     # network config
     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
@@ -12,6 +15,11 @@ def main():
     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
     cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 
+    # Save the inference topology to protobuf.
+    inference_topology = paddle.topology.Topology(layers=y_predict)
+    with open("inference_topology.pkl", 'wb') as f:
+        inference_topology.serialize_for_inference(f)
+
     # create parameters
     parameters = paddle.parameters.create(cost)
 
@@ -21,10 +29,6 @@ def main():
     trainer = paddle.trainer.SGD(
         cost=cost, parameters=parameters, update_equation=optimizer)
 
-    # save model proto as file
-    with open("model.proto", "w") as f:
-        f.write(str(trainer.__topology_in_proto__))
-
     feeding = {'x': 0, 'y': 1}
 
     # event_handler to print training and testing info
diff --git a/02.recognize_digits/README.md b/02.recognize_digits/README.md
index 198897fb55c731ad8b4210fa4dc6e36978c6f197..b7836415c507e86eb0b627e894a449092fcb5d85 100644
--- a/02.recognize_digits/README.md
+++ b/02.recognize_digits/README.md
@@ -1,22 +1,20 @@
 # Recognize Digits
 
-The source code for this tutorial is live at [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits). For instructions on getting started with Paddle, please refer to [installation instructions](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code for this tutorial is here:  [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits). For instructions on getting started with Paddle, please refer to [installation instructions](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Introduction
-When one learns to program, the first task is usually to write a program that prints "Hello World!". In Machine Learning or Deep Learning, the equivalent task is to train a model to recognize hand-written digits on the dataset [MNIST](http://yann.lecun.com/exdb/mnist/). Handwriting recognition is a classic image classification problem. The problem is relatively easy and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a $28\times28$ matrix, and the label is one of the digits from $0$ to $9$. All images are normalized, meaning that they are both rescaled and centered.
+When one learns to program, the first task is usually to write a program that prints "Hello World!". In Machine Learning or Deep Learning, an equivalent task is to train a model to recognize hand-written digits using the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. Handwriting recognition is a classic image classification problem. The problem is relatively easy and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a $28\times28$ matrix, and the label is one of the digits from $0$ to $9$. All images are normalized, meaning that they are both rescaled and centered.
 
 <p align="center">
 <img src="image/mnist_example_image.png" width="400"><br/>
 Fig. 1. Examples of MNIST images
 </p>
 
-The MNIST dataset is created from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). The SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students the in U.S. Therefore the SD-3 is cleaner and easier to recognize than the SD-1 dataset. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set (60,000 samples) and test set (10,000 samples), where training set was labeled by 250 different annotators, and it was guaranteed that there wasn't a complete overlap of annotators of training set and test set.
+The MNIST dataset is from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). The SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students. Therefore the SD-3 is cleaner and easier to recognize than the SD-1 dataset. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set of 60,000 samples and test set of 10,000 samples. 250 annotators labeled the training set, and it was guaranteed that there wasn't a complete overlap between the annotators of the training set and those of the test set.
 
-Yann LeCun, one of the founders of Deep Learning, have previously made tremendous contributions to handwritten character recognition and proposed the **Convolutional Neural Network** (CNN), which drastically improved recognition capability for handwritten characters. CNNs are now a critical concept in Deep Learning. From the LeNet proposal by Yann LeCun, to those winning models in ImageNet competitions, such as VGGNet, GoogLeNet, and ResNet (See [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) tutorial), CNNs have achieved a series of impressive results in Image Classification tasks.
+The MNIST dataset has been used for evaluating many image recognition algorithms such as a single layer linear classifier, Multilayer Perceptron (MLP) and Multilayer CNN LeNet\[[1](#references)\], K-Nearest Neighbors (k-NN) \[[2](#references)\], Support Vector Machine (SVM) \[[3](#references)\], Neural Networks \[[4-7](#references)\], Boosting \[[8](#references)\] and preprocessing methods like distortion removal, noise removal, and blurring.  Among these algorithms, the *Convolutional Neural Network* (CNN) has achieved a series of impressive results in Image Classification tasks, including VGGNet, GoogLeNet, and ResNet (See [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) tutorial).
 
-Many algorithms are tested on MNIST. In 1998, LeCun experimented with single layer linear classifier, Multilayer Perceptron (MLP) and Multilayer CNN LeNet. These algorithms quickly reduced test error from 12% to 0.7% \[[1](#references)\]. Since then, researchers have worked on many algorithms such as **K-Nearest Neighbors** (k-NN) \[[2](#references)\], **Support Vector Machine** (SVM) \[[3](#references)\], **Neural Networks** \[[4-7](#references)\] and **Boosting** \[[8](#references)\]. Various preprocessing methods like distortion removal, noise removal, and blurring, have also been applied to increase recognition accuracy.
-
-In this tutorial, we tackle the task of handwritten character recognition. We start with a simple **softmax** regression model and guide our readers step-by-step to improve this model's performance on the task of recognition.
+In this tutorial, we start with a simple **softmax** regression model and go on with MLP and CNN.  Readers will see how these methods improve the recognition accuracy step-by-step.
 
 
 ## Model Overview
@@ -36,7 +34,7 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
 
 where $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $
 
-For an $N$-class classification problem with $N$ output nodes, Softmax normalizes the resulting $N$ dimensional vector so that each of its entries falls in the range $[0,1]\in\math{R}$, representing the probability that the sample belongs to a certain class. Here $y_i$ denotes the predicted probability that an image is of digit $i$.
+For an $N$-class classification problem with $N$ output nodes, Softmax normalizes the resulting $N$ dimensional vector so that each of its entries falls in the range $[0,1]\subset\mathbb{R}$, representing the probability that the sample belongs to a certain class. Here $y_i$ denotes the predicted probability that an image is of digit $i$.
 
 In such a classification problem, we usually use the cross entropy loss function:
 
@@ -76,7 +74,7 @@ Fig. 4. Convolutional layer<br/>
 
 The **convolutional layer** is the core of a Convolutional Neural Network. The parameters in this layer are composed of a set of filters, also called kernels. We could visualize the convolution step in the following fashion: Each kernel slides horizontally and vertically till it covers the whole image. At every window, we compute the dot product of the kernel and the input. Then, we add the bias and apply an activation function. The result is a two-dimensional activation map. For example, some kernel may recognize corners, and some may recognize circles. These convolution kernels may respond strongly to the corresponding features.
 
-Fig. 4 illustrates the dynamic programming of a convolutional layer, where depths are flattened for simplicity. The input is $W_1=5$, $H_1=5$, $D_1=3$. In fact, this is a common representation for colored images. $W_1$ and $H_1$ correspond to the width and height in a colored image. $D_1$ corresponds to the 3 color channels for RGB. The parameters of the convolutional layer are $K=2$, $F=3$, $S=2$, $P=1$. $K$ denotes the number of kernels; specifically, $Filter$ $W_0$ and $Filter$ $W_1$ are the kernels. $F$ is kernel size while $W0$ and $W1$ are both $F\timesF = 3\times3$ matrices in all depths. $S$ is the stride, which is the width of the sliding window; here, kernels move leftwards or downwards by 2 units each time. $P$ is the width of the padding, which denotes an extension of the input; here, the gray area shows zero padding with size 1.
+Fig. 4 illustrates the computation process of a convolutional layer, where depths are flattened for simplicity. The input is $W_1=5$, $H_1=5$, $D_1=3$. In fact, this is a common representation for colored images. $W_1$ and $H_1$ correspond to the width and height in a colored image. $D_1$ corresponds to the three color channels for RGB. The parameters of the convolutional layer are $K=2$, $F=3$, $S=2$, $P=1$. $K$ denotes the number of kernels; specifically, $Filter$ $W_0$ and $Filter$ $W_1$ are the kernels. $F$ is kernel size while $W_0$ and $W_1$ are both $F\times F = 3\times 3$ matrices in all depths. $S$ is the stride, which is the width of the sliding window; here, kernels move leftwards or downwards by two units each time. $P$ is the width of the padding, which denotes an extension of the input; here, the gray area shows zero padding with size 1.
 
 #### Pooling Layer
 
@@ -96,11 +94,11 @@ Fig. 6. LeNet-5 Convolutional Neural Network architecture<br/>
 
 [**LeNet-5**](http://yann.lecun.com/exdb/lenet/) is one of the simplest Convolutional Neural Networks. Fig. 6. shows its architecture: A 2-dimensional input image is fed into two sets of convolutional layers and pooling layers. This output is then fed to a fully connected layer and a softmax classifier. Compared to multilayer, fully connected perceptrons, the LeNet-5 can recognize images better. This is due to the following three properties of the convolution:
 
-- The 3D nature of the neurons: a convolutional layer is organized by width, height and depth. Neurons in each layer are connected to only a small region in the previous layer. This region is called the receptive field.
+- The 3D nature of the neurons: a convolutional layer is organized by width, height, and depth. Neurons in each layer are connected to only a small region in the previous layer. This region is called the receptive field.
 - Local connectivity: A CNN utilizes the local space correlation by connecting local neurons. This design guarantees that the learned filter has a strong response to local input features. Stacking many such layers generates a non-linear filter that is more global. This enables the network to first obtain good representation for small parts of input and then combine them to represent a larger region.
-- Weight sharing: In a CNN, computation is iterated on shared parameters (weights and bias) to form a feature map. This means that all the neurons in the same depth of the output respond to the same feature. This allows the network to detect a feature regardless of its position in the input. In other words, it is shift invariant.
+- Weight sharing: In a CNN, computation is iterated on shared parameters (weights and bias) to form a feature map. This means that all the neurons in the same depth of the output respond to the same feature. This allows the network to detect a feature regardless of its position in the input.
 
-For more details on Convolutional Neural Networks, please refer to the tutorial on [Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) and the [relevant lecture](http://cs231n.github.io/convolutional-networks/) from a Stanford open course.
+For more details on Convolutional Neural Networks, please refer to the tutorial on [Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) and the [relevant lecture](http://cs231n.github.io/convolutional-networks/) from a Stanford course.
 
 ### List of Common Activation Functions
 - Sigmoid activation function: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
@@ -221,11 +219,11 @@ trainer = paddle.trainer.SGD(cost=cost,
                              update_equation=optimizer)
 ```
 
-Then we specify the training data `paddle.dataset.movielens.train()` and testing data `paddle.dataset.movielens.test()`. These two methods are *reader creators*. Once called, a reader creator returns a *reader*.  A reader is a Python method, which, once called, returns a Python generator, which yields instances of data.
+Then we specify the training data `paddle.dataset.mnist.train()` and testing data `paddle.dataset.mnist.test()`. These two methods are *reader creators*. Once called, a reader creator returns a *reader*.  A reader is a Python method, which, once called, returns a Python generator, which yields instances of data.
 
-`shuffle` is a reader decorator. It takes in a reader A as input and returns a new reader B. Under the hood, B calls A to read data in the following fashion: it copies in `buffer_size` instances at a time into a buffer, shuffles the data, and yields the shuffled instances one at a time. A large buffer size would yield very shuffled data.
+`shuffle` is a reader decorator. It takes a reader A as input and returns a new reader B. Under the hood, B calls A to read data in the following fashion: it copies in `buffer_size` instances at a time into a buffer, shuffles the data, and yields the shuffled instances one at a time. A large buffer size would yield very shuffled data.
 
-`batch` is a special decorator, which takes in reader and outputs a *batch reader*, which doesn't yield an instance, but a minibatch at a time.
+`batch` is a special decorator, which takes a reader and outputs a *batch reader*, which doesn't yield an instance, but a minibatch at a time.
 
 `event_handler_plot` is used to plot a figure like below：
 
@@ -263,6 +261,7 @@ def event_handler_plot(event):
 ```python
 lists = []
 
+# event handler to print the progress
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
         if event.batch_id % 100 == 0:
@@ -282,6 +281,7 @@ def event_handler(event):
 ```
 
 ```python
+# Train the model now
 trainer.train(
     reader=paddle.batch(
         paddle.reader.shuffle(
@@ -315,7 +315,7 @@ Usually, with MNIST data, the softmax regression model achieves an accuracy arou
 
 ## Application
 
-After training is done, user can use the trained model to classify images. The following code shows how to inference MNIST images through `paddle.infer` interface.
+After training, users can use the trained model to classify images. The following code shows how to run inference on MNIST images through the `paddle.infer` interface.
 
 ```python
 from PIL import Image
@@ -343,15 +343,15 @@ print "Label of image/infer_3.png is: %d" % lab[0][0]
 
 This tutorial describes a few common deep learning models using **Softmax regression**, **Multilayer Perceptron Network**, and **Convolutional Neural Network**. Understanding these models is crucial for future learning; the subsequent tutorials derive more sophisticated networks by building on top of them.
 
-When our model evolves from a simple softmax regression to a slightly complex Convolutional Neural Network, the recognition accuracy on the MNIST data set achieves a large improvement in accuracy. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve the results of an old one.
+When our model evolves from a simple softmax regression to a slightly complex Convolutional Neural Network, the recognition accuracy on the MNIST dataset achieves a large improvement. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve the results of an old one.
 
-Moreover, this tutorial introduces the basic flow of PaddlePaddle model design, which starts with a *dataprovider*, a model layer construction, and finally training and prediction. Motivated readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.
+Moreover, this tutorial introduces the basic flow of PaddlePaddle model design, which starts with a *data provider*, a model layer construction, and finally training and prediction. Motivated readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.
 
 
 ## References
 
 1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324.
-2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014).
+2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary Symbol Recognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2:753279&dswid=-434) (2014).
 3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190.
 4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003.
 5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007.
diff --git a/02.recognize_digits/client/client.py b/02.recognize_digits/client/client.py
new file mode 100644
index 0000000000000000000000000000000000000000..45b338d6de402100e0ea20bd4361c70dc1bdb80a
--- /dev/null
+++ b/02.recognize_digits/client/client.py
@@ -0,0 +1,21 @@
+import requests
+from PIL import Image
+import numpy as np
+import os
+
+
+def load_image(file):  # Load the image at `file` and return it as a flat float32 vector of 28 * 28 values.
+    im = Image.open(file).convert('L')  # mode 'L': single-channel grayscale
+    im = im.resize((28, 28), Image.LANCZOS)  # LANCZOS is the filter ANTIALIAS aliased; ANTIALIAS was removed in Pillow 10
+    im = np.array(im).astype(np.float32).flatten()  # 2-D pixel grid -> 1-D float32 vector
+    im = im / 255.0  # rescale pixel values; assumes 8-bit input (0..255), giving [0, 1]
+    return im
+
+
+cur_dir = os.path.dirname(os.path.realpath(__file__))  # directory that contains this script
+data = load_image(os.path.join(cur_dir, '..', 'image', 'infer_3.png'))  # sample image shipped next to client/
+data = data.tolist()  # numpy arrays are not JSON serializable; send a plain list
+
+r = requests.post("http://127.0.0.1:8000", json={'img': data})  # loopback; 0.0.0.0 is a bind address and is not connectable on every platform
+
+print(r.text)
diff --git a/02.recognize_digits/index.html b/02.recognize_digits/index.html
index 635e7fa30d6d57e0b8d81c086c9935c74093160b..4de8d78216850512c44aef94a172aba5f15d0203 100644
--- a/02.recognize_digits/index.html
+++ b/02.recognize_digits/index.html
@@ -42,23 +42,21 @@
 <div id="markdown" style='display:none'>
 # Recognize Digits
 
-The source code for this tutorial is live at [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits). For instructions on getting started with Paddle, please refer to [installation instructions](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code for this tutorial is here:  [book/recognize_digits](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits). For instructions on getting started with Paddle, please refer to [installation instructions](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Introduction
-When one learns to program, the first task is usually to write a program that prints "Hello World!". In Machine Learning or Deep Learning, the equivalent task is to train a model to recognize hand-written digits on the dataset [MNIST](http://yann.lecun.com/exdb/mnist/). Handwriting recognition is a classic image classification problem. The problem is relatively easy and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a $28\times28$ matrix, and the label is one of the digits from $0$ to $9$. All images are normalized, meaning that they are both rescaled and centered.
+When one learns to program, the first task is usually to write a program that prints "Hello World!". In Machine Learning or Deep Learning, an equivalent task is to train a model to recognize hand-written digits using the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. Handwriting recognition is a classic image classification problem. The problem is relatively easy and MNIST is a complete dataset. As a simple Computer Vision dataset, MNIST contains images of handwritten digits and their corresponding labels (Fig. 1). The input image is a $28\times28$ matrix, and the label is one of the digits from $0$ to $9$. All images are normalized, meaning that they are both rescaled and centered.
 
 <p align="center">
 <img src="image/mnist_example_image.png" width="400"><br/>
 Fig. 1. Examples of MNIST images
 </p>
 
-The MNIST dataset is created from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). The SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students the in U.S. Therefore the SD-3 is cleaner and easier to recognize than the SD-1 dataset. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set (60,000 samples) and test set (10,000 samples), where training set was labeled by 250 different annotators, and it was guaranteed that there wasn't a complete overlap of annotators of training set and test set.
+The MNIST dataset is from the [NIST](https://www.nist.gov/srd/nist-special-database-19) Special Database 3 (SD-3) and the Special Database 1 (SD-1). The SD-3 is labeled by the staff of the U.S. Census Bureau, while SD-1 is labeled by high school students. Therefore the SD-3 is cleaner and easier to recognize than the SD-1 dataset. Yann LeCun et al. used half of the samples from each of SD-1 and SD-3 to create the MNIST training set of 60,000 samples and test set of 10,000 samples. 250 annotators labeled the training set, and it was guaranteed that the annotators of the training set and the test set did not completely overlap.
 
-Yann LeCun, one of the founders of Deep Learning, have previously made tremendous contributions to handwritten character recognition and proposed the **Convolutional Neural Network** (CNN), which drastically improved recognition capability for handwritten characters. CNNs are now a critical concept in Deep Learning. From the LeNet proposal by Yann LeCun, to those winning models in ImageNet competitions, such as VGGNet, GoogLeNet, and ResNet (See [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) tutorial), CNNs have achieved a series of impressive results in Image Classification tasks.
+The MNIST dataset has been used for evaluating many image recognition algorithms such as a single layer linear classifier, Multilayer Perceptron (MLP) and Multilayer CNN LeNet\[[1](#references)\], K-Nearest Neighbors (k-NN) \[[2](#references)\], Support Vector Machine (SVM) \[[3](#references)\], Neural Networks \[[4-7](#references)\], Boosting \[[8](#references)\] and preprocessing methods like distortion removal, noise removal, and blurring.  Among these algorithms, the *Convolutional Neural Network* (CNN) has achieved a series of impressive results in Image Classification tasks, including VGGNet, GoogLeNet, and ResNet (See [Image Classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification) tutorial).
 
-Many algorithms are tested on MNIST. In 1998, LeCun experimented with single layer linear classifier, Multilayer Perceptron (MLP) and Multilayer CNN LeNet. These algorithms quickly reduced test error from 12% to 0.7% \[[1](#references)\]. Since then, researchers have worked on many algorithms such as **K-Nearest Neighbors** (k-NN) \[[2](#references)\], **Support Vector Machine** (SVM) \[[3](#references)\], **Neural Networks** \[[4-7](#references)\] and **Boosting** \[[8](#references)\]. Various preprocessing methods like distortion removal, noise removal, and blurring, have also been applied to increase recognition accuracy.
-
-In this tutorial, we tackle the task of handwritten character recognition. We start with a simple **softmax** regression model and guide our readers step-by-step to improve this model's performance on the task of recognition.
+In this tutorial, we start with a simple **softmax** regression model and go on with MLP and CNN.  Readers will see how these methods improve the recognition accuracy step-by-step.
 
 
 ## Model Overview
@@ -78,7 +76,7 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
 
 where $ \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} $
 
-For an $N$-class classification problem with $N$ output nodes, Softmax normalizes the resulting $N$ dimensional vector so that each of its entries falls in the range $[0,1]\in\math{R}$, representing the probability that the sample belongs to a certain class. Here $y_i$ denotes the predicted probability that an image is of digit $i$.
+For an $N$-class classification problem with $N$ output nodes, Softmax normalizes the resulting $N$ dimensional vector so that each of its entries falls in the range $[0,1]\subset\mathbb{R}$, representing the probability that the sample belongs to a certain class. Here $y_i$ denotes the predicted probability that an image is of digit $i$.
 
 In such a classification problem, we usually use the cross entropy loss function:
 
@@ -118,7 +116,7 @@ Fig. 4. Convolutional layer<br/>
 
 The **convolutional layer** is the core of a Convolutional Neural Network. The parameters in this layer are composed of a set of filters, also called kernels. We could visualize the convolution step in the following fashion: Each kernel slides horizontally and vertically till it covers the whole image. At every window, we compute the dot product of the kernel and the input. Then, we add the bias and apply an activation function. The result is a two-dimensional activation map. For example, some kernel may recognize corners, and some may recognize circles. These convolution kernels may respond strongly to the corresponding features.
 
-Fig. 4 illustrates the dynamic programming of a convolutional layer, where depths are flattened for simplicity. The input is $W_1=5$, $H_1=5$, $D_1=3$. In fact, this is a common representation for colored images. $W_1$ and $H_1$ correspond to the width and height in a colored image. $D_1$ corresponds to the 3 color channels for RGB. The parameters of the convolutional layer are $K=2$, $F=3$, $S=2$, $P=1$. $K$ denotes the number of kernels; specifically, $Filter$ $W_0$ and $Filter$ $W_1$ are the kernels. $F$ is kernel size while $W0$ and $W1$ are both $F\timesF = 3\times3$ matrices in all depths. $S$ is the stride, which is the width of the sliding window; here, kernels move leftwards or downwards by 2 units each time. $P$ is the width of the padding, which denotes an extension of the input; here, the gray area shows zero padding with size 1.
+Fig. 4 illustrates the computation process of a convolutional layer, where depths are flattened for simplicity. The input is $W_1=5$, $H_1=5$, $D_1=3$. In fact, this is a common representation for colored images. $W_1$ and $H_1$ correspond to the width and height in a colored image. $D_1$ corresponds to the three color channels for RGB. The parameters of the convolutional layer are $K=2$, $F=3$, $S=2$, $P=1$. $K$ denotes the number of kernels; specifically, $Filter$ $W_0$ and $Filter$ $W_1$ are the kernels. $F$ is kernel size while $W_0$ and $W_1$ are both $F\times F = 3\times 3$ matrices in all depths. $S$ is the stride, which is the width of the sliding window; here, kernels move leftwards or downwards by two units each time. $P$ is the width of the padding, which denotes an extension of the input; here, the gray area shows zero padding with size 1.
 
 #### Pooling Layer
 
@@ -138,11 +136,11 @@ Fig. 6. LeNet-5 Convolutional Neural Network architecture<br/>
 
 [**LeNet-5**](http://yann.lecun.com/exdb/lenet/) is one of the simplest Convolutional Neural Networks. Fig. 6. shows its architecture: A 2-dimensional input image is fed into two sets of convolutional layers and pooling layers. This output is then fed to a fully connected layer and a softmax classifier. Compared to multilayer, fully connected perceptrons, the LeNet-5 can recognize images better. This is due to the following three properties of the convolution:
 
-- The 3D nature of the neurons: a convolutional layer is organized by width, height and depth. Neurons in each layer are connected to only a small region in the previous layer. This region is called the receptive field.
+- The 3D nature of the neurons: a convolutional layer is organized by width, height, and depth. Neurons in each layer are connected to only a small region in the previous layer. This region is called the receptive field.
 - Local connectivity: A CNN utilizes the local space correlation by connecting local neurons. This design guarantees that the learned filter has a strong response to local input features. Stacking many such layers generates a non-linear filter that is more global. This enables the network to first obtain good representation for small parts of input and then combine them to represent a larger region.
-- Weight sharing: In a CNN, computation is iterated on shared parameters (weights and bias) to form a feature map. This means that all the neurons in the same depth of the output respond to the same feature. This allows the network to detect a feature regardless of its position in the input. In other words, it is shift invariant.
+- Weight sharing: In a CNN, computation is iterated on shared parameters (weights and bias) to form a feature map. This means that all the neurons in the same depth of the output respond to the same feature. This allows the network to detect a feature regardless of its position in the input.
 
-For more details on Convolutional Neural Networks, please refer to the tutorial on [Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) and the [relevant lecture](http://cs231n.github.io/convolutional-networks/) from a Stanford open course.
+For more details on Convolutional Neural Networks, please refer to the tutorial on [Image Classification](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md) and the [relevant lecture](http://cs231n.github.io/convolutional-networks/) from a Stanford course.
 
 ### List of Common Activation Functions
 - Sigmoid activation function: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
@@ -263,11 +261,11 @@ trainer = paddle.trainer.SGD(cost=cost,
                              update_equation=optimizer)
 ```
 
-Then we specify the training data `paddle.dataset.movielens.train()` and testing data `paddle.dataset.movielens.test()`. These two methods are *reader creators*. Once called, a reader creator returns a *reader*.  A reader is a Python method, which, once called, returns a Python generator, which yields instances of data.
+Then we specify the training data `paddle.dataset.mnist.train()` and testing data `paddle.dataset.mnist.test()`. These two methods are *reader creators*. Once called, a reader creator returns a *reader*.  A reader is a Python method, which, once called, returns a Python generator, which yields instances of data.
 
-`shuffle` is a reader decorator. It takes in a reader A as input and returns a new reader B. Under the hood, B calls A to read data in the following fashion: it copies in `buffer_size` instances at a time into a buffer, shuffles the data, and yields the shuffled instances one at a time. A large buffer size would yield very shuffled data.
+`shuffle` is a reader decorator. It takes a reader A as input and returns a new reader B. Under the hood, B calls A to read data in the following fashion: it copies in `buffer_size` instances at a time into a buffer, shuffles the data, and yields the shuffled instances one at a time. A large buffer size would yield very shuffled data.
 
-`batch` is a special decorator, which takes in reader and outputs a *batch reader*, which doesn't yield an instance, but a minibatch at a time.
+`batch` is a special decorator, which takes a reader and outputs a *batch reader*, which doesn't yield an instance, but a minibatch at a time.
 
 `event_handler_plot` is used to plot a figure like below：
 
@@ -305,6 +303,7 @@ def event_handler_plot(event):
 ```python
 lists = []
 
+# event handler to print the progress
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
         if event.batch_id % 100 == 0:
@@ -324,6 +323,7 @@ def event_handler(event):
 ```
 
 ```python
+# Train the model now
 trainer.train(
     reader=paddle.batch(
         paddle.reader.shuffle(
@@ -357,7 +357,7 @@ Usually, with MNIST data, the softmax regression model achieves an accuracy arou
 
 ## Application
 
-After training is done, user can use the trained model to classify images. The following code shows how to inference MNIST images through `paddle.infer` interface.
+After training, users can use the trained model to classify images. The following code shows how to run inference on MNIST images through the `paddle.infer` interface.
 
 ```python
 from PIL import Image
@@ -385,15 +385,15 @@ print "Label of image/infer_3.png is: %d" % lab[0][0]
 
 This tutorial describes a few common deep learning models using **Softmax regression**, **Multilayer Perceptron Network**, and **Convolutional Neural Network**. Understanding these models is crucial for future learning; the subsequent tutorials derive more sophisticated networks by building on top of them.
 
-When our model evolves from a simple softmax regression to a slightly complex Convolutional Neural Network, the recognition accuracy on the MNIST data set achieves a large improvement in accuracy. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve the results of an old one.
+When our model evolves from a simple softmax regression to a slightly complex Convolutional Neural Network, the recognition accuracy on the MNIST dataset achieves a large improvement. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve the results of an old one.
 
-Moreover, this tutorial introduces the basic flow of PaddlePaddle model design, which starts with a *dataprovider*, a model layer construction, and finally training and prediction. Motivated readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.
+Moreover, this tutorial introduces the basic flow of PaddlePaddle model design, which starts with a *data provider*, a model layer construction, and finally training and prediction. Motivated readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.
 
 
 ## References
 
 1. LeCun, Yann, Léon Bottou, Yoshua Bengio, and Patrick Haffner. ["Gradient-based learning applied to document recognition."](http://ieeexplore.ieee.org/abstract/document/726791/) Proceedings of the IEEE 86, no. 11 (1998): 2278-2324.
-2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary SymbolRecognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A753279&dswid=-434) (2014).
+2. Wejéus, Samuel. ["A Neural Network Approach to Arbitrary Symbol Recognition on Modern Smartphones."](http://www.diva-portal.org/smash/record.jsf?pid=diva2:753279&dswid=-434) (2014).
 3. Decoste, Dennis, and Bernhard Schölkopf. ["Training invariant support vector machines."](http://link.springer.com/article/10.1023/A:1012454411458) Machine learning 46, no. 1-3 (2002): 161-190.
 4. Simard, Patrice Y., David Steinkraus, and John C. Platt. ["Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis."](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.160.8494&rep=rep1&type=pdf) In ICDAR, vol. 3, pp. 958-962. 2003.
 5. Salakhutdinov, Ruslan, and Geoffrey E. Hinton. ["Learning a Nonlinear Embedding by Preserving Class Neighbourhood Structure."](http://www.jmlr.org/proceedings/papers/v2/salakhutdinov07a/salakhutdinov07a.pdf) In AISTATS, vol. 11. 2007.
diff --git a/02.recognize_digits/train.py b/02.recognize_digits/train.py
index 52f09b78c781868e55655bc05bf8aa66926304a3..f71eea3e4739ab6dfb1bee5c4205d605a9a4c0ba 100644
--- a/02.recognize_digits/train.py
+++ b/02.recognize_digits/train.py
@@ -3,6 +3,8 @@ from PIL import Image
 import numpy as np
 import paddle.v2 as paddle
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'  # True unless WITH_GPU is unset or exactly '0'
+
 
 def softmax_regression(img):
     predict = paddle.layer.fc(
@@ -49,7 +51,7 @@ def convolutional_neural_network(img):
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
 
     # define network topology
     images = paddle.layer.data(
diff --git a/03.image_classification/README.cn.md b/03.image_classification/README.cn.md
index 3e057b8951bbf613c5d7368f86ae51b5001ea5d4..37027043a5cdbad7e87dbc3a99959305dff474a0 100644
--- a/03.image_classification/README.cn.md
+++ b/03.image_classification/README.cn.md
@@ -1,3 +1,4 @@
+
 # 图像分类
 
 本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。
diff --git a/03.image_classification/README.md b/03.image_classification/README.md
index 9c1554975f27716e61749b3d145cdd3e37318bdc..c7803f4f55d756cbbaa54f8cd658757a470b4c54 100644
--- a/03.image_classification/README.md
+++ b/03.image_classification/README.md
@@ -1,3 +1,4 @@
+
 Image Classification
 =======================
 
@@ -9,7 +10,7 @@ Compared to words, images provide much more vivid and easier to understand infor
 
 Image classification is the task of distinguishing images in different categories based on their semantic meaning. It is a core problem in computer vision and is also the foundation of other higher level computer vision tasks such as object detection, image segmentation, object tracking, action recognition, etc. Image classification has applications in many areas such as face recognition, intelligent video analysis in security systems, traffic scene recognition in transportation systems, content-based image retrieval and automatic photo indexing in web services, image classification in medicine, etc.
 
-To classify an image we first encode the entire image using handcrafted or learned features and then determine the category using a classifier. Thus, feature extraction plays an important role in image classification. Prior to deep learning the BoW(Bag of Words) model was the most widely used method for classifying an image as well as an object. The BoW technique was introduced in Natural Language Processing where a training sentence is represented as a bag of words. In the context of image classification, the BoW model requires constructing a dictionary. The simplest BoW framework can be designed with three steps: **feature extraction**, **feature encoding** and **classifier design**.
+To classify an image we firstly encode the entire image using handcrafted or learned features and then determine the category using a classifier. Thus, feature extraction plays an important role in image classification. Prior to deep learning the BoW(Bag of Words) model was the most widely used method for classifying an image as well as an object. The BoW technique was introduced in Natural Language Processing where a training sentence is represented as a bag of words. In the context of image classification, the BoW model requires constructing a dictionary. The simplest BoW framework can be designed with three steps: **feature extraction**, **feature encoding** and **classifier design**.
 
 Using Deep learning, image classification can be framed as a supervised or unsupervised learning problem that uses hierarchical features automatically without any need for manually crafted features from the image. In recent years, Convolutional Neural Networks (CNNs) have made significant progress in image classification. CNNs use raw image pixels as input, extract low-level and high-level abstract features through convolution operations, and directly output the classification results from the model. This style of end-to-end learning has lead to not only increased performance but also wider adoption various applications.
 
@@ -46,7 +47,7 @@ Figure 3. Disturbed images [22]
 
 ## Model Overview
 
-A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [ImageNet](http://image-net.org/) etc. Many image classification algorithms are usually evaluated and compared on these datasets. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is a dataset for Large Scale Visual Recognition Challenge (ILSVRC) started in 2010. In this chapter, we introduce some image classification models from the submissions to these competitions.
+A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [ImageNet](http://image-net.org/) etc. Many image classification algorithms are usually evaluated and compared on top of these datasets. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is a dataset for Large Scale Visual Recognition Challenge (ILSVRC) started in 2010. In this chapter, we introduce some image classification models from the submissions to these competitions.
 
 Before 2012, traditional image classification was accomplished with the three steps described in the background section. A complete model construction usually involves the following stages: low-level feature extraction, feature encoding, spatial constraint or feature clustering, classifier design, model ensemble.
 
@@ -86,7 +87,7 @@ Figure 5. A CNN example [20]
 
 - Dropout [10]: At each training stage, individual nodes are dropped out of the network with a certain probability. This improves the network's ability to generalize and avoids overfitting.
 
-Parameter updates at each layer during training causes input layer distributions to change and in turn requires hyper-parameters to be careful tuned. In 2015, Sergey Ioffe and Christian Szegedy proposed a Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer, and enables relatively stable distribution in each layer. Not only does BN algorithm act as a regularizer, but also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN algorithm accelerates the training convergence and has been widely used in later deeper models.
+Parameter updates at each layer during training causes input layer distributions to change and in turn requires hyper-parameters to be carefully tuned. In 2015, Sergey Ioffe and Christian Szegedy proposed a Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer, and enables relatively stable distribution in each layer. Not only does BN algorithm act as a regularizer, but also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN algorithm accelerates the training convergence and has been widely used in later deeper models.
 
 In the following sections, we will introduce the following network architectures - VGG, GoogleNet and ResNets.
 
@@ -118,7 +119,7 @@ Figure 7. Inception block
 
 GoogleNet consists of multiple stacked Inception blocks followed by an avg-pooling layer as in NIN instead of traditional fully connected layers. The difference between GoogleNet and NIN is that GoogleNet adds a fully connected layer after avg-pooling layer to output a vector of category size. Besides these two characteristics, the features from middle layers of a GoogleNet are also very discriminative. Therefore, GoogeleNet inserts two auxiliary classifiers in the model for enhancing gradient and regularization when doing backpropagation. The loss function of the whole network is the weighted sum of these three classifiers.
 
-Figure 8 illustrates the neural architecture of a GoogleNet which consists of 22 layers: it starts with three regular convolutional layers followed by three groups of sub-networks -- the first group contains two Inception blocks, the second one five, and the third one two. It ends up with an average pooling and a fully-connected layer.
+Figure 8 illustrates the neural architecture of a GoogleNet which consists of 22 layers: it starts with three regular convolutional layers followed by three groups of sub-networks -- the first group contains two Inception blocks, the second group has five, and the third group has two. It ends with an average pooling and a fully-connected layer.
 
 <p align="center">
 <img src="image/googlenet.jpeg" ><br/>
@@ -129,7 +130,7 @@ The above model is the first version of GoogleNet or GoogelNet-v1. GoogleNet-v2
 
 ### ResNet
 
-Residual Network(ResNet)[15] won the 2015 championship on three ImageNet competitions -- image classification, object localization, and object detection. The main challenge in training deeper networks is that accuracy degrades with network depth. The authors of ResNet proposed a residual learning approach to ease the difficulty of training deeper networks. Based on the design ideas of BN, small convolutional kernels, full convolutional network, ResNets reformulate the layers as residual blocks, with each block containing two branches, one directly connecting input to the output, the other performing two to three convolutions and calculating the residual function with reference to the layer inputs. The outputs of these two branches are then added up.
+Residual Network(ResNet)[15] won the 2015 championship on three ImageNet competitions -- image classification, object localization, and object detection. The main challenge in training deeper networks is that accuracy degrades with network depth. The authors of ResNet proposed a residual learning approach to ease the difficulty of training deeper networks. Based on the design ideas of BN, small convolutional kernels, full convolutional network, ResNets reformulate the layers as residual blocks, with each block containing two branches, one directly connecting input to the output, the other performing two to three convolutions and calculating the residual function with reference to the layer's inputs. The outputs of these two branches are then added up.
 
 Figure 9 illustrates the ResNet architecture. To the left is the basic building block, it consists of two 3x3 convolutional layers of the same channels. To the right is a Bottleneck block. The bottleneck is a 1x1 convolutional layer used to reduce dimension from 256 to 64. The other 1x1 convolutional layer is used to increase dimension from 64 to 256. Thus, the number of input and output channels of the middle 3x3 convolutional layer is 64, which is relatively small.
 
@@ -159,7 +160,7 @@ Figure 11. CIFAR10 dataset[21]
 
  `paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens` and `wmt14`, etc. There's no need to manually download and preprocess CIFAR-10.
 
-After issuing a command `python train.py`, training will start immediately. The following sections describe the details:
+After running the command `python train.py`, training will start immediately. The following sections describe the details.
 
 ## Model Structure
 
@@ -176,12 +177,11 @@ from resnet import resnet_cifar10
 # PaddlePaddle init
 paddle.init(use_gpu=False, trainer_count=1)
 ```
-
-As mentioned in section [Model Overview](#model-overview), here we provide the implementations of the VGG and ResNet models.
+Now we are going to walk you through the implementations of the VGG and ResNet models.
 
 ### VGG
 
-First, we use a VGG network. Since the image size and amount of CIFAR10 are relatively small comparing to ImageNet, we use a small version of VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations.
+Let's start with the VGG model. Since the image size and amount of CIFAR10 are relatively small comparing to ImageNet, we use a small version of VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations.
 
 1. Define input data and its dimension
 
@@ -232,7 +232,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
         return fc2
     ```
 
-    2.1. First, define a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
+    2.1. Firstly, it defines a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
 
     2.2. Five groups of convolutions. The first two groups perform two convolutions, while the last three groups perform three convolutions. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for this layer.
 
@@ -260,7 +260,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
 
 ### ResNet
 
-The first, third and fourth steps of a ResNet are the same as a VGG. The second one is the main module.
+The first, third and fourth steps of a ResNet are the same as a VGG. The second step is the main module of ResNet.
 
 ```python
 net = resnet_cifar10(image, depth=56)
@@ -343,7 +343,7 @@ def resnet_cifar10(ipt, depth=32):
 
 ### Define Parameters
 
-First, we create the model parameters according to the previous model configuration `cost`.
+Firstly, we create the model parameters according to the previous model configuration `cost`.
 
 ```python
 # Create parameters
@@ -481,7 +481,7 @@ Figure 12. The error rate of VGG model on CIFAR10
 
 ## Application
 
-After training is done, users can use the trained model to classify images. The following code shows how to infer through `paddle.infer` interface. You can remove the comments to change the model name.
+After training is completed, users can use the trained model to classify images. The following code shows how to infer through the `paddle.infer` interface. You can uncomment the relevant lines in the code below to change the model name.
 
 ```python
 from PIL import Image
@@ -519,7 +519,7 @@ print "Label of image/dog.png is: %d" % lab[0][0]
 
 ## Conclusion
 
-Traditional image classification methods have complicated frameworks that involve multiple stages of processing. In contrast, CNN models can be trained end-to-end with a significant increase in classification accuracy. In this chapter, we introduced three models -- VGG, GoogleNet, ResNet and provided PaddlePaddle config files for training VGG and ResNet on CIFAR10. We also explained how to perform prediction and feature extraction using the PaddlePaddle API. For other datasets such as ImageNet, the procedure for config and training are the same and you are welcome to give it a try.
+Traditional image classification methods involve multiple stages of processing and rely on complex frameworks. In contrast, CNN models can be trained end-to-end with a significant increase in classification accuracy. In this chapter, we introduced three models -- VGG, GoogleNet, ResNet and provided PaddlePaddle config files for training VGG and ResNet on CIFAR10. We also explained how to perform prediction and feature extraction using the PaddlePaddle API. For other datasets such as ImageNet, the config and training procedures are the same and you are welcome to give it a try.
 
 
 ## Reference
diff --git a/03.image_classification/index.cn.html b/03.image_classification/index.cn.html
index eff3e3446b65ce0120e106b9b18173f773d29e67..32d64a32f54859b260f3589fba80286f883a8def 100644
--- a/03.image_classification/index.cn.html
+++ b/03.image_classification/index.cn.html
@@ -40,6 +40,7 @@
 
 <!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
 <div id="markdown" style='display:none'>
+
 # 图像分类
 
 本教程源代码目录在[book/image_classification](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)， 初次使用请参考PaddlePaddle[安装教程](https://github.com/PaddlePaddle/book/blob/develop/README.cn.md#运行这本书)，更多内容请参考本教程的[视频课堂](http://bit.baidu.com/course/detail/id/168.html)。
diff --git a/03.image_classification/index.html b/03.image_classification/index.html
index afc72f64cd8d08586802ee9534a65a18547d20ce..e4caac7b858a6ebe0e9b3b36eb8705c73e62034a 100644
--- a/03.image_classification/index.html
+++ b/03.image_classification/index.html
@@ -40,6 +40,7 @@
 
 <!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
 <div id="markdown" style='display:none'>
+
 Image Classification
 =======================
 
@@ -51,7 +52,7 @@ Compared to words, images provide much more vivid and easier to understand infor
 
 Image classification is the task of distinguishing images in different categories based on their semantic meaning. It is a core problem in computer vision and is also the foundation of other higher level computer vision tasks such as object detection, image segmentation, object tracking, action recognition, etc. Image classification has applications in many areas such as face recognition, intelligent video analysis in security systems, traffic scene recognition in transportation systems, content-based image retrieval and automatic photo indexing in web services, image classification in medicine, etc.
 
-To classify an image we first encode the entire image using handcrafted or learned features and then determine the category using a classifier. Thus, feature extraction plays an important role in image classification. Prior to deep learning the BoW(Bag of Words) model was the most widely used method for classifying an image as well as an object. The BoW technique was introduced in Natural Language Processing where a training sentence is represented as a bag of words. In the context of image classification, the BoW model requires constructing a dictionary. The simplest BoW framework can be designed with three steps: **feature extraction**, **feature encoding** and **classifier design**.
+To classify an image we firstly encode the entire image using handcrafted or learned features and then determine the category using a classifier. Thus, feature extraction plays an important role in image classification. Prior to deep learning the BoW(Bag of Words) model was the most widely used method for classifying an image as well as an object. The BoW technique was introduced in Natural Language Processing where a training sentence is represented as a bag of words. In the context of image classification, the BoW model requires constructing a dictionary. The simplest BoW framework can be designed with three steps: **feature extraction**, **feature encoding** and **classifier design**.
 
 Using Deep learning, image classification can be framed as a supervised or unsupervised learning problem that uses hierarchical features automatically without any need for manually crafted features from the image. In recent years, Convolutional Neural Networks (CNNs) have made significant progress in image classification. CNNs use raw image pixels as input, extract low-level and high-level abstract features through convolution operations, and directly output the classification results from the model. This style of end-to-end learning has lead to not only increased performance but also wider adoption various applications.
 
@@ -88,7 +89,7 @@ Figure 3. Disturbed images [22]
 
 ## Model Overview
 
-A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [ImageNet](http://image-net.org/) etc. Many image classification algorithms are usually evaluated and compared on these datasets. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is a dataset for Large Scale Visual Recognition Challenge (ILSVRC) started in 2010. In this chapter, we introduce some image classification models from the submissions to these competitions.
+A large amount of research in image classification is built upon public datasets such as [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [ImageNet](http://image-net.org/) etc. Many image classification algorithms are usually evaluated and compared on top of these datasets. PASCAL VOC is a computer vision competition started in 2005, and ImageNet is a dataset for Large Scale Visual Recognition Challenge (ILSVRC) started in 2010. In this chapter, we introduce some image classification models from the submissions to these competitions.
 
 Before 2012, traditional image classification was accomplished with the three steps described in the background section. A complete model construction usually involves the following stages: low-level feature extraction, feature encoding, spatial constraint or feature clustering, classifier design, model ensemble.
 
@@ -128,7 +129,7 @@ Figure 5. A CNN example [20]
 
 - Dropout [10]: At each training stage, individual nodes are dropped out of the network with a certain probability. This improves the network's ability to generalize and avoids overfitting.
 
-Parameter updates at each layer during training causes input layer distributions to change and in turn requires hyper-parameters to be careful tuned. In 2015, Sergey Ioffe and Christian Szegedy proposed a Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer, and enables relatively stable distribution in each layer. Not only does BN algorithm act as a regularizer, but also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN algorithm accelerates the training convergence and has been widely used in later deeper models.
+Parameter updates at each layer during training causes input layer distributions to change and in turn requires hyper-parameters to be carefully tuned. In 2015, Sergey Ioffe and Christian Szegedy proposed a Batch Normalization (BN) algorithm [14], which normalizes the features of each batch in a layer, and enables relatively stable distribution in each layer. Not only does BN algorithm act as a regularizer, but also reduces the need for careful hyper-parameter design. Experiments demonstrate that BN algorithm accelerates the training convergence and has been widely used in later deeper models.
 
 In the following sections, we will introduce the following network architectures - VGG, GoogleNet and ResNets.
 
@@ -160,7 +161,7 @@ Figure 7. Inception block
 
 GoogleNet consists of multiple stacked Inception blocks followed by an avg-pooling layer as in NIN instead of traditional fully connected layers. The difference between GoogleNet and NIN is that GoogleNet adds a fully connected layer after avg-pooling layer to output a vector of category size. Besides these two characteristics, the features from middle layers of a GoogleNet are also very discriminative. Therefore, GoogeleNet inserts two auxiliary classifiers in the model for enhancing gradient and regularization when doing backpropagation. The loss function of the whole network is the weighted sum of these three classifiers.
 
-Figure 8 illustrates the neural architecture of a GoogleNet which consists of 22 layers: it starts with three regular convolutional layers followed by three groups of sub-networks -- the first group contains two Inception blocks, the second one five, and the third one two. It ends up with an average pooling and a fully-connected layer.
+Figure 8 illustrates the neural architecture of a GoogleNet which consists of 22 layers: it starts with three regular convolutional layers followed by three groups of sub-networks -- the first group contains two Inception blocks, the second group has five, and the third group has two. It ends with an average pooling and a fully-connected layer.
 
 <p align="center">
 <img src="image/googlenet.jpeg" ><br/>
@@ -171,7 +172,7 @@ The above model is the first version of GoogleNet or GoogelNet-v1. GoogleNet-v2
 
 ### ResNet
 
-Residual Network(ResNet)[15] won the 2015 championship on three ImageNet competitions -- image classification, object localization, and object detection. The main challenge in training deeper networks is that accuracy degrades with network depth. The authors of ResNet proposed a residual learning approach to ease the difficulty of training deeper networks. Based on the design ideas of BN, small convolutional kernels, full convolutional network, ResNets reformulate the layers as residual blocks, with each block containing two branches, one directly connecting input to the output, the other performing two to three convolutions and calculating the residual function with reference to the layer inputs. The outputs of these two branches are then added up.
+Residual Network(ResNet)[15] won the 2015 championship on three ImageNet competitions -- image classification, object localization, and object detection. The main challenge in training deeper networks is that accuracy degrades with network depth. The authors of ResNet proposed a residual learning approach to ease the difficulty of training deeper networks. Based on the design ideas of BN, small convolutional kernels, full convolutional network, ResNets reformulate the layers as residual blocks, with each block containing two branches, one directly connecting input to the output, the other performing two to three convolutions and calculating the residual function with reference to the layer's inputs. The outputs of these two branches are then added up.
 
 Figure 9 illustrates the ResNet architecture. To the left is the basic building block, it consists of two 3x3 convolutional layers of the same channels. To the right is a Bottleneck block. The bottleneck is a 1x1 convolutional layer used to reduce dimension from 256 to 64. The other 1x1 convolutional layer is used to increase dimension from 64 to 256. Thus, the number of input and output channels of the middle 3x3 convolutional layer is 64, which is relatively small.
 
@@ -201,7 +202,7 @@ Figure 11. CIFAR10 dataset[21]
 
  `paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens` and `wmt14`, etc. There's no need to manually download and preprocess CIFAR-10.
 
-After issuing a command `python train.py`, training will start immediately. The following sections describe the details:
+After running the command `python train.py`, training will start immediately. The following sections describe the details.
 
 ## Model Structure
 
@@ -218,12 +219,11 @@ from resnet import resnet_cifar10
 # PaddlePaddle init
 paddle.init(use_gpu=False, trainer_count=1)
 ```
-
-As mentioned in section [Model Overview](#model-overview), here we provide the implementations of the VGG and ResNet models.
+Now we are going to walk you through the implementations of the VGG and ResNet models.
 
 ### VGG
 
-First, we use a VGG network. Since the image size and amount of CIFAR10 are relatively small comparing to ImageNet, we use a small version of VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations.
+Let's start with the VGG model. Since the image size and amount of CIFAR10 are relatively small comparing to ImageNet, we use a small version of VGG network for CIFAR10. Convolution groups incorporate BN and dropout operations.
 
 1. Define input data and its dimension
 
@@ -274,7 +274,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
         return fc2
     ```
 
-    2.1. First, define a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
+    2.1. Firstly, it defines a convolution block or conv_block. The default convolution kernel is 3x3, and the default pooling size is 2x2 with stride 2. Dropout specifies the probability in dropout operation. Function `img_conv_group` is defined in `paddle.networks` consisting of a series of `Conv->BN->ReLu->Dropout` and a `Pooling`.
 
     2.2. Five groups of convolutions. The first two groups perform two convolutions, while the last three groups perform three convolutions. The dropout rate of the last convolution in each group is set to 0, which means there is no dropout for this layer.
 
@@ -302,7 +302,7 @@ First, we use a VGG network. Since the image size and amount of CIFAR10 are rela
 
 ### ResNet
 
-The first, third and fourth steps of a ResNet are the same as a VGG. The second one is the main module.
+The first, third and fourth steps of a ResNet are the same as a VGG. The second step is the main module of ResNet.
 
 ```python
 net = resnet_cifar10(image, depth=56)
@@ -385,7 +385,7 @@ def resnet_cifar10(ipt, depth=32):
 
 ### Define Parameters
 
-First, we create the model parameters according to the previous model configuration `cost`.
+Firstly, we create the model parameters according to the previous model configuration `cost`.
 
 ```python
 # Create parameters
@@ -523,7 +523,7 @@ Figure 12. The error rate of VGG model on CIFAR10
 
 ## Application
 
-After training is done, users can use the trained model to classify images. The following code shows how to infer through `paddle.infer` interface. You can remove the comments to change the model name.
+After training is completed, users can use the trained model to classify images. The following code shows how to infer through the `paddle.infer` interface. You can uncomment the relevant lines in the code below to change the model name.
 
 ```python
 from PIL import Image
@@ -561,7 +561,7 @@ print "Label of image/dog.png is: %d" % lab[0][0]
 
 ## Conclusion
 
-Traditional image classification methods have complicated frameworks that involve multiple stages of processing. In contrast, CNN models can be trained end-to-end with a significant increase in classification accuracy. In this chapter, we introduced three models -- VGG, GoogleNet, ResNet and provided PaddlePaddle config files for training VGG and ResNet on CIFAR10. We also explained how to perform prediction and feature extraction using the PaddlePaddle API. For other datasets such as ImageNet, the procedure for config and training are the same and you are welcome to give it a try.
+Traditional image classification methods involve multiple stages of processing and rely on complex frameworks. In contrast, CNN models can be trained end-to-end with a significant increase in classification accuracy. In this chapter, we introduced three models -- VGG, GoogleNet, ResNet and provided PaddlePaddle config files for training VGG and ResNet on CIFAR10. We also explained how to perform prediction and feature extraction using the PaddlePaddle API. For other datasets such as ImageNet, the config and training procedures are the same and you are welcome to give it a try.
 
 
 ## Reference
diff --git a/03.image_classification/train.py b/03.image_classification/train.py
index 0c800308ed2b19a86e27b5114b53a8b69eb7a77c..e6caf7dc9bf3f8420b60d84407d27160fde81708 100644
--- a/03.image_classification/train.py
+++ b/03.image_classification/train.py
@@ -12,20 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-import sys
+import sys, os
 
 import paddle.v2 as paddle
 
 from vgg import vgg_bn_drop
 from resnet import resnet_cifar10
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 
 def main():
     datadim = 3 * 32 * 32
     classdim = 10
 
     # PaddlePaddle init
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
 
     image = paddle.layer.data(
         name="image", type=paddle.data_type.dense_vector(datadim))
@@ -79,6 +81,12 @@ def main():
     # Create trainer
     trainer = paddle.trainer.SGD(
         cost=cost, parameters=parameters, update_equation=momentum_optimizer)
+
+    # Save the inference topology to protobuf.
+    inference_topology = paddle.topology.Topology(layers=out)
+    with open("inference_topology.pkl", 'wb') as f:
+        inference_topology.serialize_for_inference(f)
+
     trainer.train(
         reader=paddle.batch(
             paddle.reader.shuffle(
diff --git a/04.word2vec/README.cn.md b/04.word2vec/README.cn.md
index 7c5e9bc1eb4dc89ebe2eb61cb250c0663008412c..bee25b65923535005bce68ec37338a3ffbe4cdd4 100644
--- a/04.word2vec/README.cn.md
+++ b/04.word2vec/README.cn.md
@@ -207,6 +207,28 @@ hiddensize = 256 # 隐层维度
 N = 5 # 训练5-Gram
 ```
 
+用于保存和加载word_dict和embedding table的函数
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
+
 接着，定义网络结构：
 
 - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
@@ -333,6 +355,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 
 经过30个pass，我们将得到平均错误率为classification_error_evaluator=0.735611。
 
+## 保存词典和embedding
+
+训练完成之后，我们可以把词典和embedding table单独保存下来，后面可以直接使用
+
+```python
+# save word dict and embedding table
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
+
 
 ## 应用模型
 训练模型后，我们可以加载模型参数，用训练出来的词向量初始化其他模型，也可以将模型查看参数用来做后续应用。
diff --git a/04.word2vec/README.md b/04.word2vec/README.md
index eeb965069e5c13b657d44f8ede053acbf1ad08ad..39b93063fb6dc4c811136a949c0c967488dde9d1 100644
--- a/04.word2vec/README.md
+++ b/04.word2vec/README.md
@@ -1,20 +1,20 @@
 # Word2Vec
 
-This is intended as a reference tutorial. The source code of this tutorial lives on [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec).
+This is intended as a reference tutorial. The source code of this tutorial is located at [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec).
 
 For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Background Introduction
 
-This section introduces the concept of **word embedding**, which is a vector representation of words. It is a popular technique used in natural language processing. Word embeddings support many Internet services, including search engines, advertising systems, and recommendation systems.
+This section introduces the concept of **word embeddings**, which are vector representations of words. Word embedding is a popular technique used in natural language processing to support applications such as search engines, advertising systems, and recommendation systems.
 
 ### One-Hot Vectors
 
-Building these services requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words to make them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as an **one-hot vector**.
+Building these applications requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words to make them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as an **one-hot vector**.
 
 For each word, its vector representation has the corresponding entry in the vector as 1, and all other entries as 0. The lengths of one-hot vectors match the size of the dictionary. Each entry of a vector corresponds to the presence (or absence) of a word in the dictionary.
 
-One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: Suppose a customer enters the query "Mother's Day", while an ad bids for the keyword carnations". Because the one-hot vectors of these two words are perpendicular, the metric distance (either Euclidean or cosine similarity) between them would indicate  little relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity in each vector. That is, comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which could be learned from large amounts of data through machine learning methods.
+One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: Suppose a customer enters the query "Mother's Day", while an ad bids for the keyword "carnations". Because the one-hot vectors of these two words are perpendicular, the metric distance (either Euclidean or cosine similarity) between them would indicate  little relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity in each vector. That is, comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which could be learned from large amounts of data through machine learning methods.
 
 Like many machine learning models, word embeddings can represent knowledge in various ways. Another model may project an one-hot vector to an embedding vector of lower dimension e.g. $embedding(mother's day) = [0.3, 4.2, -1.5, ...], embedding(carnations) = [0.2, 5.6, -2.3, ...]$. Mapping one-hot vectors onto an embedded vector space has the potential to bring the embedding vectors of similar words (either semantically or usage-wise) closer to each other, so that the cosine similarity between the corresponding vectors for words like "Mother's Day" and "carnations" are no longer zero.
 
@@ -33,7 +33,7 @@ The neural network based model does not require storing huge hash tables of stat
 
 ## Results Demonstration
 
-In this section, after training the word embedding model, we could use the data visualization algorithm $t-$SNE\[[4](#reference)\] to draw the word embedding vectors after projecting them onto a two-dimensional space (see figure below). From the figure we could see that the semantically relevant words -- *a*, *the*, and *these* or *big* and *huge* -- are close to each other in the projected space, while irrelevant words -- *say* and *business* or *decision* and *japan* -- are far from each other.
+In this section, we use the $t-$SNE\[[4](#reference)\] data visualization algorithm to draw the word embedding vectors after projecting them onto a two-dimensional space (see figure below). From the figure we can see that the semantically relevant words -- *a*, *the*, and *these* or *big* and *huge* -- are close to each other in the projected space, while irrelevant words -- *say* and *business* or *decision* and *japan* -- are far from each other.
 
 <p align="center">
     <img src = "image/2d_similarity.png" width=400><br/>
@@ -52,14 +52,14 @@ please input two words: from company
 similarity: -0.0997506977351
 ```
 
-The above results could be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instruction, see section [Model Application](#Model Application).
+The above results could be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instruction, see section [Model Application](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#model-application).
 
 
 ## Model Overview
 
 In this section, we will introduce three word embedding models: N-gram model, CBOW, and Skip-gram, which all output the frequency of each word given its immediate context.
 
-For N-gram model, we will first introduce the concept of language model, and implement it using PaddlePaddle in section [Model Training](#Model Training).
+For the N-gram model, we will first introduce the concept of a language model, and implement it using PaddlePaddle in section [Training](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#training).
 
 The latter two models, which became popular recently, are neural word embedding model developed by Tomas Mikolov at Google \[[3](#reference)\]. Despite their apparent simplicity, these models train very well.
 
@@ -93,7 +93,7 @@ Given some real corpus in which all sentences are meaningful, the n-gram model s
 
 $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 
-where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents parameter regularization term.
+where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional logarithmic probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents parameter regularization term.
 
 <p align="center">
        <img src="image/nnlm_en.png" width=500><br/>
@@ -151,7 +151,7 @@ As illustrated in the figure above, skip-gram model maps the word embedding of t
 
 ## Dataset
 
-We will use Peen Treebank (PTB) (Tomas Mikolov's pre-processed version) dataset. PTB is a small dataset, used in Recurrent Neural Network Language Modeling Toolkit\[[2](#reference)\]. Its statistics are as follows:
+We will use Penn Treebank (PTB) (Tomas Mikolov's pre-processed version) dataset. PTB is a small dataset, used in Recurrent Neural Network Language Modeling Toolkit\[[2](#reference)\]. Its statistics are as follows:
 
 <p align="center">
 <table>
@@ -224,6 +224,29 @@ hiddensize = 256 # hidden layer dimension
 N = 5 # train 5-gram
 ```
 
+
+- Functions used to save and load the word dict and embedding table:
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
+
 - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector though matrix of dimention $|V|\times D$ (D=32 in this example).
 
 ```python
@@ -343,10 +366,20 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 
 After 30 passes, we can get average error rate around 0.735611.
 
+## Save word dict and embedding table
+
+After training, we can save the word dict and embedding table for future use.
+
+```python
+# save word dict and embedding table
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
+
 
 ## Model Application
 
-After the model is trained, we can load saved model parameters and uses it for other models. We can also use the parameters in applications.
+After the model is trained, we can load the saved model parameters and use them for other models. We can also use the parameters in various applications.
 
 ### Viewing Word Vector
 
@@ -401,7 +434,7 @@ print spatial.distance.cosine(emb_1, emb_2)
 
 ## Conclusion
 
-This chapter introduces word embedding, the relationship between language model and word embedding, and how to train neural networks to learn word embedding.
+This chapter introduces word embeddings, the relationship between language model and word embedding, and how to train neural networks to learn word embedding.
 
 In information retrieval, the relevance between the query and document keyword can be computed through the cosine similarity of their word embeddings. In grammar analysis and semantic analysis, a previously trained word embedding can initialize models for better performance. In document classification, clustering the word embedding can group synonyms in the documents. We hope that readers can use word embedding models in their work after reading this chapter.
 
diff --git a/04.word2vec/index.cn.html b/04.word2vec/index.cn.html
index 5bb2b31efb61b1edb188778daada87a3dee1cdff..23729bf387e9eea06970aee13b29c846e680f4fb 100644
--- a/04.word2vec/index.cn.html
+++ b/04.word2vec/index.cn.html
@@ -249,6 +249,28 @@ hiddensize = 256 # 隐层维度
 N = 5 # 训练5-Gram
 ```
 
+用于保存和加载word_dict和embedding table的函数
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
+
 接着，定义网络结构：
 
 - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
@@ -375,6 +397,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 
 经过30个pass，我们将得到平均错误率为classification_error_evaluator=0.735611。
 
+## 保存词典和embedding
+
+训练完成之后，我们可以把词典和embedding table单独保存下来，后面可以直接使用
+
+```python
+# save word dict and embedding table
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
+
 
 ## 应用模型
 训练模型后，我们可以加载模型参数，用训练出来的词向量初始化其他模型，也可以将模型查看参数用来做后续应用。
diff --git a/04.word2vec/index.html b/04.word2vec/index.html
index 1d15b583e6fb47250adf9d02ad6923d1bd86d178..3a17a95541a9672ff3740da256086a6d8f936eed 100644
--- a/04.word2vec/index.html
+++ b/04.word2vec/index.html
@@ -42,21 +42,21 @@
 <div id="markdown" style='display:none'>
 # Word2Vec
 
-This is intended as a reference tutorial. The source code of this tutorial lives on [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec).
+This is intended as a reference tutorial. The source code of this tutorial is located at [book/word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec).
 
 For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Background Introduction
 
-This section introduces the concept of **word embedding**, which is a vector representation of words. It is a popular technique used in natural language processing. Word embeddings support many Internet services, including search engines, advertising systems, and recommendation systems.
+This section introduces the concept of **word embeddings**, which are vector representations of words. Word embedding is a popular technique used in natural language processing to support applications such as search engines, advertising systems, and recommendation systems.
 
 ### One-Hot Vectors
 
-Building these services requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words to make them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as an **one-hot vector**.
+Building these applications requires us to quantify the similarity between two words or paragraphs. This calls for a new representation of all the words to make them more suitable for computation. An obvious way to achieve this is through the vector space model, where every word is represented as an **one-hot vector**.
 
 For each word, its vector representation has the corresponding entry in the vector as 1, and all other entries as 0. The lengths of one-hot vectors match the size of the dictionary. Each entry of a vector corresponds to the presence (or absence) of a word in the dictionary.
 
-One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: Suppose a customer enters the query "Mother's Day", while an ad bids for the keyword carnations". Because the one-hot vectors of these two words are perpendicular, the metric distance (either Euclidean or cosine similarity) between them would indicate  little relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity in each vector. That is, comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which could be learned from large amounts of data through machine learning methods.
+One-hot vectors are intuitive, yet they have limited usefulness. Take the example of an Internet advertising system: Suppose a customer enters the query "Mother's Day", while an ad bids for the keyword "carnations". Because the one-hot vectors of these two words are perpendicular, the metric distance (either Euclidean or cosine similarity) between them would indicate  little relevance. However, *we* know that these two queries are connected semantically, since people often gift their mothers bundles of carnation flowers on Mother's Day. This discrepancy is due to the low information capacity in each vector. That is, comparing the vector representations of two words does not assess their relevance sufficiently. To calculate their similarity accurately, we need more information, which could be learned from large amounts of data through machine learning methods.
 
 Like many machine learning models, word embeddings can represent knowledge in various ways. Another model may project an one-hot vector to an embedding vector of lower dimension e.g. $embedding(mother's day) = [0.3, 4.2, -1.5, ...], embedding(carnations) = [0.2, 5.6, -2.3, ...]$. Mapping one-hot vectors onto an embedded vector space has the potential to bring the embedding vectors of similar words (either semantically or usage-wise) closer to each other, so that the cosine similarity between the corresponding vectors for words like "Mother's Day" and "carnations" are no longer zero.
 
@@ -75,7 +75,7 @@ The neural network based model does not require storing huge hash tables of stat
 
 ## Results Demonstration
 
-In this section, after training the word embedding model, we could use the data visualization algorithm $t-$SNE\[[4](#reference)\] to draw the word embedding vectors after projecting them onto a two-dimensional space (see figure below). From the figure we could see that the semantically relevant words -- *a*, *the*, and *these* or *big* and *huge* -- are close to each other in the projected space, while irrelevant words -- *say* and *business* or *decision* and *japan* -- are far from each other.
+In this section, we use the $t-$SNE\[[4](#reference)\] data visualization algorithm to draw the word embedding vectors after projecting them onto a two-dimensional space (see figure below). From the figure we can see that the semantically relevant words -- *a*, *the*, and *these* or *big* and *huge* -- are close to each other in the projected space, while irrelevant words -- *say* and *business* or *decision* and *japan* -- are far from each other.
 
 <p align="center">
     <img src = "image/2d_similarity.png" width=400><br/>
@@ -94,14 +94,14 @@ please input two words: from company
 similarity: -0.0997506977351
 ```
 
-The above results could be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instruction, see section [Model Application](#Model Application).
+The above results could be obtained by running `calculate_dis.py`, which loads the words in the dictionary and their corresponding trained word embeddings. For detailed instruction, see section [Model Application](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#model-application).
 
 
 ## Model Overview
 
 In this section, we will introduce three word embedding models: N-gram model, CBOW, and Skip-gram, which all output the frequency of each word given its immediate context.
 
-For N-gram model, we will first introduce the concept of language model, and implement it using PaddlePaddle in section [Model Training](#Model Training).
+For the N-gram model, we will first introduce the concept of a language model, and implement it using PaddlePaddle in section [Training](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec#training).
 
 The latter two models, which became popular recently, are neural word embedding model developed by Tomas Mikolov at Google \[[3](#reference)\]. Despite their apparent simplicity, these models train very well.
 
@@ -135,7 +135,7 @@ Given some real corpus in which all sentences are meaningful, the n-gram model s
 
 $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 
-where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents parameter regularization term.
+where $f(w_t, w_{t-1}, ..., w_{t-n+1})$ represents the conditional logarithmic probability of the current word $w_t$ given its previous $n-1$ words, and $R(\theta)$ represents parameter regularization term.
 
 <p align="center">
        <img src="image/nnlm_en.png" width=500><br/>
@@ -193,7 +193,7 @@ As illustrated in the figure above, skip-gram model maps the word embedding of t
 
 ## Dataset
 
-We will use Peen Treebank (PTB) (Tomas Mikolov's pre-processed version) dataset. PTB is a small dataset, used in Recurrent Neural Network Language Modeling Toolkit\[[2](#reference)\]. Its statistics are as follows:
+We will use Penn Treebank (PTB) (Tomas Mikolov's pre-processed version) dataset. PTB is a small dataset, used in Recurrent Neural Network Language Modeling Toolkit\[[2](#reference)\]. Its statistics are as follows:
 
 <p align="center">
 <table>
@@ -266,6 +266,29 @@ hiddensize = 256 # hidden layer dimension
 N = 5 # train 5-gram
 ```
 
+
+- Functions used to save and load the word dict and embedding table:
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
+
 - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector though matrix of dimention $|V|\times D$ (D=32 in this example).
 
 ```python
@@ -385,10 +408,20 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 
 After 30 passes, we can get average error rate around 0.735611.
 
+## Save word dict and embedding table
+
+After training, we can save the word dict and embedding table for future use.
+
+```python
+# save word dict and embedding table
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
+
 
 ## Model Application
 
-After the model is trained, we can load saved model parameters and uses it for other models. We can also use the parameters in applications.
+After the model is trained, we can load the saved model parameters and use them for other models. We can also use the parameters in various applications.
 
 ### Viewing Word Vector
 
@@ -443,7 +476,7 @@ print spatial.distance.cosine(emb_1, emb_2)
 
 ## Conclusion
 
-This chapter introduces word embedding, the relationship between language model and word embedding, and how to train neural networks to learn word embedding.
+This chapter introduces word embeddings, the relationship between language model and word embedding, and how to train neural networks to learn word embedding.
 
 In information retrieval, the relevance between the query and document keyword can be computed through the cosine similarity of their word embeddings. In grammar analysis and semantic analysis, a previously trained word embedding can initialize models for better performance. In document classification, clustering the word embedding can group synonyms in the documents. We hope that readers can use word embedding models in their work after reading this chapter.
 
diff --git a/04.word2vec/train.py b/04.word2vec/train.py
index eb596673ce8ad55dfe8c5c258beb344e825b7c25..6940c6eebfc9a5dca1af9deef9f8f93ac90f1635 100644
--- a/04.word2vec/train.py
+++ b/04.word2vec/train.py
@@ -1,7 +1,11 @@
 import math
+import os
 
+import numpy
 import paddle.v2 as paddle
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 embsize = 32
 hiddensize = 256
 N = 5
@@ -16,8 +20,28 @@ def wordemb(inlayer):
     return wordemb
 
 
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        numpy.savetxt(f, embeddings, delimiter=',', newline='\n')
+
+
+def load_dict_and_embedding():
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            key, value = line.strip().split(" ")
+            word_dict[key] = int(value)
+
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+
+
 def main():
-    paddle.init(use_gpu=False, trainer_count=3)
+    paddle.init(use_gpu=with_gpu, trainer_count=3)
     word_dict = paddle.dataset.imikolov.build_dict()
     dict_size = len(word_dict)
     # Every layer takes integer value of range [0, dict_size)
@@ -77,6 +101,10 @@ def main():
         num_passes=100,
         event_handler=event_handler)
 
+    # save word dict and embedding table
+    embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+    save_dict_and_embedding(word_dict, embeddings)
+
 
 if __name__ == '__main__':
     main()
diff --git a/05.recommender_system/README.md b/05.recommender_system/README.md
index 6ba636f705bb0a9bbd7672f50cb462d7cfc66069..1089bbfc928098d4ca01469026a0c83197d7530d 100644
--- a/05.recommender_system/README.md
+++ b/05.recommender_system/README.md
@@ -1,25 +1,19 @@
 # Personalized Recommendation
 
-The source code of this tutorial is in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system).
-
-For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code for this tutorial is available [here](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system).  For instructions on running it, please refer to [this guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 
 ## Background
 
-With the fast growth of e-commerce, online videos, and online reading business, users have to rely on recommender systems to avoid manually browsing tremendous volume of choices.  Recommender systems understand users' interest by mining user behavior and other properties of users and products.
-
-Some well know approaches include:
-
-- User behavior-based approach.  A well-known method is collaborative filtering. The underlying assumption is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.
+The recommender system is a component of e-commerce, online videos, and online reading services.  There are several different approaches for recommender systems to learn from user behavior and product properties and to understand users' interests.
 
-- Content-based recommendation[[1](#reference)]. This approach infers feature vectors that represent products from their descriptions.  It also infers feature vectors that represent users' interests.  Then it measures the relevance of users and products by some distances between these feature vectors.
+- User behavior-based approach.  A well-known method of this approach is collaborative filtering, which assumes that if two users made similar purchases, they share common interests and would likely go on making the same decision. Some variants of collaborative filtering are user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based.
 
-- Hybrid approach[[2](#reference)]: This approach uses the content-based information to help address the cold start problem[[6](#reference)] in behavior-based approach.
+- Content-based approach[[1](#reference)].  This approach represents product properties and user interests as feature vectors of the same space so that it could measure how much a user is interested in a product by the distance between two feature vectors.
 
-Among these options, collaborative filtering might be the most studied one.  Some of its variants include user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based.
+- Hybrid approach[[2](#reference)]: This approach combines the above two so that they complement each other in addressing the data sparsity problem[[6](#reference)].
 
-This tutorial explains a deep learning based approach and how to implement it using PaddlePaddle.  We will train a model using a dataset that includes user information, movie information, and ratings.  Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.
+This tutorial explains a deep learning based hybrid approach and its implementation in PaddlePaddle.  We are going to train a model using a dataset that includes user information, movie information, and ratings.  Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.
 
 
 ## Model Overview
diff --git a/05.recommender_system/index.html b/05.recommender_system/index.html
index 8273b2ce00625cc83e142751614cbf7f650f4e32..ce07fa2638ac5d13e091d0654b46e608c6227a4d 100644
--- a/05.recommender_system/index.html
+++ b/05.recommender_system/index.html
@@ -42,26 +42,20 @@
 <div id="markdown" style='display:none'>
 # Personalized Recommendation
 
-The source code of this tutorial is in [book/recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system).
-
-For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code for this tutorial is available [here](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system).  For instructions on running it, please refer to [this guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 
 ## Background
 
-With the fast growth of e-commerce, online videos, and online reading business, users have to rely on recommender systems to avoid manually browsing tremendous volume of choices.  Recommender systems understand users' interest by mining user behavior and other properties of users and products.
-
-Some well know approaches include:
-
-- User behavior-based approach.  A well-known method is collaborative filtering. The underlying assumption is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.
+The recommender system is a component of e-commerce, online videos, and online reading services.  There are several different approaches for recommender systems to learn from user behavior and product properties and to understand users' interests.
 
-- Content-based recommendation[[1](#reference)]. This approach infers feature vectors that represent products from their descriptions.  It also infers feature vectors that represent users' interests.  Then it measures the relevance of users and products by some distances between these feature vectors.
+- User behavior-based approach.  A well-known method of this approach is collaborative filtering, which assumes that if two users have made similar purchases, they share common interests and will likely go on to make similar decisions. Some variants of collaborative filtering are user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based.
 
-- Hybrid approach[[2](#reference)]: This approach uses the content-based information to help address the cold start problem[[6](#reference)] in behavior-based approach.
+- Content-based approach[[1](#reference)].  This approach represents product properties and user interests as feature vectors in the same space, so that it can measure how much a user is interested in a product by the distance between the two feature vectors.
 
-Among these options, collaborative filtering might be the most studied one.  Some of its variants include user-based[[3](#reference)], item-based [[4](#reference)], social network based[[5](#reference)], and model-based.
+- Hybrid approach[[2](#reference)]: This approach combines the two approaches above so that they complement each other in addressing the data sparsity problem[[6](#reference)].
 
-This tutorial explains a deep learning based approach and how to implement it using PaddlePaddle.  We will train a model using a dataset that includes user information, movie information, and ratings.  Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.
+This tutorial explains a deep learning based hybrid approach and its implementation in PaddlePaddle.  We are going to train a model using a dataset that includes user information, movie information, and ratings.  Once we train the model, we will be able to get a predicted rating given a pair of user and movie IDs.
 
 
 ## Model Overview
diff --git a/05.recommender_system/train.py b/05.recommender_system/train.py
index cb549e49abca240ea3268bc443aa37568c724cbf..e1f3853f5ed0f0d2b1f66494bd98e33479bf6601 100644
--- a/05.recommender_system/train.py
+++ b/05.recommender_system/train.py
@@ -1,6 +1,9 @@
 import paddle.v2 as paddle
 import cPickle
 import copy
+import os
+
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
 
 
 def get_usr_combined_features():
@@ -67,7 +70,7 @@ def get_mov_combined_features():
 
 
 def main():
-    paddle.init(use_gpu=False)
+    paddle.init(use_gpu=with_gpu)
     usr_combined_features = get_usr_combined_features()
     mov_combined_features = get_mov_combined_features()
     inference = paddle.layer.cos_sim(
diff --git a/06.understand_sentiment/README.cn.md b/06.understand_sentiment/README.cn.md
index 695db8f41bd432403c332c77ede5cb7b21470515..37f5d4e04d4e567ba1b86c6333552a30eb8c37ac 100644
--- a/06.understand_sentiment/README.cn.md
+++ b/06.understand_sentiment/README.cn.md
@@ -164,7 +164,6 @@ def stacked_lstm_net(input_dim,
     """
     assert stacked_num % 2 == 1
 
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
     fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
     lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
     para_attr = [fc_para_attr, lstm_para_attr]
@@ -181,7 +180,7 @@ def stacked_lstm_net(input_dim,
                           act=linear,
                           bias_attr=bias_attr)
     lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+        input=fc1, act=relu, bias_attr=bias_attr)
 
     inputs = [fc1, lstm1]
     for i in range(2, stacked_num + 1):
@@ -194,8 +193,7 @@ def stacked_lstm_net(input_dim,
             input=fc,
             reverse=(i % 2) == 0,
             act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
+            bias_attr=bias_attr)
         inputs = [fc, lstm]
 
     fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max())
@@ -292,6 +290,9 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。
                 sys.stdout.write('.')
                 sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
+            with open('./params_pass_%d.tar' % event.pass_id, 'w') as f:
+                parameters.to_tar(f)
+
             result = trainer.test(reader=test_reader, feeding=feeding)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
diff --git a/06.understand_sentiment/README.md b/06.understand_sentiment/README.md
index c264a65535dfcaf286fa1dffb3008a0c9c6607ee..e5aabfe6fde72fab599512a66b9d6d584747f461 100644
--- a/06.understand_sentiment/README.md
+++ b/06.understand_sentiment/README.md
@@ -1,6 +1,6 @@
 # Sentiment Analysis
 
-The source codes of this section can be located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment). First-time users may refer to PaddlePaddle for [Installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code of this section is located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment). First-time users may refer to the PaddlePaddle [installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Background
 
@@ -15,9 +15,9 @@ In natural language processing, sentiment analysis refers to determining the emo
 
 <p align="center">Table 1 Sentiment Analysis in Movie Reviews</p>
 
-In natural language processing, sentiment analysis can be categorized as a **Text Classification problem**, i.e., to categorize a piece of text to a specific class. It involves two related tasks: text representation and classification. Before the emergence of deep learning techniques, the mainstream methods for text representation include BOW (*bag of words*) and topic modeling, while the latter contain SVM (*support vector machine*) and LR (*logistic regression*).
+In natural language processing, sentiment analysis can be categorized as a **Text Classification problem**, i.e., to categorize a piece of text to a specific class. It involves two related tasks: text representation and classification. Before the emergence of deep learning techniques, the mainstream methods for text representation included BOW (*bag of words*) and topic modeling, while those for classification included SVM (*support vector machine*) and LR (*logistic regression*).
 
-The BOW model does not capture all the information in a piece of text, as it ignores syntax and grammar and just treats the text as a set of words. For example, “this movie is extremely bad“ and “boring, dull, and empty work” describe very similar semantic meaning, yet their BOW representations have with little similarity. Furthermore, “the movie is bad“ and “the movie is not bad“ have high similarity with BOW features, but they express completely opposite semantics.
+The BOW model does not capture all the information in a piece of text, as it ignores syntax and grammar and just treats the text as a set of words. For example, “this movie is extremely bad“ and “boring, dull, and empty work” describe very similar semantic meaning, yet their BOW representations have very little similarity. Furthermore, “the movie is bad“ and “the movie is not bad“ have high similarity with BOW features, but they express completely opposite semantics.
 
 This chapter introduces a deep learning model that handles these issues in BOW. Our model embeds texts into a low-dimensional space and takes word order into consideration. It is an end-to-end framework and it has large performance improvement over traditional methods \[[1](#Reference)\].
 
@@ -28,15 +28,15 @@ The model we used in this chapter uses **Convolutional Neural Networks** (**CNNs
 
 ### Revisit to the Convolutional Neural Networks for Texts (CNN)
 
-The convolutional neural network for texts is introduced in chapter [recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), here we make a brief overview.
+The convolutional neural network for texts is introduced in chapter [recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system); here is a brief overview.
 
-CNN mainly contains convolution and pooling operation, with versatile combinations in various applications. We first apply the convolution operation: we apply the kernel in each window, extracting features. Convolving by the kernel at every window produces a feature map. Next, we apply *max pooling* over time to represent the whole sentence, which is the maximum element across the feature map. In real applications, we will apply multiple CNN kernels on the sentences. It can be implemented efficiently by concatenating the kernels together as a matrix. Also, we can use CNN kernels with different kernel size. Finally, concatenating the resulting features produces a fixed-length representation, which can be combined with a softmax to form the model for the sentiment analysis problem.
+CNN mainly contains convolution and pooling operations, with versatile combinations in various applications. We first apply the convolution operation: we apply the kernel in each window, extracting features. Convolving by the kernel at every window produces a feature map. Next, we apply *max pooling* over time to represent the whole sentence, which is the maximum element across the feature map. In real applications, we will apply multiple CNN kernels on the sentences. It can be implemented efficiently by concatenating the kernels together as a matrix. Also, we can use CNN kernels with different kernel sizes. Finally, concatenating the resulting features produces a fixed-length representation, which can be combined with a softmax to form the model for the sentiment analysis problem.
 
 For short texts, the aforementioned CNN model can achieve very high accuracy \[[1](#Reference)\]. If we want to extract more abstract representations, we may apply a deeper CNN model \[[2](#Reference),[3](#Reference)\].
 
 ### Recurrent Neural Network (RNN)
 
-RNN is an effective model for sequential data. In terms of computability, the RNN is Turing-complete \[[4](#Reference)\]. Since NLP is a classical problem on sequential data, the RNN, especially its variant LSTM\[[5](#Reference)\]), achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS-tagging, image captioning, dialog, machine translation, and so forth.
+RNN is an effective model for sequential data. In terms of computability, the RNN is Turing-complete \[[4](#Reference)\]. Since NLP is a classical problem of sequential data, the RNN, especially its variant LSTM \[[5](#Reference)\], achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS-tagging, image captioning, dialog, machine translation, and so forth.
 
 <p align="center">
 <img src="image/rnn.png" width = "60%" align="center"/><br/>
@@ -49,7 +49,7 @@ $$\vec{h_t}=f(\vec{x_t},\vec{h_{t-1}})=\sigma(W_{xh}\vec{x_t}+W_{hh}\vec{h_{h-1}
 
 where $W_{xh}$ is the weight matrix to feed into the latent layer; $W_{hh}$ is the latent-to-latent matrix; $b_h$ is the latent bias and $\sigma$ refers to the $sigmoid$ function.
 
-In NLP, words are often represented as a one-hot vectors and then mapped to an embedding. The embedded feature goes through an RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of RNN, such as a deep or stacked RNN. Finally, the last latent state may be used as a feature for sentence classification.
+In NLP, words are often represented as one-hot vectors and then mapped to an embedding. The embedded feature goes through an RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of RNN, such as a deep or stacked RNN. Finally, the last latent state may be used as a feature for sentence classification.
 
 ### Long-Short Term Memory (LSTM)
 
@@ -75,7 +75,7 @@ In the equation，$i_t, f_t, c_t, o_t$ stand for input gate, forget gate, memory
 Figure 2. LSTM at time step $t$ [7].
 </p>
 
-LSTM enhances the ability of considering long-term reliance, with the help of memory cell and gate. Similar structures are also proposed in Gated Recurrent Unit (GRU)\[[8](Reference)\] with simpler design. **The structures are still similar to RNN, though with some modifications (As shown in Figure 2), i.e., latent status depends on input as well as the latent status of last time-step, and the process goes on recurrently until all input are consumed:**
+LSTM enhances the ability to capture long-term dependencies, with the help of memory cells and gates. Similar structures are also proposed in Gated Recurrent Unit (GRU)\[[8](#Reference)\] with a simpler design. **The structures are still similar to RNN, though with some modifications (as shown in Figure 2), i.e., the latent state depends on the input as well as the latent state of the last time step, and the process goes on recurrently until all inputs are consumed:**
 
 $$ h_t=Recrurent(x_t,h_{t-1})$$
 where $Recrurent$ is a simple RNN, GRU or LSTM.
@@ -93,7 +93,7 @@ Figure 3. Stacked Bidirectional LSTM for NLP modeling.
 
 ## Dataset
 
-We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for sentiment analysis in this tutorial, which consists of 50,000 movie reviews split evenly into 25k train and 25k test sets. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10.
+We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for sentiment analysis in this tutorial, which consists of 50,000 movie reviews split evenly into a 25k train set and a 25k test set. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10.
 
 `paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens`, and `wmt14`, etc. There's no need for us to manually download and preprocess IMDB.
 
@@ -136,7 +136,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
                              act=paddle.activation.Softmax())
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 ```
 
 1. Define input data and its dimension
@@ -163,9 +163,9 @@ def stacked_lstm_net(input_dim,
                      stacked_num=3):
     """
     A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
+    This network uses a bi-directional recurrent network,
+    consisting of three LSTM layers. This configuration is
+    motivated by the following paper, but uses fewer layers.
         http://www.aclweb.org/anthology/P15-1109
     input_dim: here is word dictionary dimension.
     class_dim: number of categories.
@@ -175,7 +175,6 @@ def stacked_lstm_net(input_dim,
     """
     assert stacked_num % 2 == 1
 
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
     fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
     lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
     para_attr = [fc_para_attr, lstm_para_attr]
@@ -192,7 +191,7 @@ def stacked_lstm_net(input_dim,
                           act=linear,
                           bias_attr=bias_attr)
     lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+        input=fc1, act=relu, bias_attr=bias_attr)
 
     inputs = [fc1, lstm1]
     for i in range(2, stacked_num + 1):
@@ -205,8 +204,7 @@ def stacked_lstm_net(input_dim,
             input=fc,
             reverse=(i % 2) == 0,
             act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
+            bias_attr=bias_attr)
         inputs = [fc, lstm]
 
     fc_last = paddle.layer.pooling(
@@ -221,7 +219,7 @@ def stacked_lstm_net(input_dim,
 
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 ```
 
 1. Define input data and its dimension
@@ -245,9 +243,9 @@ dict_dim = len(word_dict)
 class_dim = 2
 
 # option 1
-cost = convolution_net(dict_dim, class_dim=class_dim)
+[cost, output] = convolution_net(dict_dim, class_dim=class_dim)
 # option 2
-# cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+# [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
 ```
 
 ## Model Training
@@ -311,6 +309,9 @@ def event_handler(event):
             sys.stdout.write('.')
             sys.stdout.flush()
     if isinstance(event, paddle.event.EndPass):
+        with open('./params_pass_%d.tar' % event.pass_id, 'w') as f:
+                parameters.to_tar(f)
+
         result = trainer.test(reader=test_reader, feeding=feeding)
         print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
@@ -333,7 +334,7 @@ In this chapter, we use sentiment analysis as an example to introduce applying d
 ## Reference
 
 1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
-2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
+2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modeling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
 3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016.
 4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449.
 5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780.
diff --git a/06.understand_sentiment/index.cn.html b/06.understand_sentiment/index.cn.html
index 57639035a1b0319a4ff98a6bf5e89186c472b583..2b1b05466ef67729a6d64a4a8795c2c27178e715 100644
--- a/06.understand_sentiment/index.cn.html
+++ b/06.understand_sentiment/index.cn.html
@@ -206,7 +206,6 @@ def stacked_lstm_net(input_dim,
     """
     assert stacked_num % 2 == 1
 
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
     fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
     lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
     para_attr = [fc_para_attr, lstm_para_attr]
@@ -223,7 +222,7 @@ def stacked_lstm_net(input_dim,
                           act=linear,
                           bias_attr=bias_attr)
     lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+        input=fc1, act=relu, bias_attr=bias_attr)
 
     inputs = [fc1, lstm1]
     for i in range(2, stacked_num + 1):
@@ -236,8 +235,7 @@ def stacked_lstm_net(input_dim,
             input=fc,
             reverse=(i % 2) == 0,
             act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
+            bias_attr=bias_attr)
         inputs = [fc, lstm]
 
     fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=paddle.pooling.Max())
@@ -334,6 +332,9 @@ Paddle中提供了一系列优化算法的API，这里使用Adam优化算法。
                 sys.stdout.write('.')
                 sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
+            with open('./params_pass_%d.tar' % event.pass_id, 'w') as f:
+                parameters.to_tar(f)
+
             result = trainer.test(reader=test_reader, feeding=feeding)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
diff --git a/06.understand_sentiment/index.html b/06.understand_sentiment/index.html
index ffcd78552d434f996b82c59b176bde3d2b4d7452..17df65a29e737677d313395a92cac84133ddf02c 100644
--- a/06.understand_sentiment/index.html
+++ b/06.understand_sentiment/index.html
@@ -42,7 +42,7 @@
 <div id="markdown" style='display:none'>
 # Sentiment Analysis
 
-The source codes of this section can be located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment). First-time users may refer to PaddlePaddle for [Installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
+The source code of this section is located at [book/understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment). First-time users may refer to the PaddlePaddle [installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
 ## Background
 
@@ -57,9 +57,9 @@ In natural language processing, sentiment analysis refers to determining the emo
 
 <p align="center">Table 1 Sentiment Analysis in Movie Reviews</p>
 
-In natural language processing, sentiment analysis can be categorized as a **Text Classification problem**, i.e., to categorize a piece of text to a specific class. It involves two related tasks: text representation and classification. Before the emergence of deep learning techniques, the mainstream methods for text representation include BOW (*bag of words*) and topic modeling, while the latter contain SVM (*support vector machine*) and LR (*logistic regression*).
+In natural language processing, sentiment analysis can be categorized as a **Text Classification problem**, i.e., to categorize a piece of text to a specific class. It involves two related tasks: text representation and classification. Before the emergence of deep learning techniques, the mainstream methods for text representation included BOW (*bag of words*) and topic modeling, while those for classification included SVM (*support vector machine*) and LR (*logistic regression*).
 
-The BOW model does not capture all the information in a piece of text, as it ignores syntax and grammar and just treats the text as a set of words. For example, “this movie is extremely bad“ and “boring, dull, and empty work” describe very similar semantic meaning, yet their BOW representations have with little similarity. Furthermore, “the movie is bad“ and “the movie is not bad“ have high similarity with BOW features, but they express completely opposite semantics.
+The BOW model does not capture all the information in a piece of text, as it ignores syntax and grammar and just treats the text as a set of words. For example, “this movie is extremely bad“ and “boring, dull, and empty work” describe very similar semantic meaning, yet their BOW representations have very little similarity. Furthermore, “the movie is bad“ and “the movie is not bad“ have high similarity with BOW features, but they express completely opposite semantics.
 
 This chapter introduces a deep learning model that handles these issues in BOW. Our model embeds texts into a low-dimensional space and takes word order into consideration. It is an end-to-end framework and it has large performance improvement over traditional methods \[[1](#Reference)\].
 
@@ -70,15 +70,15 @@ The model we used in this chapter uses **Convolutional Neural Networks** (**CNNs
 
 ### Revisit to the Convolutional Neural Networks for Texts (CNN)
 
-The convolutional neural network for texts is introduced in chapter [recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system), here we make a brief overview.
+The convolutional neural network for texts is introduced in chapter [recommender_system](https://github.com/PaddlePaddle/book/tree/develop/05.recommender_system); here is a brief overview.
 
-CNN mainly contains convolution and pooling operation, with versatile combinations in various applications. We first apply the convolution operation: we apply the kernel in each window, extracting features. Convolving by the kernel at every window produces a feature map. Next, we apply *max pooling* over time to represent the whole sentence, which is the maximum element across the feature map. In real applications, we will apply multiple CNN kernels on the sentences. It can be implemented efficiently by concatenating the kernels together as a matrix. Also, we can use CNN kernels with different kernel size. Finally, concatenating the resulting features produces a fixed-length representation, which can be combined with a softmax to form the model for the sentiment analysis problem.
+CNN mainly contains convolution and pooling operations, with versatile combinations in various applications. We first apply the convolution operation: we apply the kernel in each window, extracting features. Convolving by the kernel at every window produces a feature map. Next, we apply *max pooling* over time to represent the whole sentence, which is the maximum element across the feature map. In real applications, we will apply multiple CNN kernels on the sentences. It can be implemented efficiently by concatenating the kernels together as a matrix. Also, we can use CNN kernels with different kernel sizes. Finally, concatenating the resulting features produces a fixed-length representation, which can be combined with a softmax to form the model for the sentiment analysis problem.
 
 For short texts, the aforementioned CNN model can achieve very high accuracy \[[1](#Reference)\]. If we want to extract more abstract representations, we may apply a deeper CNN model \[[2](#Reference),[3](#Reference)\].
 
 ### Recurrent Neural Network (RNN)
 
-RNN is an effective model for sequential data. In terms of computability, the RNN is Turing-complete \[[4](#Reference)\]. Since NLP is a classical problem on sequential data, the RNN, especially its variant LSTM\[[5](#Reference)\]), achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS-tagging, image captioning, dialog, machine translation, and so forth.
+RNN is an effective model for sequential data. In terms of computability, the RNN is Turing-complete \[[4](#Reference)\]. Since NLP is a classical problem of sequential data, the RNN, especially its variant LSTM \[[5](#Reference)\], achieves state-of-the-art performance on various NLP tasks, such as language modeling, syntax parsing, POS-tagging, image captioning, dialog, machine translation, and so forth.
 
 <p align="center">
 <img src="image/rnn.png" width = "60%" align="center"/><br/>
@@ -91,7 +91,7 @@ $$\vec{h_t}=f(\vec{x_t},\vec{h_{t-1}})=\sigma(W_{xh}\vec{x_t}+W_{hh}\vec{h_{h-1}
 
 where $W_{xh}$ is the weight matrix to feed into the latent layer; $W_{hh}$ is the latent-to-latent matrix; $b_h$ is the latent bias and $\sigma$ refers to the $sigmoid$ function.
 
-In NLP, words are often represented as a one-hot vectors and then mapped to an embedding. The embedded feature goes through an RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of RNN, such as a deep or stacked RNN. Finally, the last latent state may be used as a feature for sentence classification.
+In NLP, words are often represented as one-hot vectors and then mapped to an embedding. The embedded feature goes through an RNN as input $x_t$ at every time step. Moreover, we can add other layers on top of RNN, such as a deep or stacked RNN. Finally, the last latent state may be used as a feature for sentence classification.
 
 ### Long-Short Term Memory (LSTM)
 
@@ -117,7 +117,7 @@ In the equation，$i_t, f_t, c_t, o_t$ stand for input gate, forget gate, memory
 Figure 2. LSTM at time step $t$ [7].
 </p>
 
-LSTM enhances the ability of considering long-term reliance, with the help of memory cell and gate. Similar structures are also proposed in Gated Recurrent Unit (GRU)\[[8](Reference)\] with simpler design. **The structures are still similar to RNN, though with some modifications (As shown in Figure 2), i.e., latent status depends on input as well as the latent status of last time-step, and the process goes on recurrently until all input are consumed:**
+LSTM enhances the ability to capture long-term dependencies, with the help of memory cells and gates. Similar structures are also proposed in Gated Recurrent Unit (GRU)\[[8](#Reference)\] with a simpler design. **The structures are still similar to RNN, though with some modifications (as shown in Figure 2), i.e., the latent state depends on the input as well as the latent state of the last time step, and the process goes on recurrently until all inputs are consumed:**
 
 $$ h_t=Recrurent(x_t,h_{t-1})$$
 where $Recrurent$ is a simple RNN, GRU or LSTM.
@@ -135,7 +135,7 @@ Figure 3. Stacked Bidirectional LSTM for NLP modeling.
 
 ## Dataset
 
-We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for sentiment analysis in this tutorial, which consists of 50,000 movie reviews split evenly into 25k train and 25k test sets. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10.
+We use [IMDB](http://ai.stanford.edu/%7Eamaas/data/sentiment/) dataset for sentiment analysis in this tutorial, which consists of 50,000 movie reviews split evenly into a 25k train set and a 25k test set. In the labeled train/test sets, a negative review has a score <= 4 out of 10, and a positive review has a score >= 7 out of 10.
 
 `paddle.datasets` package encapsulates multiple public datasets, including `cifar`, `imdb`, `mnist`, `moivelens`, and `wmt14`, etc. There's no need for us to manually download and preprocess IMDB.
 
@@ -178,7 +178,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
                              act=paddle.activation.Softmax())
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 ```
 
 1. Define input data and its dimension
@@ -205,9 +205,9 @@ def stacked_lstm_net(input_dim,
                      stacked_num=3):
     """
     A Wrapper for sentiment classification task.
-    This network uses bi-directional recurrent network,
-    consisting three LSTM layers. This configure is referred to
-    the paper as following url, but use fewer layrs.
+    This network uses a bi-directional recurrent network,
+    consisting of three LSTM layers. This configuration is
+    motivated by the following paper, but uses fewer layers.
         http://www.aclweb.org/anthology/P15-1109
     input_dim: here is word dictionary dimension.
     class_dim: number of categories.
@@ -217,7 +217,6 @@ def stacked_lstm_net(input_dim,
     """
     assert stacked_num % 2 == 1
 
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
     fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
     lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
     para_attr = [fc_para_attr, lstm_para_attr]
@@ -234,7 +233,7 @@ def stacked_lstm_net(input_dim,
                           act=linear,
                           bias_attr=bias_attr)
     lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+        input=fc1, act=relu, bias_attr=bias_attr)
 
     inputs = [fc1, lstm1]
     for i in range(2, stacked_num + 1):
@@ -247,8 +246,7 @@ def stacked_lstm_net(input_dim,
             input=fc,
             reverse=(i % 2) == 0,
             act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
+            bias_attr=bias_attr)
         inputs = [fc, lstm]
 
     fc_last = paddle.layer.pooling(
@@ -263,7 +261,7 @@ def stacked_lstm_net(input_dim,
 
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 ```
 
 1. Define input data and its dimension
@@ -287,9 +285,9 @@ dict_dim = len(word_dict)
 class_dim = 2
 
 # option 1
-cost = convolution_net(dict_dim, class_dim=class_dim)
+[cost, output] = convolution_net(dict_dim, class_dim=class_dim)
 # option 2
-# cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+# [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
 ```
 
 ## Model Training
@@ -353,6 +351,9 @@ def event_handler(event):
             sys.stdout.write('.')
             sys.stdout.flush()
     if isinstance(event, paddle.event.EndPass):
+        with open('./params_pass_%d.tar' % event.pass_id, 'w') as f:
+                parameters.to_tar(f)
+
         result = trainer.test(reader=test_reader, feeding=feeding)
         print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 ```
@@ -375,7 +376,7 @@ In this chapter, we use sentiment analysis as an example to introduce applying d
 ## Reference
 
 1. Kim Y. [Convolutional neural networks for sentence classification](http://arxiv.org/pdf/1408.5882)[J]. arXiv preprint arXiv:1408.5882, 2014.
-2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modelling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
+2. Kalchbrenner N, Grefenstette E, Blunsom P. [A convolutional neural network for modeling sentences](http://arxiv.org/pdf/1404.2188.pdf?utm_medium=App.net&utm_source=PourOver)[J]. arXiv preprint arXiv:1404.2188, 2014.
 3. Yann N. Dauphin, et al. [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083v1.pdf)[J] arXiv preprint arXiv:1612.08083, 2016.
 4. Siegelmann H T, Sontag E D. [On the computational power of neural nets](http://research.cs.queensu.ca/home/akl/cisc879/papers/SELECTED_PAPERS_FROM_VARIOUS_SOURCES/05070215382317071.pdf)[C]//Proceedings of the fifth annual workshop on Computational learning theory. ACM, 1992: 440-449.
 5. Hochreiter S, Schmidhuber J. [Long short-term memory](http://web.eecs.utk.edu/~itamar/courses/ECE-692/Bobby_paper1.pdf)[J]. Neural computation, 1997, 9(8): 1735-1780.
diff --git a/06.understand_sentiment/train.py b/06.understand_sentiment/train.py
index 7878f00b6401ed0e6a0863d2cec129b6e51b163d..b55506f07d28711ec6ef70eef177745f2d26c01d 100644
--- a/06.understand_sentiment/train.py
+++ b/06.understand_sentiment/train.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
+import sys, os
 import paddle.v2 as paddle
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 
 def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
     data = paddle.layer.data("word",
@@ -28,7 +30,7 @@ def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
         input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax())
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 
 
 def stacked_lstm_net(input_dim,
@@ -51,7 +53,6 @@ def stacked_lstm_net(input_dim,
     """
     assert stacked_num % 2 == 1
 
-    layer_attr = paddle.attr.Extra(drop_rate=0.5)
     fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
     lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
     para_attr = [fc_para_attr, lstm_para_attr]
@@ -65,8 +66,7 @@ def stacked_lstm_net(input_dim,
 
     fc1 = paddle.layer.fc(
         input=emb, size=hid_dim, act=linear, bias_attr=bias_attr)
-    lstm1 = paddle.layer.lstmemory(
-        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+    lstm1 = paddle.layer.lstmemory(input=fc1, act=relu, bias_attr=bias_attr)
 
     inputs = [fc1, lstm1]
     for i in range(2, stacked_num + 1):
@@ -77,11 +77,7 @@ def stacked_lstm_net(input_dim,
             param_attr=para_attr,
             bias_attr=bias_attr)
         lstm = paddle.layer.lstmemory(
-            input=fc,
-            reverse=(i % 2) == 0,
-            act=relu,
-            bias_attr=bias_attr,
-            layer_attr=layer_attr)
+            input=fc, reverse=(i % 2) == 0, act=relu, bias_attr=bias_attr)
         inputs = [fc, lstm]
 
     fc_last = paddle.layer.pooling(
@@ -97,12 +93,12 @@ def stacked_lstm_net(input_dim,
 
     lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
     cost = paddle.layer.classification_cost(input=output, label=lbl)
-    return cost
+    return cost, output
 
 
 if __name__ == '__main__':
     # init
-    paddle.init(use_gpu=False)
+    paddle.init(use_gpu=with_gpu)
 
     #data
     print 'load dictionary...'
@@ -121,8 +117,8 @@ if __name__ == '__main__':
     # network config
     # Please choose the way to build the network
     # by uncommenting the corresponding line.
-    cost = convolution_net(dict_dim, class_dim=class_dim)
-    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+    [cost, output] = convolution_net(dict_dim, class_dim=class_dim)
+    # [cost, output] = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
 
     # create parameters
     parameters = paddle.parameters.create(cost)
@@ -143,15 +139,22 @@ if __name__ == '__main__':
                 sys.stdout.write('.')
                 sys.stdout.flush()
         if isinstance(event, paddle.event.EndPass):
+            with open('./params_pass_%d.tar' % event.pass_id, 'w') as f:
+                parameters.to_tar(f)
+
             result = trainer.test(reader=test_reader, feeding=feeding)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 
     # create trainer
     trainer = paddle.trainer.SGD(
         cost=cost, parameters=parameters, update_equation=adam_optimizer)
+    # Save the inference topology to protobuf.
+    inference_topology = paddle.topology.Topology(layers=output)
+    with open("./inference_topology.pkl", 'wb') as f:
+        inference_topology.serialize_for_inference(f)
 
     trainer.train(
         reader=train_reader,
         event_handler=event_handler,
         feeding=feeding,
-        num_passes=2)
+        num_passes=20)
diff --git a/07.label_semantic_roles/README.md b/07.label_semantic_roles/README.md
index 192ab794262a44155cf2a99db8050d5bdb78e262..608afd5aac640472f075d67c63de4bd4798c8f65 100644
--- a/07.label_semantic_roles/README.md
+++ b/07.label_semantic_roles/README.md
@@ -1,6 +1,6 @@
 # Semantic Role Labeling
 
-The source code of this chapter is live on [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
+The source code of this chapter is located at [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
 
 For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
@@ -14,7 +14,7 @@ In the following example of a Chinese sentence, "to encounter" is the predicate
 
 $$\mbox{[小明 Ming]}_{\mbox{Agent}}\mbox{[昨天 yesterday]}_{\mbox{Time}}\mbox{[晚上 evening]}_\mbox{Time}\mbox{在[公园 a park]}_{\mbox{Location}}\mbox{[遇到 to encounter]}_{\mbox{Predicate}}\mbox{了[小红 Hong]}_{\mbox{Patient}}\mbox{。}$$
 
-Instead of analyzing the semantic information, **Semantic Role Labeling** (**SRL**) identifies the relation between the predicate and the other constituents surrounding it. The predicate-argument structures are labeled as specific semantic roles. A wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deepQA*. Research usually assumes a predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
+Instead of analyzing the semantic information, **Semantic Role Labeling** (**SRL**) identifies the relationship between the predicate and the other constituents surrounding it. The predicate-argument structures are labeled as specific semantic roles. SRL is an important intermediate step in a wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deepQA*. Research usually assumes a predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
 
 Conventional SRL systems mostly build on top of syntactic analysis, usually consisting of five steps:
 
@@ -31,7 +31,7 @@ Fig 1. Syntax tree
 </div>
 
 
-However, a complete syntactic analysis requires identifying the relation among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis* a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunk). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduces SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using **BIO representation**. For syntactic chunks forming role A, its first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out receive the tag O.
+However, a complete syntactic analysis requires identifying the relationship among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis* a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunk). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduces SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using **BIO representation**. For syntactic chunks forming role A, its first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out will receive the tag O.
 
 The BIO representation of above example is shown in Fig.1.
 
@@ -50,7 +50,7 @@ In this tutorial, our SRL system is built as an end-to-end system via a neural n
 
 ## Model
 
-**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs', LSTMs aim model long-term dependency in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
+**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs, LSTMs aim to model long-term dependencies in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
 
 ### Stacked Recurrent Neural Network
 
@@ -77,9 +77,9 @@ Fig 3. Stacked Recurrent Neural Networks
 
 ### Bidirectional Recurrent Neural Network
 
-While LSTMs can summarize the history -- all the previous input seen up until now -- they can not see the future. Because most NLP (natural language processing) tasks provide the entirety of sentences, sequential learning can benefit from having the future encoded as well as the history.
+While LSTMs can summarize the history, they can not see the future. Because most NLP (natural language processing) tasks provide the entirety of sentences, sequential learning can benefit from having the future encoded as well as the history.
 
-To address, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in reversed direction with regards to its immediate lower LSTM layer, i.e., deep LSTM layers take turns to train on input sequences from left-to-right and right-to-left. Therefore, LSTM layers at time-step $t$ can see both histories and the future, starting from the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
+To address this, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in reversed direction with regards to its immediate lower LSTM layer, i.e., deep LSTM layers take turns to train on input sequences from left-to-right and right-to-left. Therefore, LSTM layers at time-step $t$ can see both histories and the future, starting from the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
 
 
 <p align="center">  
@@ -87,16 +87,16 @@ To address, we can design a bidirectional recurrent neural network by making a m
 Fig 4. Bidirectional LSTMs
 </p>
 
-Note that, this bidirectional RNNs is different with the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNNs in the following tasks [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md)
+Note that this bidirectional RNN is different from the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNN in the following chapter [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md).
 
 ### Conditional Random Field (CRF)
 
-Typically, a neural network's lower layers learn representations while its very top layer learns the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes as input the representations provided by the last LSTM layer.
+Typically, a neural network's lower layers learn representations while its very top layer accomplishes the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes representations provided by the last LSTM layer as input.
 
 
 The CRF is an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. In essence, CRFs learn the conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ are sequences of input and $Y = (y_1, y_2, ... , y_n)$ are label sequences; to decode, simply search through $Y$ for a sequence that maximizes the conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$。
 
-Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graph model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig.5.
+Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graph model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig.5.
 
 <p align="center">  
 <img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
@@ -129,7 +129,7 @@ Given predicates and a sentence, SRL tasks aim to identify arguments of the give
  - expand input 1 into a sequence of the same length with input 2's sentence, using one-hot representation;
 2. Convert the one-hot sequences from step 1 to vector sequences via a word embedding's lookup table;
 3. Learn the representation of input sequences by taking vector sequences from step 2 as inputs;
-4. Take the representation from step 3 as input, label sequence as supervisory signal, and realize sequence tagging tasks.
+4. Take the representation from step 3 as input, label sequence as a supervisory signal, and realize sequence tagging tasks.
 
 Here, we propose some improvements by introducing two simple but effective features:
 
@@ -140,11 +140,11 @@ Here, we propose some improvements by introducing two simple but effective featu
 After these modifications, the model is as follows, as illustrated in Figure 6:
 
 1. Construct inputs
- - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after predicate. Input 4: region mark sequence, where an entry is 1 if word is located in the predicate context region, 0 otherwise.
+ - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after predicate. Input 4: region mark sequence, where an entry is 1 if the word is located in the predicate context region, 0 otherwise.
  - expand input 2~3 into sequences with the same length with input 1
 2. Convert input 1~4 to vector sequences via word embedding lookup tables; While input 1 and 3 shares the same lookup table, input 2 and 4 have separate lookup tables.
 3. Take the four vector sequences from step 2 as inputs to bidirectional LSTMs; Train the LSTMs to update representations.
-4. Take the representation from step 3 as input to CRF, label sequence as supervisory signal, and complete sequence tagging tasks.
+4. Take the representation from step 3 as input to CRF, label sequence as a supervisory signal, and complete sequence tagging tasks.
 
 
 <div  align="center">  
@@ -161,8 +161,8 @@ The original data includes a variety of information such as POS tagging, naming
 ```text
 conll05st-release/
 └── test.wsj
-    ├── props  # 标注结果
-    └── words  # 输入文本序列
+    ├── props  # label results
+    └── words  # text sequence
 ```
 
 The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](# references)\]. The labeling of the PropBank is different from the labeling methods mentioned before, but shares with it the same underlying principle. For descriptions of the labeling, please refer to the paper \[[9](#references)\].
@@ -202,7 +202,7 @@ In addition to the data, we provide following resources:
 | word_dict | dictionary of input sentences, total 44068 words |
 | label_dict | dictionary of labels, total 106 labels |
 | predicate_dict | predicate dictionary, total 3162 predicates |
-| emb | a pre-trained word vector lookup table, 32-dimentional |
+| emb | a pre-trained word vector lookup table, 32-dimensional |
 
 We trained a language model on the English Wikipedia to get a word vector lookup table used to initialize the SRL model. While training the SRL model, the word vector lookup table is no longer updated. To learn more about the language model and the word vector lookup table, please refer to the tutorial [word vector](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.md). There are 995,000,000 tokens in the training corpus, and the dictionary size is 4900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not in the 4900,000 words, and we see them all as unknown words, represented by `<unk>`.
 
@@ -484,7 +484,7 @@ trainer.train(
 
 ### Application
 
-Aftern training is done, we need to select an optimal model based one performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs does not include the ground truth label.
+When training is completed, we need to select an optimal model based on a performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs do not include the ground truth label.
 
 ```python
 predict = paddle.layer.crf_decoding(
diff --git a/07.label_semantic_roles/index.html b/07.label_semantic_roles/index.html
index 4655c00c167d1a2be250d0ea6fd80161840cd84e..5398dedb6abe20613855b92b1d79a1af30fc91a0 100644
--- a/07.label_semantic_roles/index.html
+++ b/07.label_semantic_roles/index.html
@@ -42,7 +42,7 @@
 <div id="markdown" style='display:none'>
 # Semantic Role Labeling
 
-The source code of this chapter is live on [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
+The source code of this chapter is located at [book/label_semantic_roles](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
 
 For instructions on getting started with PaddlePaddle, see [PaddlePaddle installation guide](https://github.com/PaddlePaddle/book/blob/develop/README.md#running-the-book).
 
@@ -56,7 +56,7 @@ In the following example of a Chinese sentence, "to encounter" is the predicate
 
 $$\mbox{[小明 Ming]}_{\mbox{Agent}}\mbox{[昨天 yesterday]}_{\mbox{Time}}\mbox{[晚上 evening]}_\mbox{Time}\mbox{在[公园 a park]}_{\mbox{Location}}\mbox{[遇到 to encounter]}_{\mbox{Predicate}}\mbox{了[小红 Hong]}_{\mbox{Patient}}\mbox{。}$$
 
-Instead of analyzing the semantic information, **Semantic Role Labeling** (**SRL**) identifies the relation between the predicate and the other constituents surrounding it. The predicate-argument structures are labeled as specific semantic roles. A wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deepQA*. Research usually assumes a predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
+Instead of analyzing the semantic information, **Semantic Role Labeling** (**SRL**) identifies the relationship between the predicate and the other constituents surrounding it. The predicate-argument structures are labeled as specific semantic roles. SRL is an important intermediate step in a wide range of natural language understanding tasks, including *information extraction*, *discourse analysis*, and *deepQA*. Research usually assumes a predicate of a sentence to be specified; the only task is to identify its arguments and their semantic roles.
 
 Conventional SRL systems mostly build on top of syntactic analysis, usually consisting of five steps:
 
@@ -73,7 +73,7 @@ Fig 1. Syntax tree
 </div>
 
 
-However, a complete syntactic analysis requires identifying the relation among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis* a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunk). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduces SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using **BIO representation**. For syntactic chunks forming role A, its first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out receive the tag O.
+However, a complete syntactic analysis requires identifying the relationship among all constituents. Thus, the accuracy of SRL is sensitive to the preciseness of the syntactic analysis, making SRL challenging. To reduce its complexity and obtain some information on the syntactic structures, we often use *shallow syntactic analysis* a.k.a. partial parsing or chunking. Unlike complete syntactic analysis, which requires the construction of the complete parsing tree, *Shallow Syntactic Analysis* only requires identifying some independent constituents with relatively simple structures, such as verb phrases (chunk). To avoid difficulties in constructing a syntax tree with high accuracy, some work\[[1](#Reference)\] proposed semantic chunking-based SRL methods, which reduces SRL into a sequence tagging problem. Sequence tagging tasks classify syntactic chunks using **BIO representation**. For syntactic chunks forming role A, its first chunk receives the B-A tag (Begin) and the remaining ones receive the tag I-A (Inside); in the end, the chunks left out will receive the tag O.
 
 The BIO representation of above example is shown in Fig.1.
 
@@ -92,7 +92,7 @@ In this tutorial, our SRL system is built as an end-to-end system via a neural n
 
 ## Model
 
-**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs', LSTMs aim model long-term dependency in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
+**Recurrent Neural Networks** (*RNN*) are important tools for sequence modeling and have been successfully used in some natural language processing tasks. Unlike feed-forward neural networks, RNNs can model the dependencies between elements of sequences. As a variant of RNNs, LSTMs aim to model long-term dependencies in long sequences. We have introduced this in [understand_sentiment](https://github.com/PaddlePaddle/book/tree/develop/05.understand_sentiment). In this chapter, we continue to use LSTMs to solve SRL problems.
 
 ### Stacked Recurrent Neural Network
 
@@ -119,9 +119,9 @@ Fig 3. Stacked Recurrent Neural Networks
 
 ### Bidirectional Recurrent Neural Network
 
-While LSTMs can summarize the history -- all the previous input seen up until now -- they can not see the future. Because most NLP (natural language processing) tasks provide the entirety of sentences, sequential learning can benefit from having the future encoded as well as the history.
+While LSTMs can summarize the history, they can not see the future. Because most NLP (natural language processing) tasks provide the entirety of sentences, sequential learning can benefit from having the future encoded as well as the history.
 
-To address, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in reversed direction with regards to its immediate lower LSTM layer, i.e., deep LSTM layers take turns to train on input sequences from left-to-right and right-to-left. Therefore, LSTM layers at time-step $t$ can see both histories and the future, starting from the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
+To address this, we can design a bidirectional recurrent neural network by making a minor modification. A higher LSTM layer can process the sequence in reversed direction with regards to its immediate lower LSTM layer, i.e., deep LSTM layers take turns to train on input sequences from left-to-right and right-to-left. Therefore, LSTM layers at time-step $t$ can see both histories and the future, starting from the second layer. Fig. 4 illustrates the bidirectional recurrent neural networks.
 
 
 <p align="center">  
@@ -129,16 +129,16 @@ To address, we can design a bidirectional recurrent neural network by making a m
 Fig 4. Bidirectional LSTMs
 </p>
 
-Note that, this bidirectional RNNs is different with the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNNs in the following tasks [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md)
+Note that this bidirectional RNN is different from the one proposed by Bengio et al. in machine translation tasks \[[3](#Reference), [4](#Reference)\]. We will introduce another bidirectional RNN in the following chapter [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md).
 
 ### Conditional Random Field (CRF)
 
-Typically, a neural network's lower layers learn representations while its very top layer learns the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes as input the representations provided by the last LSTM layer.
+Typically, a neural network's lower layers learn representations while its very top layer accomplishes the final task. These principles can guide our problem-solving approaches. In SRL tasks, a **Conditional Random Field** (*CRF*) is built on top of the network in order to perform the final prediction to tag sequences. It takes representations provided by the last LSTM layer as input.
 
 
 The CRF is an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. In essence, CRFs learn the conditional probability $P(Y|X)$, where $X = (x_1, x_2, ... , x_n)$ are sequences of input and $Y = (y_1, y_2, ... , y_n)$ are label sequences; to decode, simply search through $Y$ for a sequence that maximizes the conditional probability $P(Y|X)$, i.e., $Y^* = \mbox{arg max}_{Y} P(Y | X)$。
 
-Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graph model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig.5.
+Sequence tagging tasks do not assume a lot of conditional independence, because they are only concerned with the input and the output being linear sequences. Thus, the graph model of sequence tagging tasks is usually a simple chain or line, which results in a **Linear-Chain Conditional Random Field**, shown in Fig.5.
 
 <p align="center">  
 <img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
@@ -171,7 +171,7 @@ Given predicates and a sentence, SRL tasks aim to identify arguments of the give
  - expand input 1 into a sequence of the same length with input 2's sentence, using one-hot representation;
 2. Convert the one-hot sequences from step 1 to vector sequences via a word embedding's lookup table;
 3. Learn the representation of input sequences by taking vector sequences from step 2 as inputs;
-4. Take the representation from step 3 as input, label sequence as supervisory signal, and realize sequence tagging tasks.
+4. Take the representation from step 3 as input, label sequence as a supervisory signal, and realize sequence tagging tasks.
 
 Here, we propose some improvements by introducing two simple but effective features:
 
@@ -182,11 +182,11 @@ Here, we propose some improvements by introducing two simple but effective featu
 After these modifications, the model is as follows, as illustrated in Figure 6:
 
 1. Construct inputs
- - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after predicate. Input 4: region mark sequence, where an entry is 1 if word is located in the predicate context region, 0 otherwise.
+ - Input 1: word sequence. Input 2: predicate. Input 3: predicate context, extract $n$ words before and after predicate. Input 4: region mark sequence, where an entry is 1 if the word is located in the predicate context region, 0 otherwise.
  - expand input 2~3 into sequences with the same length with input 1
 2. Convert input 1~4 to vector sequences via word embedding lookup tables; While input 1 and 3 shares the same lookup table, input 2 and 4 have separate lookup tables.
 3. Take the four vector sequences from step 2 as inputs to bidirectional LSTMs; Train the LSTMs to update representations.
-4. Take the representation from step 3 as input to CRF, label sequence as supervisory signal, and complete sequence tagging tasks.
+4. Take the representation from step 3 as input to CRF, label sequence as a supervisory signal, and complete sequence tagging tasks.
 
 
 <div  align="center">  
@@ -203,8 +203,8 @@ The original data includes a variety of information such as POS tagging, naming
 ```text
 conll05st-release/
 └── test.wsj
-    ├── props  # 标注结果
-    └── words  # 输入文本序列
+    ├── props  # label results
+    └── words  # text sequence
 ```
 
 The annotation information is derived from the results of Penn TreeBank\[[7](#references)\] and PropBank \[[8](# references)\]. The labeling of the PropBank is different from the labeling methods mentioned before, but shares with it the same underlying principle. For descriptions of the labeling, please refer to the paper \[[9](#references)\].
@@ -244,7 +244,7 @@ In addition to the data, we provide following resources:
 | word_dict | dictionary of input sentences, total 44068 words |
 | label_dict | dictionary of labels, total 106 labels |
 | predicate_dict | predicate dictionary, total 3162 predicates |
-| emb | a pre-trained word vector lookup table, 32-dimentional |
+| emb | a pre-trained word vector lookup table, 32-dimensional |
 
 We trained a language model on the English Wikipedia to get a word vector lookup table used to initialize the SRL model. While training the SRL model, the word vector lookup table is no longer updated. To learn more about the language model and the word vector lookup table, please refer to the tutorial [word vector](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.md). There are 995,000,000 tokens in the training corpus, and the dictionary size is 4900,000 words. In the CoNLL 2005 training corpus, 5% of the words are not in the 4900,000 words, and we see them all as unknown words, represented by `<unk>`.
 
@@ -526,7 +526,7 @@ trainer.train(
 
 ### Application
 
-Aftern training is done, we need to select an optimal model based one performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs does not include the ground truth label.
+When training is completed, we need to select an optimal model based on a performance index to do inference. In this task, one can simply select the model with the least number of marks on the test set. The `paddle.layer.crf_decoding` layer is used in the inference, but its inputs do not include the ground truth label.
 
 ```python
 predict = paddle.layer.crf_decoding(
diff --git a/07.label_semantic_roles/train.py b/07.label_semantic_roles/train.py
index 94d751dffa5edd620793e8818f19e98477212cdc..ba86abcd1dff2d90e033c857fd73f4d7e13b041e 100644
--- a/07.label_semantic_roles/train.py
+++ b/07.label_semantic_roles/train.py
@@ -1,9 +1,11 @@
-import math
+import math, os
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
 import paddle.v2.evaluator as evaluator
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
 label_dict_len = len(label_dict)
@@ -118,7 +120,7 @@ def load_parameter(file_name, h, w):
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
 
     # define network topology
     feature_out = db_lstm()
@@ -158,6 +160,9 @@ def main():
     reader = paddle.batch(
         paddle.reader.shuffle(conll05.test(), buf_size=8192), batch_size=10)
 
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(conll05.test(), buf_size=8192), batch_size=10)
+
     feeding = {
         'word_data': 0,
         'ctx_n2_data': 1,
@@ -176,7 +181,7 @@ def main():
                 print "Pass %d, Batch %d, Cost %f, %s" % (
                     event.pass_id, event.batch_id, event.cost, event.metrics)
             if event.batch_id % 1000 == 0:
-                result = trainer.test(reader=reader, feeding=feeding)
+                result = trainer.test(reader=test_reader, feeding=feeding)
                 print "\nTest with Pass %d, Batch %d, %s" % (
                     event.pass_id, event.batch_id, result.metrics)
 
@@ -185,7 +190,7 @@ def main():
             with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
                 parameters.to_tar(f)
 
-            result = trainer.test(reader=reader, feeding=feeding)
+            result = trainer.test(reader=test_reader, feeding=feeding)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 
     trainer.train(
@@ -209,6 +214,7 @@ def main():
         output_layer=predict,
         parameters=parameters,
         input=test_data,
+        feeding=feeding,
         field='id')
     assert len(probs) == len(test_data[0][0])
     labels_reverse = {}
diff --git a/08.machine_translation/README.md b/08.machine_translation/README.md
index 2c410b4c3d54b5dba9adc155b90d8ffc607ab095..90a5541dee198ae458e3289435998278b179490c 100644
--- a/08.machine_translation/README.md
+++ b/08.machine_translation/README.md
@@ -6,24 +6,24 @@ The source code of this tutorial is live at [book/machine_translation](https://g
 
 Machine translation (MT) leverages computers to translate from one language to another. The language to be translated is referred to as the source language, while the language to be translated into is referred to as the target language. Thus, Machine translation is the process of translating from the source language to the target language. It is one of the most important research topics in the field of natural language processing.
 
-Early machine translation systems are mainly rule-based i.e. they rely on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in one languge. So it is quite a challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#References)\].
+Early machine translation systems are mainly rule-based i.e. they rely on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in one language. So it is quite a challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#references)\].
 
 
 To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of being designed by a language expert. While these techniques overcome the bottleneck of knowledge acquisition, there are still quite a lot of challenges, for example:
 
-1. human designed features cannot cover all possible linguistic variations;
+1. Human designed features cannot cover all possible linguistic variations;
 
-2. it is difficult to use global features;
+2. It is difficult to use global features;
 
-3. the techniques heavily rely on pre-processing techniques like word alignment, word segmentation and tokenization, rule-extraction and syntactic parsing etc. The error introduced in any of these steps could accumulate and impact translation quality.
+3. The techniques heavily rely on pre-processing techniques like word alignment, word segmentation and tokenization, rule-extraction and syntactic parsing etc. The error introduced in any of these steps could accumulate and impact translation quality.
 
 
 
 The recent development of deep learning provides new solutions to these challenges. The two main categories for deep learning based machine translation techniques are:
 
-1. techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
+1. Techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
 
-2. techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
+2. Techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
 
 <p align="center">
 <img src="image/nmt_en.png" width=400><br/>
@@ -98,9 +98,9 @@ There are three steps for encoding a sentence:
 
 2. Word embedding as a representation in the low-dimensional semantic space: There are two problems with one-hot vector representation
 
-  * the dimensionality of the vector is typically large, leading to the curse of dimensionality;
+  * The dimensionality of the vector is typically large, leading to the curse of dimensionality;
 
-  * it is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\epsilon R^{K\times \left | V \right |}$ as the projection matrix and $K$ is the dimensionality of the word embedding vector.
+  * It is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\epsilon R^{K\times \left | V \right |}$ as the projection matrix and $K$ is the dimensionality of the word embedding vector.
 
 3. Encoding of the source sequence via RNN: This can be described mathematically as:
 
@@ -338,10 +338,10 @@ is_generating = False
 
 5. Training mode:
 
-   - word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
+   - Word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
    - `recurrent_group` calls `gru_decoder_with_attention` in a recurrent way
-   - the sequence of next words from the target language is used as label (lbl)
-   - multi-class cross-entropy (`classification_cost`) is used to calculate the cost
+   - The sequence of next words from the target language is used as label (lbl)
+   - Multi-class cross-entropy (`classification_cost`) is used to calculate the cost
 
    ```python
    if not is_generating:
@@ -371,7 +371,7 @@ is_generating = False
 
 6. Generating mode:
 
-   - the decoder predicts a next target word based on the the last generated target word. Embedding of the last generated word is automatically gotten by GeneratedInputs.
+   - The decoder predicts the next target word based on the last generated target word. The embedding of the last generated word is obtained automatically by GeneratedInputs.
    - `beam_search` calls `gru_decoder_with_attention` in a recurrent way, to predict sequence id.
 
    ```python
diff --git a/08.machine_translation/index.html b/08.machine_translation/index.html
index edb43bd33efbbfc909a297b96690961a859072d9..2480eef3bfe85afa622aea1b2a9652f38b4e1538 100644
--- a/08.machine_translation/index.html
+++ b/08.machine_translation/index.html
@@ -48,24 +48,24 @@ The source code of this tutorial is live at [book/machine_translation](https://g
 
 Machine translation (MT) leverages computers to translate from one language to another. The language to be translated is referred to as the source language, while the language to be translated into is referred to as the target language. Thus, Machine translation is the process of translating from the source language to the target language. It is one of the most important research topics in the field of natural language processing.
 
-Early machine translation systems are mainly rule-based i.e. they rely on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in one languge. So it is quite a challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#References)\].
+Early machine translation systems are mainly rule-based i.e. they rely on a language expert to specify the translation rules between the two languages. It is quite difficult to cover all the rules used in one language. So it is quite a challenge for language experts to specify all possible rules in two or more different languages. Hence, a major challenge in conventional machine translation has been the difficulty in obtaining a complete rule set \[[1](#references)\].
 
 
 To address the aforementioned problems, statistical machine translation techniques have been developed. These techniques learn the translation rules from a large corpus, instead of being designed by a language expert. While these techniques overcome the bottleneck of knowledge acquisition, there are still quite a lot of challenges, for example:
 
-1. human designed features cannot cover all possible linguistic variations;
+1. Human designed features cannot cover all possible linguistic variations;
 
-2. it is difficult to use global features;
+2. It is difficult to use global features;
 
-3. the techniques heavily rely on pre-processing techniques like word alignment, word segmentation and tokenization, rule-extraction and syntactic parsing etc. The error introduced in any of these steps could accumulate and impact translation quality.
+3. The techniques heavily rely on pre-processing techniques like word alignment, word segmentation and tokenization, rule-extraction and syntactic parsing etc. The error introduced in any of these steps could accumulate and impact translation quality.
 
 
 
 The recent development of deep learning provides new solutions to these challenges. The two main categories for deep learning based machine translation techniques are:
 
-1. techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
+1. Techniques based on the statistical machine translation system but with some key components improved with neural networks, e.g., language model, reordering model (please refer to the left part of Figure 1);
 
-2. techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
+2. Techniques mapping from source language to target language directly using a neural network, or end-to-end neural machine translation (NMT).
 
 <p align="center">
 <img src="image/nmt_en.png" width=400><br/>
@@ -140,9 +140,9 @@ There are three steps for encoding a sentence:
 
 2. Word embedding as a representation in the low-dimensional semantic space: There are two problems with one-hot vector representation
 
-  * the dimensionality of the vector is typically large, leading to the curse of dimensionality;
+  * The dimensionality of the vector is typically large, leading to the curse of dimensionality;
 
-  * it is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\epsilon R^{K\times \left | V \right |}$ as the projection matrix and $K$ is the dimensionality of the word embedding vector.
+  * It is hard to capture the relationships between words, i.e., semantic similarities. Therefore, it is useful to project the one-hot vector into a low-dimensional semantic space as a dense vector with fixed dimensions, i.e., $s_i=Cw_i$ for the $i$-th word, with $C\epsilon R^{K\times \left | V \right |}$ as the projection matrix and $K$ is the dimensionality of the word embedding vector.
 
 3. Encoding of the source sequence via RNN: This can be described mathematically as:
 
@@ -380,10 +380,10 @@ is_generating = False
 
 5. Training mode:
 
-   - word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
+   - Word embedding from the target language trg_embedding is passed to `gru_decoder_with_attention` as current_word.
    - `recurrent_group` calls `gru_decoder_with_attention` in a recurrent way
-   - the sequence of next words from the target language is used as label (lbl)
-   - multi-class cross-entropy (`classification_cost`) is used to calculate the cost
+   - The sequence of next words from the target language is used as label (lbl)
+   - Multi-class cross-entropy (`classification_cost`) is used to calculate the cost
 
    ```python
    if not is_generating:
@@ -413,7 +413,7 @@ is_generating = False
 
 6. Generating mode:
 
-   - the decoder predicts a next target word based on the the last generated target word. Embedding of the last generated word is automatically gotten by GeneratedInputs.
+   - The decoder predicts the next target word based on the last generated target word. The embedding of the last generated word is obtained automatically by GeneratedInputs.
    - `beam_search` calls `gru_decoder_with_attention` in a recurrent way, to predict sequence id.
 
    ```python
diff --git a/08.machine_translation/train.py b/08.machine_translation/train.py
index 6f857b0cad9c23bee62de3ad85770d8d1cf538a9..79e5861e753d0eea7db6b16bbae72fce05c97bfd 100644
--- a/08.machine_translation/train.py
+++ b/08.machine_translation/train.py
@@ -1,8 +1,9 @@
-import sys
+import sys, os
 import numpy as np
-
 import paddle.v2 as paddle
 
+with_gpu = os.getenv('WITH_GPU', '0') != '0'
+
 
 def save_model(parameters, save_path):
     with open(save_path, 'w') as f:
@@ -135,7 +136,7 @@ def seq_to_seq_net(source_dict_dim,
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=with_gpu, trainer_count=1)
     is_generating = False
 
     # source and target dict dim.
diff --git a/mnist-client/Dockerfile b/mnist-client/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d7ef76940ce1e7360f178dea26b006d74ba22023
--- /dev/null
+++ b/mnist-client/Dockerfile
@@ -0,0 +1,9 @@
+FROM mhart/alpine-node:6.11.3
+
+RUN mkdir /workspace
+WORKDIR /workspace/
+ADD * /workspace/
+RUN apk add --no-cache python py-pip
+RUN pip install -r /workspace/requirements.txt
+RUN cd /workspace && npm install && mkdir templates && mv index.html templates && mkdir static && mv js static && mv css static
+CMD ["python", "main.py"]
diff --git a/mnist-client/README.md b/mnist-client/README.md
index ade569f19413cb9489179494df8a9d1aa67e2822..118790adb4682a081ff1c0351eb514d0857115f0 100644
--- a/mnist-client/README.md
+++ b/mnist-client/README.md
@@ -1,20 +1,72 @@
 # MNIST classification by PaddlePaddle
 
-Forked from https://github.com/sugyan/tensorflow-mnist
-
 ![screencast](https://cloud.githubusercontent.com/assets/80381/11339453/f04f885e-923c-11e5-8845-33c16978c54d.gif)
 
-## Build
+## Usage
 
-    $ docker build -t paddle-mnist .
+This MNIST classification demo consists of two parts: a PaddlePaddle
+inference server and a Javascript front end. We will start them
+separately.
 
-## Usage
+We will use Docker to run the demo. If you are not familiar with
+Docker, please check out
+this
+[tutorial](https://github.com/PaddlePaddle/Paddle/wiki/TLDR-for-new-docker-user).
+
+### Start the Inference Server
+
+The inference server can be used to run inference on any model trained by
+PaddlePaddle. Please see [here](../serve/README.md) for more details.
+
+1. Download the MNIST inference model topology and parameters to the
+   current working directory.
+
+    ```bash
+    wget https://s3.us-east-2.amazonaws.com/models.paddlepaddle/end-to-end-mnist/inference_topology.pkl
+    wget https://s3.us-east-2.amazonaws.com/models.paddlepaddle/end-to-end-mnist/param.tar
+    ```
+
+1. Run the following command to start the inference server:
+
+    ```bash
+    docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve
+    ```
+
+    The above command will mount the current working directory to the
+    `/data` directory inside the docker container. The inference
+    server will load the model topology and parameters that we just
+    downloaded from there.
 
+    After you are done with the demo, you can run `docker stop
+    paddle_serve` to stop this docker container.
+
+### Start the Front End
+
+1. Run the following command
+   ```bash
+   docker run -it -p 5000:5000 -e BACKEND_URL=http://localhost:8000/ paddlepaddle/book:mnist
+   ```
+
+   `BACKEND_URL` in the above command specifies the inference server
+   endpoint. If you started the inference server on another machine,
+   or want to visit the front end remotely, you may want to change its
+   value.
+
+1. Visit http://localhost:5000 and you will see the PaddlePaddle MNIST demo.
+
+
+## Build
+
+We have already prepared the pre-built docker image
+`paddlepaddle/book:mnist`, here is the command if you want to build
+the docker image again.
 
-1. Download `inference_topology.pkl` and `param.tar` to current directory
-1. Run following commands:
 ```bash
-docker run -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve
-docker run -it -p 5000:5000 paddlepaddle/book:mnist
+docker build -t paddlepaddle/book:mnist .
 ```
-1. Visit http://localhost:5000
+
+
+## Acknowledgement
+
+Thanks to the great project [sugyan/tensorflow-mnist](https://github.com/sugyan/tensorflow-mnist).
+Most of the code in this project comes from there.
diff --git a/mnist-client/main.py b/mnist-client/main.py
index 88f162dd82e69deda56d0aec5e71aa8a7deb328c..fc31e3977eba4246fc757a41e6fbf67eaee27f36 100644
--- a/mnist-client/main.py
+++ b/mnist-client/main.py
@@ -1,4 +1,5 @@
 from flask import Flask, jsonify, render_template, request
+import os
 
 # webapp
 app = Flask(__name__)
@@ -6,7 +7,8 @@ app = Flask(__name__)
 
 @app.route('/')
 def main():
-    return render_template('index.html')
+    backend_url = os.getenv('BACKEND_URL', 'http://localhost:8000/')
+    return render_template('index.html', backend_url=backend_url)
 
 
 if __name__ == '__main__':
diff --git a/mnist-client/src/js/main.js b/mnist-client/src/js/main.js
index 53ac0f7bcbf4e8c0758fd795dc137bdfb9172162..9f556b215ca14c18cf76c668f748941862fe525f 100644
--- a/mnist-client/src/js/main.js
+++ b/mnist-client/src/js/main.js
@@ -90,7 +90,7 @@ class Main {
 		}
 	    }
             $.ajax({
-                url: 'http://localhost:8000/',
+                url: BACKEND_URL,
                 method: 'POST',
                 contentType: 'application/json',
                 data: JSON.stringify({"img":inputs}),
diff --git a/mnist-client/templates/index.html b/mnist-client/templates/index.html
index 115fc86b82d16e8cc00aa5e2f42d9c6222217b42..06240bfa95fad434f8c415d42c8ba5a5e04c790a 100644
--- a/mnist-client/templates/index.html
+++ b/mnist-client/templates/index.html
@@ -4,6 +4,9 @@
     <title>MNIST</title>
     <link rel="stylesheet" href="{{ url_for('static', filename='css/bootstrap.min.css') }}">
     <script type="text/javascript" src="{{ url_for('static', filename='js/jquery.min.js') }}"></script>
+    <script type="text/javascript">
+        var BACKEND_URL = "{{ backend_url }}"
+    </script>
     <script type="text/javascript" src="{{ url_for('static', filename='js/main.js') }}"></script>
   </head>
   <body>
diff --git a/serve/Dockerfile b/serve/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..dfcd94da4b91898d8aa3983c8e0234e663d9a0f5
--- /dev/null
+++ b/serve/Dockerfile
@@ -0,0 +1,8 @@
+FROM paddlepaddle/paddle
+
+ENV PARAMETER_TAR_PATH=/data/param.tar \
+    TOPOLOGY_FILE_PATH=/data/inference_topology.pkl
+ADD requirements.txt /root
+ADD main.py /root
+RUN pip install -r /root/requirements.txt
+CMD ["python", "/root/main.py"]
diff --git a/serve/Dockerfile.gpu b/serve/Dockerfile.gpu
new file mode 100644
index 0000000000000000000000000000000000000000..7ec79dca05f0d1e0431b39e97fa78cad9165126a
--- /dev/null
+++ b/serve/Dockerfile.gpu
@@ -0,0 +1,8 @@
+FROM paddlepaddle/paddle:latest-gpu
+
+ENV PARAMETER_TAR_PATH=/data/param.tar \
+    TOPOLOGY_FILE_PATH=/data/inference_topology.pkl
+ADD requirements.txt /root
+ADD main.py /root
+RUN pip install -r /root/requirements.txt
+CMD ["python", "/root/main.py"]
diff --git a/serve/README.md b/serve/README.md
index 06bc0d5ad0cfa55e135100b2898314977794c0de..fb9552f32ca54eb28c94bb66aa20ea3ff10cee92 100644
--- a/serve/README.md
+++ b/serve/README.md
@@ -1,11 +1,219 @@
-# PaddlePaddle Serving Example
+# Inference Server Example
 
+The inference server can be used to perform inference on any model trained on
+PaddlePaddle. It provides an HTTP endpoint.
 
-## Build
+## Run
 
-    $ docker build -t serve .
+The inference server reads a trained model (a topology file and a
+parameter file) and serves HTTP requests at port `8000`. Because models
+differ in the numbers and types of inputs, **the HTTP API will differ
+slightly for each model,** please see [HTTP API](#http-api) for the
+API spec,
+and
+[here](https://github.com/PaddlePaddle/book/wiki/Using-Pre-trained-Models) for
+the request examples of different models that illustrate the
+difference.
 
-## Run
+We will first show how to obtain the PaddlePaddle model, and then how
+to start the server.
+
+We will use Docker to run the demo. If you are not familiar with
+Docker, please check out
+this
+[TLDR](https://github.com/PaddlePaddle/Paddle/wiki/Docker-for-Beginners).
+
+### Obtain the PaddlePaddle Model
+
+A neural network model in PaddlePaddle contains two parts: the
+**parameter** and the **topology**.
+
+A PaddlePaddle training script contains the neural network topology,
+which is represented by layers. For example,
+
+```python
+img = paddle.layer.data(name="img", type=paddle.data_type.dense_vector(784))
+hidden = fc_layer(input=img, size=200)
+prediction = fc_layer(input=hidden, size=10, act=paddle.activation.Softmax())
+```
+
+The parameter instance is created by the topology and updated by the
+`train` method.
+
+```python
+...
+params = paddle.parameters.create(cost)
+...
+trainer = paddle.trainer.SGD(cost=cost, parameters=params)
+...
+```
+
+PaddlePaddle stores the topology and parameter separately.
+
+1. To serialize a topology, we need to create a topology instance
+   explicitly by the outputs of the neural network. Then, invoke
+   `serialize_for_inference` method.
+
+  ```python
+  # Save the inference topology to protobuf.
+  inference_topology = paddle.topology.Topology(layers=prediction)
+  with open("inference_topology.pkl", 'wb') as f:
+      inference_topology.serialize_for_inference(f)
+  ```
+
+2. To save a parameter, we need to invoke `to_tar` method in Parameter
+   class.
+
+  ```python
+  with open('param.tar', 'w') as f:
+            params.to_tar(f)
+  ```
+
+ After serializing the parameter and topology into two files, we could
+ use them to set up an inference server.
+
+ For a working example, please see [train.py](https://github.com/reyoung/paddle_mnist_v2_demo/blob/master/train.py).
+
+
+### Start the Server
+
+Make sure the `inference_topology.pkl` and `param.tar` mentioned in
+the last section are in your current working directory, and run the
+command:
+
+```bash
+docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve
+```
+
+The above command will mount the current working directory to the
+`/data/` directory inside the docker container. The inference server
+will load the model topology and parameters that we just created from
+there.
+
+To run the inference server with GPU support, please make sure you have
+[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
+first, and run:
+
+```bash
+nvidia-docker run --name paddle_serve -v `pwd`:/data -d -p 8000:80 -e WITH_GPU=1 paddlepaddle/book:serve-gpu
+```
+
+This command will start a server on port `8000`.
+
+After you are done with the demo, you can run `docker stop
+paddle_serve` to stop this docker container.
+
+## HTTP API
+
+The inference server will handle HTTP POST requests on path `/`. The
+content type of the request and response is json. You need to manually
+add `Content-Type` request header as `Content-Type: application/json`.
+
+The request json object is a single json dictionary object, whose key
+is the layer name of input data. The type of the corresponding value
+is decided by the data type. For most cases the corresponding value
+will be a list of floats. For completeness, we will list all data types
+below:
+
+There are twelve data types supported by PaddlePaddle:
+
+| | plain | a sequence | a sequence of sequence |
+| --- | --- | --- | ---|
+| dense | [ f, f, f, f, ... ] | [ [f, f, f, ...], [f, f, f, ...]] | [[[f, f, ...], [f, f, ...]], [[f, f, ...], [f, f, ...]], ...] |
+| integer | i | [i, i, ...] | [[i, i, ...], [i, i, ...], ...] |
+| sparse | [i, i, ...] | [[i, i, ...], [i, i, ...], ...] | [[[i, i, ...], [i, i, ...], ...], [[i, i, ...], [i, i, ...], ...], ...] |
+| sparse | [[i, f], [i, f], ... ] | [[[i, f], [i, f], ... ], ...] | [[[[i, f], [i, f], ... ], ...], ...] |
+
+In the table, `i` stands for an `int` value and `f` stands for a
+`float` value.
+
+What `data_type` should be used is decided by the training
+topology. For example,
+
+* Image data is usually a plain dense vector: we flatten
+  the image into a vector. The pixel values of that image are usually
+  normalized in `[-1.0, 1.0]` or `[0.0, 1.0]` (depending on the neural
+  network).
+
+    ```text
+   +-------+
+   |243 241|
+   |139 211| +---->[0.95, 0.95, 0.54, 0.82]
+   +-------+
+    ```
+
+* For text data, each word of that text is represented by an
+  integer. The association map between word and integer is decided by
+  the training process. A sentence is represented by a list of
+  integers.
+
+   ```text
+    I am good .
+        +
+        |
+        v
+   23 942 402 19  +----->  [23, 942, 402, 19]
+   ```
+
+A sample request for a `2x2` image and a sentence could be
+
+```json
+{
+    "img": [
+        0.95,
+        0.95,
+        0.54,
+        0.82
+    ],
+    "sentence": [
+        23,
+        942,
+        402,
+        19
+    ]
+}
+```
+
+The response is a json object, too. An example of the returned data is:
+
+```json
+{
+  "code": 0,
+  "data": [
+    [
+      0.10060056298971176,
+      0.057179879397153854,
+      0.1453431099653244,
+      0.15825574100017548,
+      0.04464773088693619,
+      0.1566203236579895,
+      0.05657859891653061,
+      0.12077419459819794,
+      0.08073269575834274,
+      0.07926714420318604
+    ]
+  ],
+  "message": "success"
+}
+```
+
+Here, `code` and `message` represent the status of the request.
+`data` corresponds to the outputs of the neural network; they could be
+the probability of each class, the IDs of an output sentence, and so
+on.
+
+## MNIST Demo Client
+
+If you have trained a model with [train.py](https://github.com/reyoung/paddle_mnist_v2_demo/blob/master/train.py) and
+started an inference server, then you can use this [client](https://github.com/PaddlePaddle/book/tree/develop/02.recognize_digits/client/client.py) to test if it works right.
+
+## Build
+
+We have already prepared the pre-built docker image
+`paddlepaddle/book:serve`. Here are the commands if you want to build
+the docker images again.
 
-    $ docker run -v `pwd`:/data -it -p 8000:80 -e WITH_GPU=0 paddlepaddle/book:serve
-    $ curl -H "Content-Type: application/json" -X POST -d '{"img":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]}' http://localhost:8000/
+```bash
+docker build -t paddlepaddle/book:serve .
+docker build -t paddlepaddle/book:serve-gpu -f Dockerfile.gpu .
+```
diff --git a/serve/main.py b/serve/main.py
index 3d9f4bedc5e2ecd44d96d3b11005f315e6bbd5c8..ee1de9313793455c12ba457b004527ef11b2b3f6 100644
--- a/serve/main.py
+++ b/serve/main.py
@@ -4,6 +4,8 @@ import traceback
 import paddle.v2 as paddle
 from flask import Flask, jsonify, request
 from flask_cors import CORS
+from Queue import Queue
+import threading
 
 tarfn = os.getenv('PARAMETER_TAR_PATH', None)
 
@@ -20,7 +22,7 @@ if topology_filepath is None:
     )
 
 with_gpu = os.getenv('WITH_GPU', '0') != '0'
-
+output_field = os.getenv('OUTPUT_FIELD', 'value')
 port = int(os.getenv('PORT', '80'))
 
 app = Flask(__name__)
@@ -35,26 +37,53 @@ def successResp(data):
     return jsonify(code=0, message="success", data=data)
 
 
+sendQ = Queue()  # (request_json, reply_queue) pairs consumed by worker()
+
+
 @app.route('/', methods=['POST'])
 def infer():
-    global inferer
-    try:
-        feeding = {}
-        d = []
-        for i, key in enumerate(request.json):
-            d.append(request.json[key])
-            feeding[key] = i
-        r = inferer.infer([d], feeding=feeding)
-    except:
-        trace = traceback.format_exc()
-        return errorResp(trace)
-    return successResp(r.tolist())
+    recv_queue = Queue()  # private reply channel for this request
+    sendQ.put((request.json, recv_queue))  # hand the parsed JSON body to the worker
+    success, resp = recv_queue.get()  # blocks until a (success, payload) reply arrives
+    if success:
+        return successResp(resp)
+    else:
+        return errorResp(resp)
+
+
+# PaddlePaddle v0.10.0 does not support inference from different threads, so
+# one worker thread owns the model and answers every request taken from sendQ.
+def worker():
+    paddle.init(use_gpu=with_gpu)
 
+    fields = [name for name in output_field.split(",") if name]  # drop empty names
 
-if __name__ == '__main__':
-    paddle.init(use_gpu=with_gpu)
     with open(tarfn) as param_f, open(topology_filepath) as topo_f:
         params = paddle.parameters.Parameters.from_tar(param_f)
         inferer = paddle.inference.Inference(parameters=params, fileobj=topo_f)
+
+    while True:
+        j, recv_queue = sendQ.get()  # blocks until a request handler enqueues work
+        try:
+            feeding = {}
+            d = []
+            for i, key in enumerate(j):  # one column per input layer name
+                d.append(j[key])
+                feeding[key] = i
+            r = inferer.infer([d], feeding=feeding, field=fields)  # once per request
+        except:  # report any failure to the caller; the worker must keep running
+            trace = traceback.format_exc()
+            recv_queue.put((False, trace))
+            continue
+        if isinstance(r, list):  # presumably one array per requested field
+            recv_queue.put((True, [elem.tolist() for elem in r]))
+        else:
+            recv_queue.put((True, r.tolist()))
+
+
+if __name__ == '__main__':
+    t = threading.Thread(target=worker)  # the only thread that runs inference
+    t.daemon = True  # do not keep the process alive once the main thread exits
+    t.start()
     print 'serving on port', port
     app.run(host='0.0.0.0', port=port, threaded=True)