Merge branch 'develop' of https://github.com/PaddlePaddle/book into mnist2

adcddb8e · dangqingqing · aea1c5e6 · 5dd09636 · adcddb8e · adcddb8e
29 changed file
--- a/image_classification/README.md
+++ b/image_classification/README.md
--- a/image_classification/deprecated/README.md
+++ b/image_classification/deprecated/README.md
--- a/image_classification/classify.py
+++ b/image_classification/classify.py
--- a/image_classification/data/cifar10.py
+++ b/image_classification/data/cifar10.py
--- a/image_classification/data/get_data.sh
+++ b/image_classification/data/get_data.sh
--- a/image_classification/dataprovider.py
+++ b/image_classification/dataprovider.py
--- a/image_classification/extract.sh
+++ b/image_classification/extract.sh
--- a/image_classification/models/resnet.py
+++ b/image_classification/models/resnet.py
--- a/image_classification/models/vgg.py
+++ b/image_classification/models/vgg.py
--- a/image_classification/predict.sh
+++ b/image_classification/predict.sh
--- a/image_classification/train.sh
+++ b/image_classification/train.sh
--- a/image_classification/resnet.py
+++ b/image_classification/resnet.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.v2 as paddle
+__all__ = ['resnet_cifar10']
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  active_type=paddle.activation.Relu(),
+                  ch_in=None):
+    """Stack a bias-free image convolution and a batch-norm layer.
+
+    The convolution itself uses a linear activation; ``active_type`` is
+    applied by the batch-norm layer that follows it.
+
+    Args:
+        input: layer feeding the convolution.
+        ch_out: number of convolution filters.
+        filter_size: convolution filter size.
+        stride: convolution stride.
+        padding: convolution padding.
+        active_type: activation handed to the batch-norm layer.
+        ch_in: forwarded to the convolution as ``num_channels``.
+
+    Returns:
+        The batch-norm layer built on top of the convolution.
+    """
+    linear = paddle.activation.Linear()
+    conv = paddle.layer.img_conv(
+        input=input,
+        num_channels=ch_in,
+        num_filters=ch_out,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        act=linear,
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=conv, act=active_type)
+def shortcut(ipt, n_in, n_out, stride):
+    """Return the shortcut branch for a residual block.
+
+    Args:
+        ipt: input layer of the block.
+        n_in: channel count assumed for ``ipt``.
+        n_out: channel count the block produces.
+        stride: stride used by the projection convolution.
+
+    Returns:
+        ``ipt`` itself when ``n_in`` equals ``n_out``; otherwise a 1x1
+        conv + batch-norm projection of ``ipt`` with linear activation.
+    """
+    if n_in == n_out:
+        # Channel counts agree: pass the input through unchanged.
+        return ipt
+    return conv_bn_layer(
+        ipt, n_out, 1, stride, 0, paddle.activation.Linear())
+def basicblock(ipt, ch_out, stride):
+    """Build one residual block: two 3x3 conv-BN layers plus a shortcut.
+
+    Args:
+        ipt: input layer of the block.
+        ch_out: number of filters for both convolutions.
+        stride: stride of the first convolution and of the shortcut.
+
+    Returns:
+        The element-wise sum of the conv branch and the shortcut branch,
+        passed through ReLU.
+    """
+    # NOTE(review): the input width is assumed to be 2 * ch_out instead of
+    # being taken from ``ipt``, so ``shortcut`` sees n_in != n_out for every
+    # non-zero ch_out and always builds its projection -- confirm intended.
+    assumed_in = ch_out * 2
+    branch = conv_bn_layer(ipt, ch_out, 3, stride, 1)
+    # The second conv stays linear; ReLU is applied after the addition.
+    branch = conv_bn_layer(
+        branch, ch_out, 3, 1, 1, paddle.activation.Linear())
+    skip = shortcut(ipt, assumed_in, ch_out, stride)
+    return paddle.layer.addto(
+        input=[branch, skip], act=paddle.activation.Relu())
+def layer_warp(block_func, ipt, features, count, stride):
+    """Chain ``count`` blocks produced by ``block_func``.
+
+    The first block is always built and receives ``stride``; every
+    following block is built with stride 1 on the previous block's output.
+
+    Args:
+        block_func: callable ``(input, features, stride) -> layer``.
+        ipt: input of the first block.
+        features: value forwarded unchanged to every ``block_func`` call.
+        count: total number of blocks (one block is built even if < 1).
+        stride: stride of the first block.
+
+    Returns:
+        The output of the last block in the chain.
+    """
+    out = block_func(ipt, features, stride)
+    for _ in range(count - 1):
+        out = block_func(out, features, 1)
+    return out
+def resnet_cifar10(ipt, depth=32):
+    """Build the ResNet feature extractor used for CIFAR-10.
+
+    The network is a 3x3 conv-BN stem with 16 filters (declared for a
+    3-channel input), three stages of ``n`` basic blocks with 16, 32 and 64
+    filters (strides 1, 2, 2), and a final average pooling layer.
+
+    Args:
+        ipt: input image layer.
+        depth: total depth; must satisfy ``(depth - 2) % 6 == 0``.
+
+    Returns:
+        The average-pooling layer on top of the last residual stage.
+
+    Raises:
+        AssertionError: if ``depth`` is not of the form ``6 * n + 2``.
+    """
+    # depth should be one of 20, 32, 44, 56, 110, 1202
+    assert (depth - 2) % 6 == 0
+    # Floor division keeps ``n`` an int for ``range`` inside ``layer_warp``,
+    # also under Python 3 or ``from __future__ import division``.
+    n = (depth - 2) // 6
+    conv1 = conv_bn_layer(
+        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = paddle.layer.img_pool(
+        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
+    return pool
--- a/image_classification/train.py
+++ b/image_classification/train.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle.v2 as paddle
+from vgg import vgg_bn_drop
+from resnet import resnet_cifar10
+def main():
+    """Train an image classifier on CIFAR-10 with the PaddlePaddle v2 API.
+
+    Builds the network (VGG by default; the ResNet option is commented out),
+    adds a softmax output layer and classification cost, then runs 200
+    passes of momentum SGD over ``paddle.dataset.cifar.train10()``. Progress
+    is printed during training and the model is evaluated on
+    ``paddle.dataset.cifar.test10()`` at the end of every pass.
+    """
+    # Size of the dense input vector -- presumably 3 channels x 32 x 32
+    # pixels; TODO confirm against the dataset reader.
+    datadim = 3 * 32 * 32
+    # Number of classes: size of the softmax layer and range of the label.
+    classdim = 10
+    # PaddlePaddle init
+    paddle.init(use_gpu=False, trainer_count=1)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(datadim))
+    # Add neural network config
+    # option 1. resnet
+    # net = resnet_cifar10(image, depth=32)
+    # option 2. vgg
+    net = vgg_bn_drop(image)
+    # Classification head on top of the selected feature extractor.
+    out = paddle.layer.fc(input=net,
+                          size=classdim,
+                          act=paddle.activation.Softmax())
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(classdim))
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+    # Create optimizer
+    # NOTE(review): the L2 rate is multiplied and the learning rate divided
+    # by 128, the same value as batch_size -- presumably a per-batch
+    # scaling convention; confirm before changing the batch size.
+    momentum_optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
+        learning_rate=0.1 / 128.0,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=50000 * 100,
+        learning_rate_schedule='discexp',
+        batch_size=128)
+    # End batch and end pass event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            # Full status line every 100 batches, a progress dot otherwise.
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            # ``trainer`` is looked up in the enclosing scope at call time;
+            # it is assigned below, before ``trainer.train`` is invoked.
+            result = trainer.test(
+                reader=paddle.batch(
+                    paddle.dataset.cifar.test10(), batch_size=128),
+                feeding={'image': 0,
+                         'label': 1})
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=momentum_optimizer)
+    # ``feeding`` maps column 0 of each reader sample to the "image" data
+    # layer and column 1 to the "label" data layer.
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10(), buf_size=50000),
+            batch_size=128),
+        num_passes=200,
+        event_handler=event_handler,
+        feeding={'image': 0,
+                 'label': 1})
+# Run training only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
--- a/image_classification/vgg.py
+++ b/image_classification/vgg.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.v2 as paddle
+__all__ = ['vgg_bn_drop']
+def vgg_bn_drop(input):
+    """Build a VGG-style feature extractor with batch norm and dropout.
+
+    Five convolution groups (64, 128, 256, 512, 512 filters) are followed
+    by dropout, a 512-unit linear fully-connected layer, batch norm with
+    ReLU and a drop rate of 0.5, and a second 512-unit linear
+    fully-connected layer.
+
+    Args:
+        input: image layer; the first group declares 3 input channels.
+
+    Returns:
+        The final 512-unit fully-connected layer.
+    """
+    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
+        # One group: ``groups`` 3x3 conv layers with batch norm and ReLU,
+        # using ``dropouts`` as the per-conv drop rates, then 2x2 max
+        # pooling with stride 2.
+        return paddle.networks.img_conv_group(
+            input=ipt,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type=paddle.pooling.Max())
+    # (num_filter, groups, dropouts, num_channels) for each group, in order.
+    group_specs = [
+        (64, 2, [0.3, 0], 3),
+        (128, 2, [0.4, 0], None),
+        (256, 3, [0.4, 0.4, 0], None),
+        (512, 3, [0.4, 0.4, 0], None),
+        (512, 3, [0.4, 0.4, 0], None),
+    ]
+    features = input
+    for num_filter, groups, dropouts, num_channels in group_specs:
+        features = conv_block(features, num_filter, groups, dropouts,
+                              num_channels)
+    dropped = paddle.layer.dropout(input=features, dropout_rate=0.5)
+    hidden = paddle.layer.fc(
+        input=dropped, size=512, act=paddle.activation.Linear())
+    normed = paddle.layer.batch_norm(
+        input=hidden,
+        act=paddle.activation.Relu(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    return paddle.layer.fc(
+        input=normed, size=512, act=paddle.activation.Linear())
--- a/label_semantic_roles/README.api.md
+++ b/label_semantic_roles/README.api.md
--- a/label_semantic_roles/README.en.md
+++ b/label_semantic_roles/README.en.md
@@ -440,15 +440,15 @@ trainer = paddle.trainer.SGD(cost=crf_cost,
 As mentioned in data preparation section, we will use CoNLL 2005 test corpus as training data set. `conll05.test()` outputs one training instance at a time. It will be shuffled, and batched into mini batches as input.
 ```python
-reader = paddle.reader.batched(
+reader = paddle.batch(
    paddle.reader.shuffle(
        conll05.test(), buf_size=8192), batch_size=20)
 ```
-`reader_dict` is used to specify relationship between data instance and layer layer. For example, according to following `reader_dict`, the 0th column of data instance produced by`conll05.test()` correspond to data layer named `word_data`.
+`feeding` is used to specify the relationship between data instances and data layers. For example, according to the following `feeding`, the 0th column of each data instance produced by `conll05.test()` corresponds to the data layer named `word_data`.
 ```python
-reader_dict = {
+feeding = {
    'word_data': 0,
    'ctx_n2_data': 1,
    'ctx_n1_data': 2,
@@ -478,7 +478,7 @@ trainer.train(
    reader=reader,
    event_handler=event_handler,
    num_passes=10000,
-    reader_dict=reader_dict)
+    feeding=feeding)
 ```
 ## Conclusion

--- a/label_semantic_roles/README.md
+++ b/label_semantic_roles/README.md
--- a/label_semantic_roles/index.en.html
+++ b/label_semantic_roles/index.en.html
--- a/label_semantic_roles/api_train.py
+++ b/label_semantic_roles/api_train.py
@@ -155,7 +155,7 @@ def main():
                                 parameters=parameters,
                                 update_equation=optimizer)
-    reader = paddle.reader.batched(
+    reader = paddle.batch(
        paddle.reader.shuffle(
            conll05.test(), buf_size=8192), batch_size=10)

--- a/recognize_digits/README.en.md
+++ b/recognize_digits/README.en.md
@@ -42,7 +42,7 @@ In such a classification problem, we usually use the cross entropy loss function
 $$  crossentropy(label, y) = -\sum_i label_i \log(y_i) $$
-Fig. 2 shows a softmax regression network, with weights in black, and bias in red. +1 indicates bias is 1.
+Fig. 2 shows a softmax regression network, with weights in blue, and bias in red. +1 indicates bias is 1.
 <p align="center">
 <img src="image/softmax_regression_en.png" width=400><br/>
@@ -57,7 +57,7 @@ The Softmax regression model described above uses the simplest two-layer neural
 2.  After the second hidden layer, we get $ H_2 = \phi(W_2H_1 + b_2) $.
 3.  Finally, after output layer, we get $Y=softmax(W_3H_2 + b_3)$, the final classification result vector.
-Fig. 3. is Multilayer Perceptron network, with weights in black, and bias in red. +1 indicates bias is 1.
+Fig. 3. is Multilayer Perceptron network, with weights in blue, and bias in red. +1 indicates bias is 1.
 <p align="center">
 <img src="image/mlp_en.png" width=500><br/>
@@ -196,32 +196,31 @@ def convolutional_neural_network(img):
 PaddlePaddle provides a special layer `layer.data` for reading data. Let us create a data layer for reading images and connect it to a classification network created using one of the above three functions.  We also need a cost layer for training the model.
 ```python
-def main():
+paddle.init(use_gpu=False, trainer_count=1)
-    paddle.init(use_gpu=False, trainer_count=1)
-    images = paddle.layer.data(
+images = paddle.layer.data(
    name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
+label = paddle.layer.data(
    name='label', type=paddle.data_type.integer_value(10))
-    predict = softmax_regression(images)
+predict = softmax_regression(images)
-    #predict = multilayer_perceptron(images) # uncomment for MLP
+#predict = multilayer_perceptron(images) # uncomment for MLP
-    #predict = convolutional_neural_network(images) # uncomment for LeNet5
+#predict = convolutional_neural_network(images) # uncomment for LeNet5
-    cost = paddle.layer.classification_cost(input=predict, label=label)
+cost = paddle.layer.classification_cost(input=predict, label=label)
 ```
 Now, it is time to specify training parameters. The number 0.9 in the following `Momentum` optimizer means that 90% of the current momentum comes from the momentum of the previous iteration.
 ```python
-    parameters = paddle.parameters.create(cost)
+parameters = paddle.parameters.create(cost)
-    optimizer = paddle.optimizer.Momentum(
+optimizer = paddle.optimizer.Momentum(
    learning_rate=0.1 / 128.0,
    momentum=0.9,
    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-    trainer = paddle.trainer.SGD(cost=cost,
+trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)
 ```
@@ -233,9 +232,9 @@ Here `shuffle` is a reader decorator, which takes a reader A as its parameter, a
 `batch` is a special decorator, whose input is a reader and output is a *batch reader*, which doesn't yield an instance at a time, but a minibatch.
 ```python
-    lists = []
+lists = []
-    def event_handler(event):
+def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
@@ -248,7 +247,7 @@ Here `shuffle` is a reader decorator, which takes a reader A as its parameter, a
        lists.append((event.pass_id, result.cost,
                      result.metrics['classification_error_evaluator']))
-    trainer.train(
+trainer.train(
    reader=paddle.reader.batched(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=8192),
@@ -260,21 +259,21 @@ Here `shuffle` is a reader decorator, which takes a reader A as its parameter, a
 During training, `trainer.train` invokes `event_handler` for certain events. This gives us a chance to print the training progress.
 ```
-    # Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
+# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
-    # Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
+# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
-    # Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
+# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
-    # Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
+# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
-    # Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
+# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
-    # Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
+# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
 ```
 After the training, we can check the model's prediction accuracy.
 ```
-    # find the best pass
+# find the best pass
-    best = sorted(lists, key=lambda list: float(list[1]))[0]
+best = sorted(lists, key=lambda list: float(list[1]))[0]
-    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
 ```
 Usually, with MNIST data, the softmax regression model can get accuracy around 92.34%, MLP can get about 97.66%, and convolution network can get up to around 99.20%.  Convolution layers have been widely considered a great invention for image processing.

--- a/recognize_digits/README.md
+++ b/recognize_digits/README.md
@@ -205,20 +205,19 @@ def convolutional_neural_network(img):
 接着，通过`layer.data`调用来获取数据，然后调用分类器（这里我们提供了三个不同的分类器）得到分类结果。训练时，对该结果计算其损失函数，分类问题常常选择交叉熵损失函数。
 ```python
-def main():
+# 该模型运行在单个CPU上
-    # 该模型运行在单个CPU上
+paddle.init(use_gpu=False, trainer_count=1)
-    paddle.init(use_gpu=False, trainer_count=1)
-    images = paddle.layer.data(
+images = paddle.layer.data(
    name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
+label = paddle.layer.data(
    name='label', type=paddle.data_type.integer_value(10))
-    predict = softmax_regression(images) # Softmax回归
+predict = softmax_regression(images) # Softmax回归
-    #predict = multilayer_perceptron(images) #多层感知器
+#predict = multilayer_perceptron(images) #多层感知器
-    #predict = convolutional_neural_network(images) #LeNet5卷积神经网络
+#predict = convolutional_neural_network(images) #LeNet5卷积神经网络
-    cost = paddle.layer.classification_cost(input=predict, label=label)
+cost = paddle.layer.classification_cost(input=predict, label=label)
 ```
 然后，指定训练相关的参数。
@@ -227,14 +226,14 @@ def main():
 - 正则化（regularization）： 是防止网络过拟合的一种手段，此处采用L2正则化。
 ```python
-    parameters = paddle.parameters.create(cost)
+parameters = paddle.parameters.create(cost)
-    optimizer = paddle.optimizer.Momentum(
+optimizer = paddle.optimizer.Momentum(
    learning_rate=0.1 / 128.0,
    momentum=0.9,
    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-    trainer = paddle.trainer.SGD(cost=cost,
+trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)
 ```
@@ -246,9 +245,9 @@ def main():
 `batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader —— 在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
 ```python
-    lists = []
+lists = []
-    def event_handler(event):
+def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
@@ -261,8 +260,8 @@ def main():
        lists.append((event.pass_id, result.cost,
                      result.metrics['classification_error_evaluator']))
-    trainer.train(
+trainer.train(
-        reader=paddle.batch(
+    reader=paddle.reader.batched(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=8192),
        batch_size=128),
@@ -273,12 +272,12 @@ def main():
 训练过程是完全自动的，event_handler里打印的日志类似如下所示：
 ```
-    # Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
+# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
-    # Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
+# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
-    # Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
+# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
-    # Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
+# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
-    # Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
+# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
-    # Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
+# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
 ```
 训练之后，检查模型的预测准确度。用 MNIST 训练的时候，一般 softmax回归模型的分类准确率为约为 92.34%，多层感知器为97.66%，卷积神经网络可以达到 99.20%。

--- a/recognize_digits/index.en.html
+++ b/recognize_digits/index.en.html
@@ -83,7 +83,7 @@ In such a classification problem, we usually use the cross entropy loss function
 $$  crossentropy(label, y) = -\sum_i label_i \log(y_i) $$
-Fig. 2 shows a softmax regression network, with weights in black, and bias in red. +1 indicates bias is 1.
+Fig. 2 shows a softmax regression network, with weights in blue, and bias in red. +1 indicates bias is 1.
 <p align="center">
 <img src="image/softmax_regression_en.png" width=400><br/>
@@ -98,7 +98,7 @@ The Softmax regression model described above uses the simplest two-layer neural
 2.  After the second hidden layer, we get $ H_2 = \phi(W_2H_1 + b_2) $.
 3.  Finally, after output layer, we get $Y=softmax(W_3H_2 + b_3)$, the final classification result vector.
-Fig. 3. is Multilayer Perceptron network, with weights in black, and bias in red. +1 indicates bias is 1.
+Fig. 3. is Multilayer Perceptron network, with weights in blue, and bias in red. +1 indicates bias is 1.
 <p align="center">
 <img src="image/mlp_en.png" width=500><br/>
@@ -156,15 +156,8 @@ For more information, please refer to [Activation functions on Wikipedia](https:
 ## Data Preparation
-### Data Download
+PaddlePaddle provides a Python module, `paddle.dataset.mnist`, which downloads and caches the [MNIST dataset](http://yann.lecun.com/exdb/mnist/).  The cache is under `/home/username/.cache/paddle/dataset/mnist`:
-Execute the following command to download the [MNIST](http://yann.lecun.com/exdb/mnist/) dataset and unzip. Add paths to the training set and the test set to train.list and test.list respectively for PaddlePaddle to read.
-```bash
-./data/get_mnist_data.sh
-```
-`gzip` downloaded data. The following files can be found in `data/raw_data`:
 |    File name          |       Description              |
 |----------------------|-------------------------|
@@ -173,283 +166,159 @@ Execute the following command to download the [MNIST](http://yann.lecun.com/exdb
 |t10k-images-idx3-ubyte |  Evaluation images, 10,000 |
 |t10k-labels-idx1-ubyte |  Evaluation labels, 10,000 |
-Users can randomly generate 10 images with the following script (Refer to Fig. 1.)
-```bash
-./load_data.py
-```
-### Provide Data to PaddlePaddle
-We use python interface to provide data to system. `mnist_provider.py` shows a complete example for training on MNIST data.
-```python
-# Define a py data provider
-@provider(
-    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
-def process(settings, filename):  # settings is not used currently.
-		# Open image file
-    with open( filename + "-images-idx3-ubyte", "rb") as f:             
-		# Read first 4 parameters. magic is data format. n is number of data. rows and cols are number of rows and columns, respectively
-        magic, n, rows, cols = struct.upack(">IIII", f.read(16))        
-		# With empty string as a unit, read data one by one
-        images = np.fromfile(                                           
-            f, 'ubyte',
-            count=n * rows * cols).reshape(n, rows, cols).astype('float32')
-		# Normalize data of [0, 255] to [-1,1]
-        images = images / 255.0 * 2.0 - 1.0                             
-		# Open label file
-    with open( filename + "-labels-idx1-ubyte", "rb") as l:             
-		# Read first two parameters
-        magic, n = struct.upack(">II", l.read(8))                       
-		# With empty string as a unit, read data one by one
-        labels = np.fromfile(l, 'ubyte', count=n).astype("int")         
-    for i in xrange(n):
-        yield {"pixel": images[i, :], 'label': labels[i]}
-```
-## Model Configurations
-### Data Definition
+## Model Configuration
-In the model configuration, use `define_py_data_sources2` to define reading of data from `dataprovider`. If this configuration is used for prediction, data definition is not necessary.
+A PaddlePaddle program starts from importing the API package:
 ```python
- if not is_predict:
+import paddle.v2 as paddle
-     data_dir = './data/'
-     define_py_data_sources2(
-         train_list=data_dir + 'train.list',
-         test_list=data_dir + 'test.list',
-         module='mnist_provider',
-         obj='process')
 ```
-### Algorithm Configuration
+We want to use this program to demonstrate multiple kinds of models.  Let us define each of them as a Python function:
-Set training related parameters.
+- softmax regression: the network has a fully-connected layer with softmax activation:
- batch_size: use 128 samples in each training step.
- learning_rate: determines step taken in each iteration, it determines how fast the model converges.
- learning_method: use optimizer `MomentumOptimizer` for training. The parameter 0.9 indicates momentum keeps 0.9 of previous speed.
- regularization: A method to prevent overfitting. Here L2 regularization is used.
-```python
-settings(
-    batch_size=128,
-    learning_rate=0.1 / 128.0,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * 128))
-```
-### Model Architecture
-#### Overview
-First get reference labels from `data_layer`, and get classification results (predictions) from classifier. Here we provide three different classifiers. In training, we compute loss function, which is usually cross entropy for classification problem. In prediction, we can directly output the results (predictions).
-``` python
-data_size = 1 * 28 * 28
-label_size = 10
-img = data_layer(name='pixel', size=data_size)
-predict = softmax_regression(img) # Softmax Regression
-#predict = multilayer_perceptron(img) # Multilayer Perceptron
-#predict = convolutional_neural_network(img) #LeNet5 Convolutional Neural Network
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    inputs(img, lbl)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
-```
-#### Softmax Regression
-One simple fully connected layer with softmax activation function outputs classification result.
 ```python
 def softmax_regression(img):
-    predict = fc_layer(input=img, size=10, act=SoftmaxActivation())
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
    return predict
 ```
-#### MultiLayer Perceptron
+- multi-layer perceptron: this network has two hidden fully-connected layers with ReLU activation, followed by a fully-connected output layer with softmax activation:
-The following code implements a Multilayer Perceptron with two fully connected hidden layers and a ReLU activation function. The output layer has a Softmax activation function.
 ```python
 def multilayer_perceptron(img):
-    # First fully connected layer with ReLU
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
-    hidden1 = fc_layer(input=img, size=128, act=ReluActivation())
+    hidden2 = paddle.layer.fc(input=hidden1,
-    # Second fully connected layer with ReLU
+                              size=64,
-    hidden2 = fc_layer(input=hidden1, size=64, act=ReluActivation())
+                              act=paddle.activation.Relu())
-    # Output layer as fully connected layer and softmax activation. The size must be 10.
+    predict = paddle.layer.fc(input=hidden2,
-    predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
+                              size=10,
+                              act=paddle.activation.Softmax())
    return predict
 ```
-#### Convolutional Neural Network LeNet-5
+- convolution network LeNet-5: the input image is fed through two convolution-pooling layers, a fully-connected layer, and the softmax output layer:
-The following is the LeNet-5 network architecture. A 2D input image is first fed into two sets of convolutional layers and pooling layers, this result is then fed to a fully connected layer, and another fully connected layer with a softmax activation.
 ```python
 def convolutional_neural_network(img):
-    # First convolutional layer - pooling layer
-    conv_pool_1 = simple_img_conv_pool(
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        num_channel=1,
        pool_size=2,
        pool_stride=2,
-        act=TanhActivation())
+        act=paddle.activation.Tanh())
-    # Second convolutional layer - pooling layer
-    conv_pool_2 = simple_img_conv_pool(
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        num_channel=20,
        pool_size=2,
        pool_stride=2,
-        act=TanhActivation())
+        act=paddle.activation.Tanh())
-    # Fully connected layer
-    fc1 = fc_layer(input=conv_pool_2, size=128, act=TanhActivation())
-    # Output layer as fully connected layer and softmax activation. The size must be 10.
-    predict = fc_layer(input=fc1, size=10, act=SoftmaxActivation())
-    return predict
-```
-## Training Model
-### Training Commands and Logs
-1.Configure `train.sh` to execute training:
-```bash
+    fc1 = paddle.layer.fc(input=conv_pool_2,
-config=mnist_model.py                   # Select network in mnist_model.py
+                          size=128,
-output=./softmax_mnist_model            
+                          act=paddle.activation.Tanh())
-log=softmax_train.log                   
-paddle train \
+    predict = paddle.layer.fc(input=fc1,
--config=$config \                      # Scripts for network configuration.
+                              size=10,
--dot_period=10 \                       # After `dot_period` steps, print one `.`
+                              act=paddle.activation.Softmax())
--log_period=100 \						# Print a log every batchs
+    return predict
--test_all_data_in_one_period=1 \		# Whether to use all data in every test
--use_gpu=0 \							# Whether to use GPU
--trainer_count=1 \						# Number of CPU or GPU
--num_passes=100 \						# Passes for training (One pass uses all data.)
--save_dir=$output \					# Path to saved model
-2>&1 | tee $log
-python -m paddle.utils.plotcurve -i $log > plot.png
 ```
-After configuring parameters, execute `./train.sh`. Training log is as follows.
+PaddlePaddle provides a special layer `layer.data` for reading data. Let us create a data layer for reading images and connect it to a classification network created using one of the above three functions.  We also need a cost layer for training the model.
-```
+```python
-I0117 12:52:29.628617  4538 TrainerInternal.cpp:165]  Batch=100 samples=12800 AvgCost=2.63996 CurrentCost=2.63996 Eval: classification_error_evaluator=0.241172  CurrentEval: classification_error_evaluator=0.241172 
+paddle.init(use_gpu=False, trainer_count=1)
-.........
-I0117 12:52:29.768741  4538 TrainerInternal.cpp:165]  Batch=200 samples=25600 AvgCost=1.74027 CurrentCost=0.840582 Eval: classification_error_evaluator=0.185234  CurrentEval: classification_error_evaluator=0.129297 
-.........
-I0117 12:52:29.916970  4538 TrainerInternal.cpp:165]  Batch=300 samples=38400 AvgCost=1.42119 CurrentCost=0.783026 Eval: classification_error_evaluator=0.167786  CurrentEval: classification_error_evaluator=0.132891 
-.........
-I0117 12:52:30.061213  4538 TrainerInternal.cpp:165]  Batch=400 samples=51200 AvgCost=1.23965 CurrentCost=0.695054 Eval: classification_error_evaluator=0.160039  CurrentEval: classification_error_evaluator=0.136797 
-......I0117 12:52:30.223270  4538 TrainerInternal.cpp:181]  Pass=0 Batch=469 samples=60000 AvgCost=1.1628 Eval: classification_error_evaluator=0.156233 
-I0117 12:52:30.366894  4538 Tester.cpp:109]  Test samples=10000 cost=0.50777 Eval: classification_error_evaluator=0.0978 
-```
-2.Use `plot_cost.py` to plot error curve during training.
-```bash
+images = paddle.layer.data(
-python plot_cost.py softmax_train.log            
+    name='pixel', type=paddle.data_type.dense_vector(784))
-```
+label = paddle.layer.data(
+    name='label', type=paddle.data_type.integer_value(10))
-3.Use `evaluate.py ` to select the best trained model.
+predict = softmax_regression(images)
+#predict = multilayer_perceptron(images) # uncomment for MLP
+#predict = convolutional_neural_network(images) # uncomment for LeNet5
-```bash
+cost = paddle.layer.classification_cost(input=predict, label=label)
-python evaluate.py softmax_train.log
 ```
-### Training Results for Softmax Regression
+Now, it is time to specify training parameters. The number 0.9 in the following `Momentum` optimizer means that 90% of the current momentum comes from the momentum of the previous iteration.
-<p align="center">
+```python
-<img src="image/softmax_train_log_en.png" width="400px"><br/>
+parameters = paddle.parameters.create(cost)
-Fig. 7 Softmax regression error curve<br/>
-</p>
-Evaluation results of the models:
+optimizer = paddle.optimizer.Momentum(
+    learning_rate=0.1 / 128.0,
+    momentum=0.9,
+    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-```text
+trainer = paddle.trainer.SGD(cost=cost,
-Best pass is 00013, testing Avgcost is 0.484447
+                             parameters=parameters,
-The classification accuracy is 90.01%
+                             update_equation=optimizer)
 ```
-From the evaluation results, the best pass for softmax regression model is pass-00013, where the classification accuracy is 90.01%, and the last pass-00099 has an accuracy of 89.3%. From Fig. 7, we also see that the best accuracy may not appear in the last pass. This is because during training, the model may already arrive at a local optimum, and it just swings around nearby in the following passes, or it gets a lower local optimum.
+Then we specify the training data `paddle.dataset.mnist.train()` and the testing data `paddle.dataset.mnist.test()`. These two functions are *reader creators*: once called, each returns a *reader*. A reader is a Python function which, once called, returns a Python generator that yields instances of data.
-### Results of Multilayer Perceptron
+Here `shuffle` is a reader decorator, which takes a reader A as its parameter and returns a new reader B, where B calls A to read `buf_size` data instances at a time into a buffer, then shuffles and yields the instances in the buffer. If you want more thoroughly shuffled data, try using a larger buffer size.
-<p align="center">
+`batch` is a special decorator, whose input is a reader and output is a *batch reader*, which doesn't yield an instance at a time, but a minibatch.
-<img src="image/mlp_train_log_en.png" width="400px"><br/>
-Fig. 8. Multilayer Perceptron error curve<br/>
-</p>
-Evaluation results of the models：
+```python
+lists = []
-```text
-Best pass is 00085, testing Avgcost is 0.164746
+def event_handler(event):
-The classification accuracy is 94.95%
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 100 == 0:
+            print "Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+    if isinstance(event, paddle.event.EndPass):
+        result = trainer.test(reader=paddle.reader.batched(
+            paddle.dataset.mnist.test(), batch_size=128))
+        print "Test with Pass %d, Cost %f, %s\n" % (
+            event.pass_id, result.cost, result.metrics)
+        lists.append((event.pass_id, result.cost,
+                      result.metrics['classification_error_evaluator']))
+trainer.train(
+    reader=paddle.reader.batched(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=8192),
+        batch_size=128),
+    event_handler=event_handler,
+    num_passes=100)
 ```
-From the evaluation results, the final training accuracy is 94.95%. It is significantly better than the softmax regression model. This is because the softmax regression is simple, and it cannot fit complex data. The Multilayer Perceptron with hidden layers has better capacity to fit complex data than the softmax regression.
+During training, `trainer.train` invokes `event_handler` for certain events. This gives us a chance to print the training progress.
-### Training results for Convolutional Neural Network
-<p align="center">
-<img src="image/cnn_train_log_en.png" width="400px"><br/>
-Fig. 9. Convolutional Neural Network error curve<br/>
-</p>
-Results of model evaluation：
-```text
-Best pass is 00076, testing Avgcost is 0.0244684
-The classification accuracy is 99.20%
 ```
+# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
-From the evaluation result, the best accuracy of Convolutional Neural Network is 99.20%. So for image classification, a Convolutional Neural Network has better recognition results than a fully connected network. This is related to the local connection and parameter sharing of convolutional layers. In Fig. 9, the Convolutional Neural Network achieves good results in early steps, which indicates that it converges faster.
+# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
+# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
-## Application Model
+# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
+# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
-### Prediction Commands and Results
+# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
-Script `predict.py` can make prediction for trained models. For example, in softmax regression:
-```bash
-python predict.py -c mnist_model.py -d data/raw_data/ -m softmax_mnist_model/pass-00047
 ```
- -c sets model architecture
+After the training, we can check the model's prediction accuracy.
- -d sets data for prediction
- -m sets model parameters, here the best trained model is used for prediction
-Follow the instructions to input image ID for prediction. The classifier can output probabilities for each digit, predictions with the highest probability, and ground truth label.
 ```
-Input image_id [0~9999]: 3
+# find the best pass
-Predicted probability of each digit:
+best = sorted(lists, key=lambda list: float(list[1]))[0]
-[[  1.00000000e+00   1.60381094e-28   1.60381094e-28   1.60381094e-28
+print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    1.60381094e-28   1.60381094e-28   1.60381094e-28   1.60381094e-28
+print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
-    1.60381094e-28   1.60381094e-28]]
-Predict Number: 0 
-Actual Number: 0
 ```
-From the result, this classifier recognizes the digit on the third image as digit 0 with near to 100% probability. This predicted result is consistent with the ground truth label.
+Usually, with MNIST data, the softmax regression model can get accuracy around 92.34%, MLP can get about 97.66%, and convolution network can get up to around 99.20%.  Convolution layers have been widely considered a great invention for image processing.
 ## Conclusion
 This tutorial describes a few basic Deep Learning models viz. Softmax regression, Multilayer Perceptron Network and Convolutional Neural Network. The subsequent tutorials will derive more sophisticated models from these. So it is crucial to understand these models for future learning. When our model evolved from a simple softmax regression to slightly complex Convolutional Neural Network, the recognition accuracy on the MNIST data set achieved large improvement in accuracy. This is due to the Convolutional layers' local connections and parameter sharing. While learning new models in the future, we encourage the readers to understand the key ideas that lead a new model to improve results of an old one. Moreover, this tutorial introduced the basic flow of PaddlePaddle model design, starting with a dataprovider, model layer construction, to final training and prediction. Readers can leverage the flow used in this MNIST handwritten digit classification example and experiment with different data and network architectures to train models for classification tasks of their choice.

--- a/recognize_digits/index.html
+++ b/recognize_digits/index.html
@@ -83,7 +83,7 @@ $$ y_i = softmax(\sum_j W_{i,j}x_j + b_i) $$
 $$  crossentropy(label, y) = -\sum_i label_ilog(y_i) $$
-图2为softmax回归的网络图，图中权重用黑线表示、偏置用红线表示、+1代表偏置参数的系数为1。
+图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
 <p align="center">
 <img src="image/softmax_regression.png" width=400><br/>
@@ -99,7 +99,7 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 3.  最后，再经过输出层，得到的$Y=softmax(W_3H_2 + b_3)$，即为最后的分类结果向量。
-图3为多层感知器的网络结构图，图中权重用黑线表示、偏置用红线表示、+1代表偏置参数的系数为1。
+图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
 <p align="center">
 <img src="image/mlp.png" width=500><br/>
@@ -236,20 +236,19 @@ def convolutional_neural_network(img):
 接着，通过`layer.data`调用来获取数据，然后调用分类器（这里我们提供了三个不同的分类器）得到分类结果。训练时，对该结果计算其损失函数，分类问题常常选择交叉熵损失函数。
 ```python
-def main():
+# 该模型运行在单个CPU上
-    # 该模型运行在单个CPU上
+paddle.init(use_gpu=False, trainer_count=1)
-    paddle.init(use_gpu=False, trainer_count=1)
-    images = paddle.layer.data(
+images = paddle.layer.data(
    name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
+label = paddle.layer.data(
    name='label', type=paddle.data_type.integer_value(10))
-    predict = softmax_regression(images) # Softmax回归
+predict = softmax_regression(images) # Softmax回归
-    #predict = multilayer_perceptron(images) #多层感知器
+#predict = multilayer_perceptron(images) #多层感知器
-    #predict = convolutional_neural_network(images) #LeNet5卷积神经网络
+#predict = convolutional_neural_network(images) #LeNet5卷积神经网络
-    cost = paddle.layer.classification_cost(input=predict, label=label)
+cost = paddle.layer.classification_cost(input=predict, label=label)
 ```
 然后，指定训练相关的参数。
@@ -258,24 +257,28 @@ def main():
 - 正则化（regularization）： 是防止网络过拟合的一种手段，此处采用L2正则化。
 ```python
-    parameters = paddle.parameters.create(cost)
+parameters = paddle.parameters.create(cost)
-    optimizer = paddle.optimizer.Momentum(
+optimizer = paddle.optimizer.Momentum(
    learning_rate=0.1 / 128.0,
    momentum=0.9,
    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-    trainer = paddle.trainer.SGD(cost=cost,
+trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)
 ```
-下一步，我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集，每次训练使用的数据为128条。
+下一步，我们开始训练过程。`paddle.dataset.mnist.train()`和`paddle.dataset.mnist.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python yield generator。
+下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B —— reader B 每次读入`buf_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
+`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader —— 在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
 ```python
-    lists = []
+lists = []
-    def event_handler(event):
+def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 100 == 0:
            print "Pass %d, Batch %d, Cost %f, %s" % (
@@ -288,7 +291,7 @@ def main():
        lists.append((event.pass_id, result.cost,
                      result.metrics['classification_error_evaluator']))
-    trainer.train(
+trainer.train(
    reader=paddle.reader.batched(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=8192),
@@ -299,43 +302,16 @@ def main():
 训练过程是完全自动的，event_handler里打印的日志类似如下所示：
-```python
-    # Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
-    # Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
-    # Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
-    # Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
-    # Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
-    # Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
-```
-最后，选出最佳模型，并评估其效果。
-```python
-    # find the best pass
-    best = sorted(lists, key=lambda list: float(list[1]))[0]
-    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
-```
- softmax回归模型：分类效果最好的时候是pass-34，分类准确率为92.34%。
-```python
-    # Best pass is 34, testing Avgcost is 0.275004139346
-    # The classification accuracy is 92.34%
 ```
+# Pass 0, Batch 0, Cost 2.780790, {'classification_error_evaluator': 0.9453125}
- 多层感知器：最终训练的准确率为97.66%，相比于softmax回归模型有了显著的提升。原因是softmax回归模型较为简单，无法拟合更为复杂的数据，而加入了隐藏层之后的多层感知器则具有更强的拟合能力。
+# Pass 0, Batch 100, Cost 0.635356, {'classification_error_evaluator': 0.2109375}
+# Pass 0, Batch 200, Cost 0.326094, {'classification_error_evaluator': 0.1328125}
-```python
+# Pass 0, Batch 300, Cost 0.361920, {'classification_error_evaluator': 0.1015625}
-    # Best pass is 85, testing Avgcost is 0.0784368447196
+# Pass 0, Batch 400, Cost 0.410101, {'classification_error_evaluator': 0.125}
-    # The classification accuracy is 97.66%
+# Test with Pass 0, Cost 0.326659, {'classification_error_evaluator': 0.09470000118017197}
 ```
- 卷积神经网络：最好分类准确率达到惊人的99.20%。说明对于图像问题而言，卷积神经网络能够比一般的全连接网络达到更好的识别效果，而这与卷积层具有局部连接和共享权重的特性是分不开的。同时，从训练日志中可以看到，卷积神经网络在很早的时候就能达到很好的效果，说明其收敛速度非常快。
+训练之后，检查模型的预测准确度。用 MNIST 训练的时候，一般 softmax回归模型的分类准确率约为 92.34%，多层感知器为97.66%，卷积神经网络可以达到 99.20%。
-```python
-    # Best pass is 76, testing Avgcost is 0.0244684
-    # The classification accuracy is 99.20%
-```
 ## 总结

--- a/recognize_digits/train.py
+++ b/recognize_digits/train.py
+import paddle.v2 as paddle
+def softmax_regression(img):
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+def multilayer_perceptron(img):
+    """Build a multilayer perceptron on top of `img`: two ReLU hidden layers
+    (128 and 64 units) followed by a 10-way Softmax output layer."""
+    # The first fully-connected hidden layer
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected hidden layer and its activation function
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The third fully-connected layer; note that the output size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+def convolutional_neural_network(img):
+    """Build a convolutional classifier on top of `img`.
+
+    Two conv+pool stages (20 and 50 filters of size 5, each followed by 2x2
+    pooling with stride 2, Tanh activation), one 128-unit Tanh
+    fully-connected layer, and a 10-way Softmax output layer.
+    """
+    # First conv+pool stage: reads the single-channel input image
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # Second conv+pool stage: its 20 input channels match the 20 filters
+    # produced by the first stage
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The fully-connected hidden layer
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax output layer; note that the output size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+# Initialize PaddlePaddle to run on CPU with a single trainer.
+paddle.init(use_gpu=False, trainer_count=1)
+# Define the network topology: a 784-dimensional dense input named 'pixel'
+# and an integer label named 'label' with 10 possible values.
+images = paddle.layer.data(
+    name='pixel', type=paddle.data_type.dense_vector(784))
+label = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10))
+# The prediction network can be built in different ways. Choose one by
+# uncommenting the corresponding line (and commenting out the others).
+predict = softmax_regression(images)
+#predict = multilayer_perceptron(images)
+#predict = convolutional_neural_network(images)
+cost = paddle.layer.classification_cost(input=predict, label=label)
+# Create the trainable parameters for every layer reachable from the cost.
+parameters = paddle.parameters.create(cost)
+# NOTE(review): the 128 in the learning rate and L2 rate presumably matches
+# the batch_size=128 used for training below -- confirm before changing
+# either value independently.
+optimizer = paddle.optimizer.Momentum(
+    learning_rate=0.1 / 128.0,
+    momentum=0.9,
+    regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+trainer = paddle.trainer.SGD(cost=cost,
+                             parameters=parameters,
+                             update_equation=optimizer)
+# Per-pass test results, appended by event_handler as
+# (pass_id, test cost, test classification error) tuples.
+lists = []
+def event_handler(event):
+    """Training callback: log progress and evaluate after each pass.
+
+    On an EndIteration event whose batch_id is a multiple of 100, print the
+    pass id, batch id, cost and metrics. On an EndPass event, run
+    `trainer.test` on the batched MNIST test reader, print the result, and
+    append (pass_id, cost, classification error) to the module-level `lists`.
+    """
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 100 == 0:
+            print "Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics)
+    if isinstance(event, paddle.event.EndPass):
+        # `trainer` and `lists` are module-level names defined before
+        # training starts.
+        result = trainer.test(reader=paddle.reader.batched(
+            paddle.dataset.mnist.test(), batch_size=128))
+        print "Test with Pass %d, Cost %f, %s\n" % (event.pass_id, result.cost,
+                                                    result.metrics)
+        lists.append((event.pass_id, result.cost,
+                      result.metrics['classification_error_evaluator']))
+trainer.train(
+    reader=paddle.reader.batched(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=8192),
+        batch_size=128),
+    event_handler=event_handler,
+    num_passes=100)
+# find the best pass
+best = sorted(lists, key=lambda list: float(list[1]))[0]
+print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
--- a/recommender_system/index.en.html
+++ b/recommender_system/index.en.html
@@ -111,7 +111,7 @@ Given the feature vectors of users and movies, we compute the relevance using co
 <p align="center">
-<img src="image/rec_regression_network.png" width="90%" ><br/>
+<img src="image/rec_regression_network_en.png" width="90%" ><br/>
 Figure 3. A hybrid recommendation model.
 </p> 

--- a/understand_sentiment/README.md
+++ b/understand_sentiment/README.md
--- a/understand_sentiment/train.py
+++ b/understand_sentiment/train.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import paddle.trainer_config_helpers.attrs as attrs
+from paddle.trainer_config_helpers.poolings import MaxPooling
+import paddle.v2 as paddle
+def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+    conv_3 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=3, hidden_size=hid_dim)
+    conv_4 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=4, hidden_size=hid_dim)
+    output = paddle.layer.fc(input=[conv_3, conv_4],
+                             size=class_dim,
+                             act=paddle.activation.Softmax())
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    """
+    A Wrapper for sentiment classification task.
+    This network uses bi-directional recurrent network,
+    consisting three LSTM layers. This configure is referred to
+    the paper as following url, but use fewer layrs.
+        http://www.aclweb.org/anthology/P15-1109
+    input_dim: here is word dictionary dimension.
+    class_dim: number of categories.
+    emb_dim: dimension of word embedding.
+    hid_dim: dimension of hidden layer.
+    stacked_num: number of stacked lstm-hidden layer.
+    """
+    assert stacked_num % 2 == 1
+    layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5)
+    fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3)
+    lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.)
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.)
+    relu = paddle.activation.Relu()
+    linear = paddle.activation.Linear()
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+    fc1 = paddle.layer.fc(input=emb,
+                          size=hid_dim,
+                          act=linear,
+                          bias_attr=bias_attr)
+    lstm1 = paddle.layer.lstmemory(
+        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+    inputs = [fc1, lstm1]
+    for i in range(2, stacked_num + 1):
+        fc = paddle.layer.fc(input=inputs,
+                             size=hid_dim,
+                             act=linear,
+                             param_attr=para_attr,
+                             bias_attr=bias_attr)
+        lstm = paddle.layer.lstmemory(
+            input=fc,
+            reverse=(i % 2) == 0,
+            act=relu,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        inputs = [fc, lstm]
+    fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling())
+    lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling())
+    output = paddle.layer.fc(input=[fc_last, lstm_last],
+                             size=class_dim,
+                             act=paddle.activation.Softmax(),
+                             bias_attr=bias_attr,
+                             param_attr=para_attr)
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+if __name__ == '__main__':
+    # Initialize PaddlePaddle to run on CPU.
+    paddle.init(use_gpu=False)
+    # Data: load the IMDB word dictionary and build batched readers.
+    print 'load dictionary...'
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+    # Training samples are shuffled through a 1000-sample buffer and grouped
+    # into batches of 100; test samples are batched without shuffling.
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=100)
+    test_reader = paddle.batch(
+        lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
+    # NOTE(review): presumably maps each data-layer name ("word", "label") to
+    # the index of its field within a reader sample -- confirm against the
+    # paddle.v2 trainer API.
+    reader_dict = {'word': 0, 'label': 1}
+    # Network config.
+    # Please choose the way to build the network
+    # by uncommenting the corresponding line.
+    cost = convolution_net(dict_dim, class_dim=class_dim)
+    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+    # Create parameters.
+    parameters = paddle.parameters.create(cost)
+    # Create the optimizer: Adam with L2 regularization and model averaging.
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=2e-3,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+    # Handler for end-of-batch and end-of-pass events. It refers to `trainer`,
+    # which is assigned further down; the name is only looked up when the
+    # handler is invoked during trainer.train().
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                # Print a progress dot for every batch that is not logged.
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            # Evaluate on the test reader at the end of every pass.
+            result = trainer.test(reader=test_reader, reader_dict=reader_dict)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+    # Create the trainer and train for two passes.
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=adam_optimizer)
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        reader_dict=reader_dict,
+        num_passes=2)
--- a/word2vec/index.en.html
+++ b/word2vec/index.en.html
@@ -194,7 +194,7 @@ As illustrated in the figure above, skip-gram model maps the word embedding of t
 ## Model Configuration
 <p align="center">	
-	<img src="image/ngram.png" width=400><br/>
+	<img src="image/ngram.en.png" width=400><br/>
 	Figure 5. N-gram neural network model in model configuration
 </p>

--- a/word2vec/index.html
+++ b/word2vec/index.html
@@ -182,7 +182,7 @@ CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去
 ## 数据准备
-### 数据介绍与下载
+### 数据介绍
 本教程使用Penn Tree Bank (PTB)数据集。PTB数据集较小，训练速度快，应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下：
@@ -206,109 +206,24 @@ CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去
 </table>
 </p>
-执行以下命令，可下载该数据集，并分别将训练数据和验证数据输入`train.list`和`test.list`文件中，供PaddlePaddle训练时使用。
-```bash
+### 数据预处理
-./data/getdata.sh
-```
+本章训练的是5-gram模型，表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`，自动做数据的下载与预处理，方便大家使用。
-### 提供数据给PaddlePaddle
+预处理会把数据集中的每一句话前后加上开始符号`<s>`以及结束符号`<e>`。然后依据窗口大小（本教程中为5），从头到尾每次向右滑动窗口并生成一条数据。
-1. 使用initializer函数进行dataprovider的初始化，包括字典的建立（build_dict函数中）和PaddlePaddle输入字段的格式定义。注意：这里N为n-gram模型中的`n`, 本章代码中，定义$N=5$, 表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。大家也可以根据自己的数据和需求自行调整N，但调整的同时要在模型配置文件中加入/减少相应输入字段。
-    ```python
-    from paddle.trainer.PyDataProvider2 import *
-    import collections
-    import logging
-    import pdb
-    logging.basicConfig(
-        format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-    logger = logging.getLogger('paddle')
-    logger.setLevel(logging.INFO)
-    N = 5  # Ngram
-    cutoff = 50  # select words with frequency > cutoff to dictionary
-    def build_dict(ftrain, fdict):
-    	sentences = []
-        with open(ftrain) as fin:
-            for line in fin:
-                line = ['<s>'] + line.strip().split() + ['<e>']
-                sentences += line
-        wordfreq = collections.Counter(sentences)
-        wordfreq = filter(lambda x: x[1] > cutoff, wordfreq.items())
-        dictionary = sorted(wordfreq, key = lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*dictionary))
-        for word in words:
-            print >> fdict, word
-        word_idx = dict(zip(words, xrange(len(words))))
-        logger.info("Dictionary size=%s" %len(words))
-        return word_idx
-    def initializer(settings, srcText, dictfile, **xargs):
-        with open(dictfile, 'w') as fdict:
-            settings.dicts = build_dict(srcText, fdict)
-        input_types = []
-        for i in xrange(N):
-            input_types.append(integer_value(len(settings.dicts)))
-        settings.input_types = input_types
-    ```
-2. 使用process函数中将数据逐一提供给PaddlePaddle。具体来说，将每句话前面补上N-1个开始符号 `<s>`, 末尾补上一个结束符号 `<e>`，然后以N为窗口大小，从头到尾每次向右滑动窗口并生成一条数据。
-    ```python
-    @provider(init_hook=initializer)
-    def process(settings, filename):
-        UNKID = settings.dicts['<unk>']
-        with open(filename) as fin:
-            for line in fin:
-                line = ['<s>']*(N-1)  + line.strip().split() + ['<e>']
-                line = [settings.dicts.get(w, UNKID) for w in line]
-                for i in range(N, len(line) + 1):
-                    yield line[i-N: i]
-    ```
-    如"I have a dream" 一句提供了5条数据:
-    > `<s> <s> <s> <s> I` <br>
-    > `<s> <s> <s> I have` <br>
-    > `<s> <s> I have a`  <br>
-    > `<s> I have a dream` <br>
-    > `I have a dream <e>` <br>
-## 模型配置说明
-### 数据定义
-通过`define_py_data_sources2`函数从dataprovider中读入数据，其中args指定了训练文本(srcText)和词汇表(dictfile)。
-```python
+如"I have a dream that one day" 一句提供了5条数据：
-from paddle.trainer_config_helpers import *
-import math
-args = {'srcText': 'data/simple-examples/data/ptb.train.txt',
-        'dictfile': 'data/vocabulary.txt'}
-define_py_data_sources2(
-    train_list="data/train.list",
-    test_list="data/test.list",
-    module="dataprovider",
-    obj="process",
-    args=args)
-```
-### 算法配置
+```text
+<s> I have a dream
-在这里，我们指定了模型的训练参数, L2正则项系数、学习率和batch size。
+I have a dream that
+have a dream that one
-```python
+a dream that one day
-settings(
+dream that one day <e>
-    batch_size=100, regularization=L2Regularization(8e-4), learning_rate=3e-3)
 ```
-### 模型结构
+## 编程实现
 本配置的模型结构如下图所示：
@@ -317,94 +232,132 @@ settings(
 	图5. 模型配置中的N-gram神经网络模型
 </p>
-1. 定义参数维度和和数据输入。
+首先，加载所需要的包：
-    ```python
+```python
-    dictsize = 1953 # 字典大小
+import math
-    embsize = 32 # 词向量维度
+import paddle.v2 as paddle
-    hiddensize = 256 # 隐层维度
+```
-    firstword = data_layer(name = "firstw", size = dictsize)
+然后，定义参数：
-    secondword = data_layer(name = "secondw", size = dictsize)
+```python
-    thirdword = data_layer(name = "thirdw", size = dictsize)
+embsize = 32 # 词向量维度
-    fourthword = data_layer(name = "fourthw", size = dictsize)
+hiddensize = 256 # 隐层维度
-    nextword = data_layer(name = "fifthw", size = dictsize)
+N = 5 # 训练5-Gram
-    ```
+```
-2. 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
+接着，定义网络结构：
-	```python	
+- 将$w_t$之前的$n-1$个词 $w_{t-n+1},...,w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
-	def wordemb(inlayer):
-		wordemb = table_projection(
+```python	
-        input = inlayer,
+def wordemb(inlayer):
-        size = embsize,
+    wordemb = paddle.layer.table_projection(
-        param_attr=ParamAttr(name = "_proj",
+        input=inlayer,
-            initial_std=0.001, # 参数初始化标准差
+        size=embsize,
-            l2_rate= 0,))      # 词向量不需要稀疏化，因此其l2_rate设为0
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0, ))
    return wordemb
+```
+- 定义输入层接受的数据类型以及名字。
+```python
+def main():
+    paddle.init(use_gpu=False, trainer_count=1) # 初始化PaddlePaddle
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    # 每个输入层都接受整型数据，这些数据的范围是[0, dict_size)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
    Efirst = wordemb(firstword)
    Esecond = wordemb(secondword)
    Ethird = wordemb(thirdword)
    Efourth = wordemb(fourthword)
-	```
+```
-3. 接着，将这n-1个词向量经过concat_layer连接成一个大向量作为历史文本特征。
-	```python
-	contextemb = concat_layer(input = [Efirst, Esecond, Ethird, Efourth])
-	```
-4. 然后，将历史文本特征经过一个全连接得到文本隐层特征。
-    ```python
-	hidden1 = fc_layer(
-	        input = contextemb,
-	        size = hiddensize,
-	        act = SigmoidActivation(),
-	        layer_attr = ExtraAttr(drop_rate=0.5),
-	        bias_attr = ParamAttr(learning_rate = 2),
-	        param_attr = ParamAttr(
-	            initial_std = 1./math.sqrt(embsize*8),
-	            learning_rate = 1))
-    ```
-5. 最后，将文本隐层特征，再经过一个全连接，映射成一个$|V|$维向量，同时通过softmax归一化得到这`|V|`个词的生成概率。
-    ```python
-	# use context embedding to predict nextword
-	predictword = fc_layer(
-	        input = hidden1,
-	        size = dictsize,
-	        bias_attr = ParamAttr(learning_rate = 2),
-	        act = SoftmaxActivation())
-	```
-6. 网络的损失函数为多分类交叉熵，可直接调用`classification_cost`函数。
-	```python
-	cost = classification_cost(
-	        input = predictword,
-	        label = nextword)
-	# network input and output
-	outputs(cost)
-	```
-##训练模型
-模型训练命令为`./train.sh`。脚本内容如下，其中指定了总共需要执行30个pass。
-```bash
+- 将这n-1个词向量经过`paddle.layer.concat`连接成一个大向量作为历史文本特征。
-paddle train \
-       --config ngram.py \
+```python
-       --use_gpu=1 \
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
-       --dot_period=100 \
+```
-       --log_period=3000 \
-       --test_period=0 \
+- 将历史文本特征经过一个全连接得到文本隐层特征。
-       --save_dir=model \
-       --num_passes=30
+```python
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+```
+- 将文本隐层特征，再经过一个全连接，映射成一个$|V|$维向量，同时通过softmax归一化得到这`|V|`个词的生成概率。
+```python
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+```
+- 网络的损失函数为多分类交叉熵，可直接调用`classification_cost`函数。
+```python
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+```
+然后，指定训练相关的参数：
+- 训练方法（optimizer）： 代表训练过程在更新权重时采用动量优化器，本教程使用Adam优化器。
+- 训练速度（learning_rate）： 迭代的速度，与网络的训练收敛速度有关系。
+- 正则化（regularization）： 是防止网络过拟合的一种手段，此处采用L2正则化。
+```python
+    parameters = paddle.parameters.create(cost)
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+```
+下一步，我们开始训练过程。`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python generator。
+`paddle.batch`的输入是一个reader，输出是一个batched reader —— 在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个mini-batch。
+```python
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
 ```
-一个pass的训练日志如下所示：
+训练过程是完全自动的，event_handler里打印的日志类似如下所示：
 ```text
 .............................