From 2dc06a83be046d67938a483eec70dfe73c40e58c Mon Sep 17 00:00:00 2001
From: liaogang <liaogang@baidu.com>
Date: Tue, 7 Mar 2017 21:58:36 +0800
Subject: [PATCH] Update trainning

---
 recommender_system/README.en.md | 57 +++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/recommender_system/README.en.md b/recommender_system/README.en.md
index cc77222..4421be9 100644
--- a/recommender_system/README.en.md
+++ b/recommender_system/README.en.md
@@ -87,7 +87,7 @@ We use the [MovieLens ml-1m](http://files.grouplens.org/datasets/movielens/ml-1m
 help(paddle.v2.dataset.movielens)
 ```
 
-The raw `MoiveLens` contains movie ratings, relevant features form both movies and users.
+The raw `MovieLens` dataset contains movie ratings as well as relevant features of both movies and users.
 For instance, one movie's feature could be:
 
 ```python
@@ -252,7 +252,7 @@ The movie ID and the movie type are mapped to their corresponding hidden layers.
 inference = paddle.layer.cos_sim(a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
 ```
 
-进而，我们使用余弦相似度计算用户特征与电影特征的相似性。并将这个相似性拟合(回归)到用户评分上。
+Finally, we use cosine similarity to measure the similarity between the user features and the movie features, and fit (regress) this similarity to the user's rating.
 
 ```python
 cost = paddle.layer.regression_cost(
@@ -261,13 +261,11 @@ cost = paddle.layer.regression_cost(
         name='score', type=paddle.data_type.dense_vector(1)))
 ```
 
-至此，我们的优化目标就是这个网络配置中的cost了。
-
 ## Model Training
 
 ### Define Parameters
 
-First we define the model parameters according to the previous model configuration cost.
+First we define the model parameters according to the previous model configuration `cost`.
 
 ```python
 # Create parameters
@@ -290,18 +288,18 @@ trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,
 
 ### Training
 
-下面我们开始训练过程。
-我们直接使用Paddle提供的数据集读取程序。paddle.dataset.movielens.train()和paddle.dataset.movielens.test()分别做训练和预测数据集。并且通过reader_dict来指定每一个数据和data_layer的对应关系。
-例如，这里的reader_dict表示的是，对于数据层 user_id，使用了reader中每一条数据的第0个元素。gender_id数据层使用了第1个元素。以此类推。
-训练过程是完全自动的。我们可以使用event_handler来观察训练过程，或进行测试等。这里我们在event_handler里面绘制了训练误差曲线和测试误差曲线。并且保存了模型。
+`paddle.dataset.movielens.train` yields records during each pass; after shuffling, the records are grouped into batches that are fed to the trainer.
 
 ```python
-%matplotlib inline
+reader=paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=256)
+```
 
-import matplotlib.pyplot as plt
-from IPython import display
-import cPickle
+`feeding` specifies the correspondence between the fields of each yielded record and the `paddle.layer.data` layers. For instance, the first column of the data generated by `movielens.train` corresponds to the `user_id` feature.
 
+```python
 feeding = {
     'user_id': 0,
     'gender_id': 1,
@@ -312,12 +310,11 @@ feeding = {
     'movie_title': 6,
     'score': 7
 }
+```
 
-step=0
-
-train_costs=[],[]
-test_costs=[],[]
+The callback function `event_handler` is invoked by the trainer when training events occur and is used to track the training and testing process. Here it records the training and test costs and plots them as curves.
 
+```python
 def event_handler(event):
     global step
     global train_costs
@@ -327,13 +324,13 @@ def event_handler(event):
         if step % 10 == 0:  # every 10 batches, record a train cost
             train_costs[0].append(step)
             train_costs[1].append(event.cost)
-            
+
         if step % 1000 == 0: # every 1000 batches, record a test cost
             result = trainer.test(reader=paddle.batch(
                   paddle.dataset.movielens.test(), batch_size=256))
             test_costs[0].append(step)
             test_costs[1].append(result.cost)
-        
+
         if step % 100 == 0: # every 100 batches, update cost plot
             plt.plot(*train_costs)
             plt.plot(*test_costs)
@@ -342,15 +339,27 @@ def event_handler(event):
             display.display(plt.gcf())
             plt.gcf().clear()
         step += 1
+```
+
+Finally, we can invoke `trainer.train` to start training:
+
+```python
+%matplotlib inline
+
+import matplotlib.pyplot as plt
+from IPython import display
+import cPickle
+
+step=0
+
+train_costs=[],[]
+test_costs=[],[]
 
 trainer.train(
-    reader=paddle.batch(
-            paddle.reader.shuffle(
-            paddle.dataset.movielens.train(), buf_size=8192),
-                            batch_size=256),
+    reader=reader,
     event_handler=event_handler,
     feeding=feeding,
-    num_passes=2)
+    num_passes=200)
 ```
 
 ## Conclusion
-- 
GitLab