Update quick start.

ISSUE=4602353 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56

Update quick start.
ISSUE=4602353 git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56
eef13ffb · dangqingqing · d8f30da7 · eef13ffb · eef13ffb · eef13ffb
9 changed files
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs):

 # Declaring a data provider for prediction. The difference with process
 # is that label is not generated.
-@provider(init_hook=predict_initializer)
+@provider(init_hook=predict_initializer, should_shuffle=False)
 def process_predict(settings, file_name):
    with open(file_name, 'r') as f:
        for line in f:

--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):
    ]


-@provider(init_hook=predict_initializer)
+@provider(init_hook=predict_initializer, should_shuffle=False)
 def process_predict(settings, file_name):
    with open(file_name, 'r') as f:
        for line in f:

--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -14,10 +14,10 @@
 # limitations under the License.
 set -e

-#cfg=trainer_config.lr.py
+cfg=trainer_config.lr.py
 #cfg=trainer_config.emb.py
 #cfg=trainer_config.cnn.py
-cfg=trainer_config.lstm.py
+#cfg=trainer_config.lstm.py
 model="output/pass-00003"
 paddle train \
    --config=$cfg \

--- a/demo/quick_start/preprocess.py
+++ b/demo/quick_start/preprocess.py
@@ -29,7 +29,6 @@ import gzip
 from subprocess import Popen, PIPE
 from optparse import OptionParser
 import json
-from bs4 import BeautifulSoup
 from multiprocessing import Queue
 from multiprocessing import Pool
 import multiprocessing
@@ -69,16 +68,6 @@ def parse(path):
        yield json.loads(l)
    g.close()

-'''
-def clean(review):
-    """
-    Clean input review: remove HTML, convert words to lower cases.
-    """
-    # Remove HTML
-    review_text = BeautifulSoup(review, "html.parser").get_text()
-    return review_text
-'''
-

 def tokenize(sentences):
    """
@@ -152,7 +141,7 @@ def save_batch(data_dir, num_tokenize, data_dir_dict):
 def parse_batch(data, num_tokenize):
    """
    parse data by batch
-    parse -> clean ->tokenize ->save
+    parse -> tokenize -> save
    """
    raw_txt = parse(data)
    neg, pos = [], []
@@ -160,7 +149,6 @@ def parse_batch(data, num_tokenize):
    sys.stderr.write("extract raw data\n")
    for l in raw_txt:
        rating = l["overall"]
-        #text = clean(l["reviewText"].lower()) # remove HTML
        text = l["reviewText"].lower()  # # convert words to lower case
        if rating == 5.0 and text:
            pos.append(text)
@@ -223,7 +211,6 @@ def main():
    pool.close()
    pool.join()

-    sys.stderr.write("clean data done.\n")
    file(os.path.join(os.path.dirname(data), 'labels.list'),
         'w').write('neg\t0\npos\t1\n')


--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -18,11 +18,13 @@
 # 3. distinct train set and test set.
 # 4. build dict

+set -e

-mkdir data/tmp
+mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle
 cd data/tmp
+echo 'uniq and shuffle...'
 cat pos_*|sort|uniq|shuf> pos.shuffed
 cat neg_*|sort|uniq|shuf> neg.shuffed


--- a/demo/quick_start/requirements.txt
+++ b/demo/quick_start/requirements.txt
-beautifulsoup4
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -16,7 +16,7 @@ set -e

 function get_best_pass() {
  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
+  sed  -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
  sort | head -n 1
 }


--- a/doc/demo/quick_start/index_en.md
+++ b/doc/demo/quick_start/index_en.md
@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
 ## Preprocess data into standardized format
 In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.

-`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below:
+`demo/quick_start` provides scripts for downloading and preprocessing the data, as shown below. Preprocessing takes several minutes (about 3 minutes on our machine).

 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-pip install -r requirements.txt
 ./preprocess.sh
 ```

@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati
 - batch_size = 1.
 - You need to specify the location of `test_list` in the test data.

+The results in `result.txt` are formatted as follows; each line corresponds to one sample.
+
+```
+predicted_label_id;probability_of_label_0 probability_of_label_1  # the first sample
+predicted_label_id;probability_of_label_0 probability_of_label_1  # the second sample
+```
+
+
 ```python
 is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None

--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -38,7 +38,6 @@
 ```bash
 cd demo/quick_start
 ./data/get_data.sh
-pip install -r requirements.txt
 ./preprocess.sh
 ```

@@ -411,6 +410,13 @@ mv rank-00000 result.txt
 与训练网络配置不同的是：无需label相关的层，指定outputs输出概率层(softmax输出)，
 指定batch_size=1，数据传输无需label数据，预测数据指定test_list的位置。

+预测结果以文本的形式保存在`result.txt`中，一行为一个样本，格式如下：
+
+```
+预测ID;ID为0的概率 ID为1的概率
+预测ID;ID为0的概率 ID为1的概率
+```
+
 ```
 is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None