diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py index bbd3ecabaadbf5c856fb918a43ff19a1e860eff7..435e6d8175bd68ba82a1270a238a2abd9c963dd5 100644 --- a/demo/quick_start/dataprovider_bow.py +++ b/demo/quick_start/dataprovider_bow.py @@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs): # Declaring a data provider for prediction. The difference with process # is that label is not generated. -@provider(init_hook=predict_initializer) +@provider(init_hook=predict_initializer, should_shuffle=False) def process_predict(settings, file_name): with open(file_name, 'r') as f: for line in f: diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py index e9b17603818b3a43dcc1746d8ea582de4380e220..e5030c5e71aa582edcf72eeeda70e3c04d593673 100755 --- a/demo/quick_start/dataprovider_emb.py +++ b/demo/quick_start/dataprovider_emb.py @@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs): ] -@provider(init_hook=predict_initializer) +@provider(init_hook=predict_initializer, should_shuffle=False) def process_predict(settings, file_name): with open(file_name, 'r') as f: for line in f: diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh index f764e202446a4ebde4806fe9ae91811ece427dcb..b1e5e44f0b644547d6573ef635084b555237bea6 100755 --- a/demo/quick_start/predict.sh +++ b/demo/quick_start/predict.sh @@ -14,10 +14,10 @@ # limitations under the License. 
set -e -#cfg=trainer_config.lr.py +cfg=trainer_config.lr.py #cfg=trainer_config.emb.py #cfg=trainer_config.cnn.py -cfg=trainer_config.lstm.py +#cfg=trainer_config.lstm.py model="output/pass-00003" paddle train \ --config=$cfg \ diff --git a/demo/quick_start/preprocess.py b/demo/quick_start/preprocess.py index 1507ac48e83b154897b39225a8e0711e7fce022c..69fdbe44b5245bc2855847a1507e6eaed517eb96 100755 --- a/demo/quick_start/preprocess.py +++ b/demo/quick_start/preprocess.py @@ -29,7 +29,6 @@ import gzip from subprocess import Popen, PIPE from optparse import OptionParser import json -from bs4 import BeautifulSoup from multiprocessing import Queue from multiprocessing import Pool import multiprocessing @@ -69,16 +68,6 @@ def parse(path): yield json.loads(l) g.close() -''' -def clean(review): - """ - Clean input review: remove HTML, convert words to lower cases. - """ - # Remove HTML - review_text = BeautifulSoup(review, "html.parser").get_text() - return review_text -''' - def tokenize(sentences): """ @@ -152,7 +141,7 @@ def save_batch(data_dir, num_tokenize, data_dir_dict): def parse_batch(data, num_tokenize): """ parse data by batch - parse -> clean ->tokenize ->save + parse -> tokenize -> save """ raw_txt = parse(data) neg, pos = [], [] @@ -160,7 +149,6 @@ def parse_batch(data, num_tokenize): sys.stderr.write("extract raw data\n") for l in raw_txt: rating = l["overall"] - #text = clean(l["reviewText"].lower()) # remove HTML text = l["reviewText"].lower() # # convert words to lower case if rating == 5.0 and text: pos.append(text) @@ -223,7 +211,6 @@ def main(): pool.close() pool.join() - sys.stderr.write("clean data done.\n") file(os.path.join(os.path.dirname(data), 'labels.list'), 'w').write('neg\t0\npos\t1\n') diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh index bdc03f81b645f7b9d10f27d9a5050fc523a7f1b5..49141c69fa195ce2bc263ba979992b4da7163b6e 100755 --- a/demo/quick_start/preprocess.sh +++ b/demo/quick_start/preprocess.sh @@ -18,11 
+18,13 @@ # 3. distinct train set and test set. # 4. build dict +set -e -mkdir data/tmp +mkdir -p data/tmp python preprocess.py -i data/reviews_Electronics_5.json.gz # uniq and shuffle cd data/tmp +echo 'uniq and shuffle...' cat pos_*|sort|uniq|shuf> pos.shuffed cat neg_*|sort|uniq|shuf> neg.shuffed diff --git a/demo/quick_start/requirements.txt b/demo/quick_start/requirements.txt deleted file mode 100644 index c1f5f713cdafc4d0a904e0c158ccb14a0da6a445..0000000000000000000000000000000000000000 --- a/demo/quick_start/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -beautifulsoup4 diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh index ffe404de6b5227872637d08bc7287e9d67f56259..098fbb91389b89c8b69ccf2f5d308e4e715ac950 100755 --- a/demo/sentiment/test.sh +++ b/demo/sentiment/test.sh @@ -16,7 +16,7 @@ set -e function get_best_pass() { cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ + sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ sort | head -n 1 } diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index 41ee3d1abbc96e300b96643e1defbbff16a3d9f4..2a8c8a5dde726d538e7a5a9b7cf211a48a14d2ff 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st ## Preprocess data into standardized format In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product. 
-`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below: +`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below. Data preprocessing takes several minutes (about 3 minutes on our machine). ```bash cd demo/quick_start ./data/get_data.sh -pip install -r requirements.txt ./preprocess.sh ``` @@ -432,6 +431,14 @@ There are several differences between training and inference network configurati - batch_size = 1. - You need to specify the location of `test_list` in the test data. +The results in `result.txt` are formatted as follows; each line is one sample. + +``` +predicted_label_id;probability_of_label_0 probability_of_label_1 # the first sample +predicted_label_id;probability_of_label_0 probability_of_label_1 # the second sample +``` + + ```python is_predict = get_config_arg('is_predict', bool, False) trn = 'data/train.list' if not is_predict else None diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md index 84d796320f7ab675f529d7d0bda843711b688c67..e799e454f21432c0bd2b121152c137e5be7d1e8a 100644 --- a/doc_cn/demo/quick_start/index.md +++ b/doc_cn/demo/quick_start/index.md @@ -38,7 +38,6 @@ ```bash cd demo/quick_start ./data/get_data.sh -pip install -r requirements.txt ./preprocess.sh ``` @@ -411,6 +410,13 @@ mv rank-00000 result.txt 与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出), 指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。 +预测结果以文本的形式保存在`result.txt`中,一行为一个样本,格式如下: + +``` +预测ID;ID为0的概率 ID为1的概率 +预测ID;ID为0的概率 ID为1的概率 +``` + ``` is_predict = get_config_arg('is_predict', bool, False) trn = 'data/train.list' if not is_predict else None