提交 eef13ffb 编写于 作者: D dangqingqing

Update quick start.

ISSUE=4602353

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56
上级 d8f30da7
...@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs): ...@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs):
# Declaring a data provider for prediction. The difference with process # Declaring a data provider for prediction. The difference with process
# is that label is not generated. # is that label is not generated.
@provider(init_hook=predict_initializer) @provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name): def process_predict(settings, file_name):
with open(file_name, 'r') as f: with open(file_name, 'r') as f:
for line in f: for line in f:
......
...@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs): ...@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):
] ]
@provider(init_hook=predict_initializer) @provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name): def process_predict(settings, file_name):
with open(file_name, 'r') as f: with open(file_name, 'r') as f:
for line in f: for line in f:
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
# limitations under the License. # limitations under the License.
set -e set -e
#cfg=trainer_config.lr.py cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py #cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py #cfg=trainer_config.cnn.py
cfg=trainer_config.lstm.py #cfg=trainer_config.lstm.py
model="output/pass-00003" model="output/pass-00003"
paddle train \ paddle train \
--config=$cfg \ --config=$cfg \
......
...@@ -29,7 +29,6 @@ import gzip ...@@ -29,7 +29,6 @@ import gzip
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from optparse import OptionParser from optparse import OptionParser
import json import json
from bs4 import BeautifulSoup
from multiprocessing import Queue from multiprocessing import Queue
from multiprocessing import Pool from multiprocessing import Pool
import multiprocessing import multiprocessing
...@@ -69,16 +68,6 @@ def parse(path): ...@@ -69,16 +68,6 @@ def parse(path):
yield json.loads(l) yield json.loads(l)
g.close() g.close()
'''
def clean(review):
"""
Clean input review: remove HTML, convert words to lower cases.
"""
# Remove HTML
review_text = BeautifulSoup(review, "html.parser").get_text()
return review_text
'''
def tokenize(sentences): def tokenize(sentences):
""" """
...@@ -152,7 +141,7 @@ def save_batch(data_dir, num_tokenize, data_dir_dict): ...@@ -152,7 +141,7 @@ def save_batch(data_dir, num_tokenize, data_dir_dict):
def parse_batch(data, num_tokenize): def parse_batch(data, num_tokenize):
""" """
parse data by batch parse data by batch
parse -> clean ->tokenize ->save parse -> tokenize -> save
""" """
raw_txt = parse(data) raw_txt = parse(data)
neg, pos = [], [] neg, pos = [], []
...@@ -160,7 +149,6 @@ def parse_batch(data, num_tokenize): ...@@ -160,7 +149,6 @@ def parse_batch(data, num_tokenize):
sys.stderr.write("extract raw data\n") sys.stderr.write("extract raw data\n")
for l in raw_txt: for l in raw_txt:
rating = l["overall"] rating = l["overall"]
#text = clean(l["reviewText"].lower()) # remove HTML
text = l["reviewText"].lower() # # convert words to lower case text = l["reviewText"].lower() # # convert words to lower case
if rating == 5.0 and text: if rating == 5.0 and text:
pos.append(text) pos.append(text)
...@@ -223,7 +211,6 @@ def main(): ...@@ -223,7 +211,6 @@ def main():
pool.close() pool.close()
pool.join() pool.join()
sys.stderr.write("clean data done.\n")
file(os.path.join(os.path.dirname(data), 'labels.list'), file(os.path.join(os.path.dirname(data), 'labels.list'),
'w').write('neg\t0\npos\t1\n') 'w').write('neg\t0\npos\t1\n')
......
...@@ -18,11 +18,13 @@ ...@@ -18,11 +18,13 @@
# 3. distinct train set and test set. # 3. distinct train set and test set.
# 4. build dict # 4. build dict
set -e
mkdir data/tmp mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle # uniq and shuffle
cd data/tmp cd data/tmp
echo 'uniq and shuffle...'
cat pos_*|sort|uniq|shuf> pos.shuffed cat pos_*|sort|uniq|shuf> pos.shuffed
cat neg_*|sort|uniq|shuf> neg.shuffed cat neg_*|sort|uniq|shuf> neg.shuffed
......
...@@ -16,7 +16,7 @@ set -e ...@@ -16,7 +16,7 @@ set -e
function get_best_pass() { function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\ sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1 sort | head -n 1
} }
......
...@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st ...@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
## Preprocess data into standardized format ## Preprocess data into standardized format
In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product. In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below: `demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below. Preprocessing the data takes several minutes (about 3 minutes on our machine).
```bash ```bash
cd demo/quick_start cd demo/quick_start
./data/get_data.sh ./data/get_data.sh
pip install -r requirements.txt
./preprocess.sh ./preprocess.sh
``` ```
...@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati ...@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati
- batch_size = 1. - batch_size = 1.
- You need to specify the location of `test_list` in the test data. - You need to specify the location of `test_list` in the test data.
The results in `result.txt` are as follows; each line is one sample.
```
predicted_label_id;probability_of_label_0 probability_of_label_1 # the first sample
predicted_label_id;probability_of_label_0 probability_of_label_1 # the second sample
```
```python ```python
is_predict = get_config_arg('is_predict', bool, False) is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
......
...@@ -38,7 +38,6 @@ ...@@ -38,7 +38,6 @@
```bash ```bash
cd demo/quick_start cd demo/quick_start
./data/get_data.sh ./data/get_data.sh
pip install -r requirements.txt
./preprocess.sh ./preprocess.sh
``` ```
...@@ -411,6 +410,13 @@ mv rank-00000 result.txt ...@@ -411,6 +410,13 @@ mv rank-00000 result.txt
与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出), 与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出),
指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。 指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。
预测结果以文本的形式保存在`result.txt`中,一行为一个样本,格式如下:
```
预测ID;ID为0的概率 ID为1的概率
预测ID;ID为0的概率 ID为1的概率
```
``` ```
is_predict = get_config_arg('is_predict', bool, False) is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None trn = 'data/train.list' if not is_predict else None
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册