提交 eef13ffb 编写于 作者: D dangqingqing

Update quick start.

ISSUE=4602353

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1450 1ad973e4-5ce8-4261-8a94-b56d1f490c56
上级 d8f30da7
......@@ -75,7 +75,7 @@ def predict_initializer(settings, dictionary, **kwargs):
# Declaring a data provider for prediction. The difference from process
# is that no label is generated.
@provider(init_hook=predict_initializer)
@provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
......
......@@ -43,7 +43,7 @@ def predict_initializer(settings, dictionary, **kwargs):
]
@provider(init_hook=predict_initializer)
@provider(init_hook=predict_initializer, should_shuffle=False)
def process_predict(settings, file_name):
with open(file_name, 'r') as f:
for line in f:
......
......@@ -14,10 +14,10 @@
# limitations under the License.
set -e
#cfg=trainer_config.lr.py
cfg=trainer_config.lr.py
#cfg=trainer_config.emb.py
#cfg=trainer_config.cnn.py
cfg=trainer_config.lstm.py
#cfg=trainer_config.lstm.py
model="output/pass-00003"
paddle train \
--config=$cfg \
......
......@@ -29,7 +29,6 @@ import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from bs4 import BeautifulSoup
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing
......@@ -69,16 +68,6 @@ def parse(path):
yield json.loads(l)
g.close()
'''
def clean(review):
"""
Clean input review: remove HTML, convert words to lower cases.
"""
# Remove HTML
review_text = BeautifulSoup(review, "html.parser").get_text()
return review_text
'''
def tokenize(sentences):
"""
......@@ -152,7 +141,7 @@ def save_batch(data_dir, num_tokenize, data_dir_dict):
def parse_batch(data, num_tokenize):
"""
parse data by batch
parse -> clean ->tokenize ->save
parse -> tokenize -> save
"""
raw_txt = parse(data)
neg, pos = [], []
......@@ -160,7 +149,6 @@ def parse_batch(data, num_tokenize):
sys.stderr.write("extract raw data\n")
for l in raw_txt:
rating = l["overall"]
#text = clean(l["reviewText"].lower()) # remove HTML
text = l["reviewText"].lower() # # convert words to lower case
if rating == 5.0 and text:
pos.append(text)
......@@ -223,7 +211,6 @@ def main():
pool.close()
pool.join()
sys.stderr.write("clean data done.\n")
file(os.path.join(os.path.dirname(data), 'labels.list'),
'w').write('neg\t0\npos\t1\n')
......
......@@ -18,11 +18,13 @@
# 3. distinct train set and test set.
# 4. build dict
set -e
mkdir data/tmp
mkdir -p data/tmp
python preprocess.py -i data/reviews_Electronics_5.json.gz
# uniq and shuffle
cd data/tmp
echo 'uniq and shuffle...'
cat pos_*|sort|uniq|shuf> pos.shuffed
cat neg_*|sort|uniq|shuf> neg.shuffed
......
......@@ -16,7 +16,7 @@ set -e
function get_best_pass() {
cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \
sed -r 'N;s/Test.* cost=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sed -r 'N;s/Test.* classification_error_evaluator=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' |\
sort | head -n 1
}
......
......@@ -59,12 +59,11 @@ To build your text classification system, your code will need to perform five st
## Preprocess data into standardized format
In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
`demo/quick_start` provides scripts for downloading data and preprocessing data, as shown below:
`demo/quick_start` provides scripts for downloading and preprocessing the data, as shown below. Preprocessing takes several minutes (about 3 minutes on our machine).
```bash
cd demo/quick_start
./data/get_data.sh
pip install -r requirements.txt
./preprocess.sh
```
......@@ -432,6 +431,14 @@ There are several differences between training and inference network configurati
- batch_size = 1.
- You need to specify the location of `test_list` in the test data.
The results in `result.txt` are as follows; each line is one sample.
```
predicted_label_id;probability_of_label_0 probability_of_label_1 # the first sample
predicted_label_id;probability_of_label_0 probability_of_label_1 # the second sample
```
```python
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
......
......@@ -38,7 +38,6 @@
```bash
cd demo/quick_start
./data/get_data.sh
pip install -r requirements.txt
./preprocess.sh
```
......@@ -411,6 +410,13 @@ mv rank-00000 result.txt
与训练网络配置不同的是:无需label相关的层,指定outputs输出概率层(softmax输出),
指定batch_size=1,数据传输无需label数据,预测数据指定test_list的位置。
预测结果以文本的形式保存在`result.txt`中,一行为一个样本,格式如下:
```
预测ID;ID为0的概率 ID为1的概率
预测ID;ID为0的概率 ID为1的概率
```
```
is_predict = get_config_arg('is_predict', bool, False)
trn = 'data/train.list' if not is_predict else None
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册