Commit 5db9e590 authored by hupeng03

speedup preprocess in quick start

ISSUE=4575209

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1430 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent: 2afe6609
+# -*- coding: UTF-8 -*-
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,45 +14,71 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-'''
-1. remove HTML before tokenizing
+"""
+1. (remove HTML beforehand or not) tokenizing
 2. pos sample : rating score 5; neg sample: rating score 1-2.
-3. size of pos : neg = 1:1.
-4. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
-5. distinct train set and test set.
 
 Usage:
     python preprocess.py -i data_file [random seed]
-'''
-import sys,os
-import re
+"""
+import sys
+import os
 import operator
-import gzip,math
-import random
-import numpy as np
-from bs4 import BeautifulSoup
+import gzip
 from subprocess import Popen, PIPE
 from optparse import OptionParser
+import json
+from bs4 import BeautifulSoup
+from multiprocessing import Queue
+from multiprocessing import Pool
+import multiprocessing
+
+batch_size = 5000
+word_count = {}
+num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
+max_queue_size = 8
+parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
+tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
+
+
+def create_dict(data):
+    """
+    Create dictionary based on data, and saved in data_dir/dict.txt.
+    The first line is unk \t -1.
+    data: list, input data by batch.
+    """
+    for seq in data:
+        try:
+            for w in seq.lower().split():
+                if w not in word_count:
+                    word_count[w] = 1
+                else:
+                    word_count[w] += 1
+        except:
+            sys.stderr.write(seq + "\tERROR\n")
+
+
 def parse(path):
     """
     Open .gz file.
     """
+    sys.stderr.write(path)
     g = gzip.open(path, 'r')
     for l in g:
-        yield eval(l)
+        yield json.loads(l)
+    g.close()
 
+'''
 def clean(review):
     """
     Clean input review: remove HTML, convert words to lower cases.
     """
     # Remove HTML
     review_text = BeautifulSoup(review, "html.parser").get_text()
-    # Convert words to lower case
-    review_text = review_text.lower()
     return review_text
+'''
 
 def tokenize(sentences):
     """
@@ -68,119 +96,137 @@ def tokenize(sentences):
     toks = tok_text.split('\n')[:-1]
     return toks
 
-def create_dict(data, data_dir):
+
+def save_data(instance, data_dir, pre_fix, batch_num):
     """
-    Create dictionary based on data, and saved in data_dir/dict.txt.
-    The first line is unk \t -1.
-    data: list, input data.
-    data_dir: path to save dict.
+    save data by batch
     """
-    word_count = {}
-    for seq in data:
-        try:
-            for w in seq.lower().split():
-                if w not in word_count:
-                    word_count[w] = 1
-                else:
-                    word_count[w] += 1
-        except:
-            sys.stderr.write(seq+"\tERROR\n")
-    f = open(os.path.join(data_dir, 'dict.txt'), 'w')
+    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
+    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
+    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
+    file(file_name, 'w').write('\n'.join(lines) + '\n')
+
+
+def tokenize_batch(id):
+    """
+    tokenize data by batch
+    """
+    while True:
+        num_batch, instance, pre_fix = parse_queue.get()
+        if num_batch == -1:  ### parse_queue finished
+            tokenize_queue.put((-1, None, None))
+            sys.stderr.write("tokenize thread %s finish\n" % (id))
+            break
+        tokenize_instance = tokenize(instance)
+        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
+        sys.stderr.write('.')
+
+
+def save_batch(data_dir, num_tokenize, data_dir_dict):
+    """
+    save data by batch
+    build dict.txt
+    """
+    token_count = 0
+    while True:
+        num_batch, instance, pre_fix = tokenize_queue.get()
+        if num_batch == -1:
+            token_count += 1
+            if token_count == num_tokenize:  #### tokenize finished.
+                break
+            else:
+                continue
+        save_data(instance, data_dir, pre_fix, num_batch)
+        create_dict(instance)  ## update dict
+    sys.stderr.write("save file finish\n")
+    f = open(data_dir_dict, 'w')
     f.write('%s\t%s\n' % ('unk', '-1'))
-    for k, v in sorted(word_count.items(), key=operator.itemgetter(1),\
+    for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \
                        reverse=True):
         f.write('%s\t%s\n' % (k, v))
     f.close()
+    sys.stderr.write("build dict finish\n")
 
-def save_data(data, data_dir, prefix = ""):
-    file_name = os.path.join(data_dir, "%s.txt" % (prefix))
-    file(file_name,'w').write('\n'.join(data)+'\n')
-    file(os.path.join(data_dir, prefix+'.list'),'w').write('%s\n' % file_name)
 
-def split_data(raw_txt):
+def parse_batch(data, num_tokenize):
     """
-    Extract positive and negative sample.
+    parse data by batch
+    parse -> clean -> tokenize -> save
     """
-    pos = []
-    neg = []
+    raw_txt = parse(data)
+    neg, pos = [], []
     count = 0
-    dup_cnt = 0
-    sys.stderr.write("extract raw data")
+    sys.stderr.write("extract raw data\n")
     for l in raw_txt:
         rating = l["overall"]
-        text = clean(l["reviewText"])
+        #text = clean(l["reviewText"].lower()) # remove HTML
+        text = l["reviewText"].lower()  # convert words to lower case
         if rating == 5.0 and text:
             pos.append(text)
         if rating < 3.0 and text:
             neg.append(text)
+        if len(pos) == batch_size or len(neg) == batch_size:
+            if len(pos) == batch_size:
+                batch = pos
+                pre_fix = 'pos'
+            else:
+                batch = neg
+                pre_fix = 'neg'
+            parse_queue.put((count, batch, pre_fix))
             count += 1
-        if count % 20000==0:
-            sys.stderr.write(".")
-    sys.stderr.write("\n")
-    return pos, neg
+            if pre_fix == 'pos':
+                pos = []
+            else:
+                neg = []
+    if len(pos) > 0:
+        parse_queue.put((count, pos, 'pos'))
+        count += 1
+    if len(neg) > 0:
+        parse_queue.put((count, neg, 'neg'))
+        count += 1
+    for i in range(num_tokenize):
+        parse_queue.put((-1, None, None))  #### for tokenize's input finished
+    sys.stderr.write("parsing finish\n")
 
-def preprocess(pos_in, neg_in, data_dir, rand_seed):
-    # tokenize
-    sys.stderr.write("tokenize...\n")
-    tmppos = tokenize(pos_in)
-    tmpneg = tokenize(neg_in)
-    cnt = len(tmppos) + len(tmpneg)
-    # unique samples
-    tmppos = list(set(tmppos))
-    tmpneg = list(set(tmpneg))
-    dup_cnt = cnt - len(tmppos) - len(tmpneg)
-    sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" % (cnt, dup_cnt))
-    # keep same size of positive and negative sample
-    min_len = min(len(tmppos), len(tmpneg))
-    tmppos = tmppos[0:min_len]
-    tmpneg = tmpneg[0:min_len]
-    # create dictionary
-    sys.stderr.write("create dict with train and test data...\n")
-    all_data = tmppos + tmpneg
-    create_dict(all_data, data_dir)
-    # split into train set and test set
-    sys.stderr.write("split data...\n")
-    pos = ["1\t"+i for i in tmppos]
-    neg = ["0\t"+i for i in tmpneg]
-    random.seed(rand_seed)
-    random.shuffle(pos)
-    random.shuffle(neg)
-    # split into test set and train set
-    test_len = min(12500, int(min_len * 0.1))
-    test = pos[0:test_len] + neg[0:test_len]
-    train = pos[test_len:] + neg[test_len:]
-    # save data
-    sys.stderr.write("save data...\n")
-    save_data(train, data_dir, prefix = 'train')
-    save_data(test, data_dir, prefix = 'test')
-    file(os.path.join(data_dir,'labels.list'),'w').write('neg\t0\npos\t1\n')
 
 def option_parser():
     parser = OptionParser(usage="usage: python preprocess.py "\
                                 "-i data_path [options]")
-    parser.add_option("-i", "--data", action="store",
-                      dest="input", help="Input data path.")
-    parser.add_option("-s", "--seed", action="store",
-                      dest="seed", default=1024,
+    parser.add_option(
+        "-i", "--data", action="store", dest="input", help="Input data path.")
+    parser.add_option(
+        "-s",
+        "--seed",
+        action="store",
+        dest="seed",
+        default=1024,
         help="Set random seed.")
     return parser.parse_args()
 
+
 def main():
     reload(sys)
     sys.setdefaultencoding('utf-8')
     options, args = option_parser()
-    data=options.input
-    seed=options.seed
-    data_dir = os.path.dirname(data)
-    pos, neg = split_data(parse(data))
-    preprocess(pos, neg, data_dir, seed)
-    sys.stderr.write("Done.\n")
+    data = options.input
+    seed = options.seed
+    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
+    data_dir = os.path.join(os.path.dirname(data), 'tmp')
+    pool = Pool(processes=num_tokenize + 2)
+    pool.apply_async(parse_batch, args=(data, num_tokenize))
+    for i in range(num_tokenize):
+        pool.apply_async(tokenize_batch, args=(str(i), ))
+    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
+    pool.close()
+    pool.join()
+    sys.stderr.write("clean data done.\n")
+    file(os.path.join(os.path.dirname(data), 'labels.list'),
+         'w').write('neg\t0\npos\t1\n')
 
 if __name__ == '__main__':
     main()
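The reworked preprocess.py above parallelizes the slow tokenization step with a producer/consumer pipeline: parse_batch pushes batches of raw reviews onto parse_queue, num_tokenize worker processes tokenize them, and save_batch drains tokenize_queue, writes the batch files, and builds the dictionary. Below is a minimal, self-contained sketch of that pattern; it is not code from the commit, and the names (producer, worker, consumer) and the toy payload are placeholders for illustration. Like the script itself, it assumes a fork-based multiprocessing start (Linux), so the module-level queues are inherited by the pool workers.

# Sketch only: same pipeline shape as the new preprocess.py, with placeholder work.
import sys
import multiprocessing
from multiprocessing import Pool, Queue

num_workers = max(1, multiprocessing.cpu_count() - 2)
in_queue = Queue()
out_queue = Queue()


def producer():
    # Push a few toy batches, then one stop signal per worker.
    for batch_id in range(10):
        in_queue.put((batch_id, ["some text to process"] * 5))
    for _ in range(num_workers):
        in_queue.put((-1, None))


def worker():
    while True:
        batch_id, batch = in_queue.get()
        if batch_id == -1:  # producer finished
            out_queue.put((-1, None))
            break
        # Stand-in for the real tokenize() call.
        out_queue.put((batch_id, [s.upper() for s in batch]))


def consumer():
    finished = 0
    while finished < num_workers:
        batch_id, batch = out_queue.get()
        if batch_id == -1:  # one worker finished
            finished += 1
            continue
        sys.stderr.write("saved batch %d with %d items\n" % (batch_id, len(batch)))


if __name__ == '__main__':
    pool = Pool(processes=num_workers + 2)
    pool.apply_async(producer)
    for _ in range(num_workers):
        pool.apply_async(worker)
    pool.apply_async(consumer)
    pool.close()
    pool.join()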
-#!/bin/bash
+#!/bin/sh
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,10 +12,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+set -e
+
+# 1. size of pos : neg = 1:1.
+# 2. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
+# 3. distinct train set and test set.
+# 4. build dict
+
+mkdir data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
+
+# uniq and shuffle
+cd data/tmp
+cat pos_*|sort|uniq|shuf> pos.shuffed
+cat neg_*|sort|uniq|shuf> neg.shuffed
+min_len=`sed -n '$=' neg.shuffed`
+((test_num=$min_len/10))
+if [ $test_num -gt 12500 ];then
+    test_num=12500
+fi
+((train_num=$min_len-$test_num))
+
+head -n$train_num pos.shuffed >train.pos
+head -n$train_num neg.shuffed >train.neg
+tail -n$test_num pos.shuffed >test.pos
+tail -n$test_num neg.shuffed >test.neg
+
+cat train.pos train.neg|shuf>../train.txt
+cat test.pos test.neg|shuf>../test.txt
+
+cd -
+echo 'data/train.txt' > data/train.list
+echo 'data/test.txt' > data/test.list
+
 # use 30k dict
+rm -rf data/tmp
 mv data/dict.txt data/dict_all.txt
 cat data/dict_all.txt | head -n 30001 > data/dict.txt
+echo 'preprocess finished'
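As a worked example of the split rule in the comments above (illustrative numbers only): if neg.shuffed ends up with 200,000 reviews, test_num starts as 200,000 / 10 = 20,000, which exceeds the 12,500 cap and is reduced to 12,500; train_num is then 200,000 - 12,500 = 187,500, giving 187,500 training and 12,500 test reviews per class before the final shuffle into train.txt and test.txt.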