Commit 5db9e590 authored by hupeng03

speedup preprocess in quick start

ISSUE=4575209

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1430 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent: 2afe6609
+# -*- coding: UTF-8 -*-
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,45 +14,71 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-'''
-1. remove HTML before tokenizing
+"""
+1. (remove HTML beforehand or not) tokenizing
 2. pos sample : rating score 5; neg sample: rating score 1-2.
-3. size of pos : neg = 1:1.
-4. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
-5. distinct train set and test set.
 
 Usage:
     python preprocess.py -i data_file [random seed]
-'''
-import sys,os
-import re
+"""
+import sys
+import os
 import operator
-import gzip,math
-import random
-import numpy as np
-from bs4 import BeautifulSoup
+import gzip
 from subprocess import Popen, PIPE
 from optparse import OptionParser
+import json
+from bs4 import BeautifulSoup
+from multiprocessing import Queue
+from multiprocessing import Pool
+import multiprocessing
+
+batch_size = 5000
+word_count = {}
+num_tokenize = max(1, multiprocessing.cpu_count() - 2)  # parse + tokenize + save
+max_queue_size = 8
+parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
+tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
+
+
+def create_dict(data):
+    """
+    Create dictionary based on data, and saved in data_dir/dict.txt.
+    The first line is unk \t -1.
+    data: list, input data by batch.
+    """
+    for seq in data:
+        try:
+            for w in seq.lower().split():
+                if w not in word_count:
+                    word_count[w] = 1
+                else:
+                    word_count[w] += 1
+        except:
+            sys.stderr.write(seq + "\tERROR\n")
+
+
 def parse(path):
     """
     Open .gz file.
     """
+    sys.stderr.write(path)
     g = gzip.open(path, 'r')
     for l in g:
-        yield eval(l)
+        yield json.loads(l)
+    g.close()
 
+'''
 def clean(review):
     """
     Clean input review: remove HTML, convert words to lower cases.
     """
     # Remove HTML
     review_text = BeautifulSoup(review, "html.parser").get_text()
-    # Convert words to lower case
-    review_text = review_text.lower()
     return review_text
+'''
 
 def tokenize(sentences):
     """
@@ -68,119 +96,137 @@ def tokenize(sentences):
     toks = tok_text.split('\n')[:-1]
     return toks
 
-def create_dict(data, data_dir):
+
+def save_data(instance, data_dir, pre_fix, batch_num):
     """
-    Create dictionary based on data, and saved in data_dir/dict.txt.
-    The first line is unk \t -1.
-    data: list, input data.
-    data_dir: path to save dict.
+    save data by batch
     """
-    word_count = {}
-    for seq in data:
-        try:
-            for w in seq.lower().split():
-                if w not in word_count:
-                    word_count[w] = 1
-                else:
-                    word_count[w] += 1
-        except:
-            sys.stderr.write(seq+"\tERROR\n")
-    f = open(os.path.join(data_dir, 'dict.txt'), 'w')
+    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
+    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
+    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
+    file(file_name, 'w').write('\n'.join(lines) + '\n')
+
+
+def tokenize_batch(id):
+    """
+    tokenize data by batch
+    """
+    while True:
+        num_batch, instance, pre_fix = parse_queue.get()
+        if num_batch == -1:  ### parse_queue finished
+            tokenize_queue.put((-1, None, None))
+            sys.stderr.write("tokenize thread %s finish\n" % (id))
+            break
+        tokenize_instance = tokenize(instance)
+        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
+        sys.stderr.write('.')
+
+
+def save_batch(data_dir, num_tokenize, data_dir_dict):
+    """
+    save data by batch
+    build dict.txt
+    """
+    token_count = 0
+    while True:
+        num_batch, instance, pre_fix = tokenize_queue.get()
+        if num_batch == -1:
+            token_count += 1
+            if token_count == num_tokenize:  #### tokenize finished.
+                break
+            else:
+                continue
+        save_data(instance, data_dir, pre_fix, num_batch)
+        create_dict(instance)  ## update dict
+    sys.stderr.write("save file finish\n")
+    f = open(data_dir_dict, 'w')
     f.write('%s\t%s\n' % ('unk', '-1'))
-    for k, v in sorted(word_count.items(), key=operator.itemgetter(1),\
+    for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \
                        reverse=True):
         f.write('%s\t%s\n' % (k, v))
     f.close()
+    sys.stderr.write("build dict finish\n")
 
-def save_data(data, data_dir, prefix = ""):
-    file_name = os.path.join(data_dir, "%s.txt" % (prefix))
-    file(file_name,'w').write('\n'.join(data)+'\n')
-    file(os.path.join(data_dir, prefix+'.list'),'w').write('%s\n' % file_name)
 
-def split_data(raw_txt):
+def parse_batch(data, num_tokenize):
     """
-    Extract positive and negative sample.
+    parse data by batch
+    parse -> clean -> tokenize -> save
     """
-    pos = []
-    neg = []
+    raw_txt = parse(data)
+    neg, pos = [], []
     count = 0
-    dup_cnt = 0
-    sys.stderr.write("extract raw data")
+    sys.stderr.write("extract raw data\n")
     for l in raw_txt:
         rating = l["overall"]
-        text = clean(l["reviewText"])
+        #text = clean(l["reviewText"].lower()) # remove HTML
+        text = l["reviewText"].lower()  # convert words to lower case
         if rating == 5.0 and text:
             pos.append(text)
         if rating < 3.0 and text:
             neg.append(text)
+        if len(pos) == batch_size or len(neg) == batch_size:
+            if len(pos) == batch_size:
+                batch = pos
+                pre_fix = 'pos'
+            else:
+                batch = neg
+                pre_fix = 'neg'
+            parse_queue.put((count, batch, pre_fix))
             count += 1
-        if count % 20000==0:
-            sys.stderr.write(".")
-    sys.stderr.write("\n")
-    return pos, neg
+            if pre_fix == 'pos':
+                pos = []
+            else:
+                neg = []
+    if len(pos) > 0:
+        parse_queue.put((count, pos, 'pos'))
+        count += 1
+    if len(neg) > 0:
+        parse_queue.put((count, neg, 'neg'))
+        count += 1
+    for i in range(num_tokenize):
+        parse_queue.put((-1, None, None))  #### for tokenize's input finished
+    sys.stderr.write("parsing finish\n")
 
-def preprocess(pos_in, neg_in, data_dir, rand_seed):
-    # tokenize
-    sys.stderr.write("tokenize...\n")
-    tmppos = tokenize(pos_in)
-    tmpneg = tokenize(neg_in)
-    cnt = len(tmppos) + len(tmpneg)
-    # unique samples
-    tmppos = list(set(tmppos))
-    tmpneg = list(set(tmpneg))
-    dup_cnt = cnt - len(tmppos) - len(tmpneg)
-    sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" % (cnt, dup_cnt))
-    # keep same size of positive and negative sample
-    min_len = min(len(tmppos), len(tmpneg))
-    tmppos = tmppos[0:min_len]
-    tmpneg = tmpneg[0:min_len]
-    # create dictionary
-    sys.stderr.write("create dict with train and test data...\n")
-    all_data = tmppos + tmpneg
-    create_dict(all_data, data_dir)
-    # split into train set and test set
-    sys.stderr.write("split data...\n")
-    pos = ["1\t"+i for i in tmppos]
-    neg = ["0\t"+i for i in tmpneg]
-    random.seed(rand_seed)
-    random.shuffle(pos)
-    random.shuffle(neg)
-    # split into test set and train set
-    test_len = min(12500, int(min_len * 0.1))
-    test = pos[0:test_len] + neg[0:test_len]
-    train = pos[test_len:] + neg[test_len:]
-    # save data
-    sys.stderr.write("save data...\n")
-    save_data(train, data_dir, prefix = 'train')
-    save_data(test, data_dir, prefix = 'test')
-    file(os.path.join(data_dir,'labels.list'),'w').write('neg\t0\npos\t1\n')
 
 def option_parser():
     parser = OptionParser(usage="usage: python preprocess.py "\
                                 "-i data_path [options]")
-    parser.add_option("-i", "--data", action="store",
-                      dest="input", help="Input data path.")
-    parser.add_option("-s", "--seed", action="store",
-                      dest="seed", default=1024,
+    parser.add_option(
+        "-i", "--data", action="store", dest="input", help="Input data path.")
+    parser.add_option(
+        "-s",
+        "--seed",
+        action="store",
+        dest="seed",
+        default=1024,
         help="Set random seed.")
     return parser.parse_args()
 
+
 def main():
     reload(sys)
     sys.setdefaultencoding('utf-8')
     options, args = option_parser()
-    data=options.input
-    seed=options.seed
-    data_dir = os.path.dirname(data)
-    pos, neg = split_data(parse(data))
-    preprocess(pos, neg, data_dir, seed)
-    sys.stderr.write("Done.\n")
+    data = options.input
+    seed = options.seed
+    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
+    data_dir = os.path.join(os.path.dirname(data), 'tmp')
+    pool = Pool(processes=num_tokenize + 2)
+    pool.apply_async(parse_batch, args=(data, num_tokenize))
+    for i in range(num_tokenize):
+        pool.apply_async(tokenize_batch, args=(str(i), ))
+    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
+    pool.close()
+    pool.join()
+    sys.stderr.write("clean data done.\n")
+    file(os.path.join(os.path.dirname(data), 'labels.list'),
+         'w').write('neg\t0\npos\t1\n')
 
 if __name__ == '__main__':
     main()
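The reworked preprocess.py above parallelizes the slow tokenization step with a producer/consumer pipeline: parse_batch pushes batches of raw reviews onto parse_queue, num_tokenize worker processes tokenize them, and save_batch drains tokenize_queue, writes the batch files, and builds the dictionary. Below is a minimal, self-contained sketch of that pattern; it is not code from the commit, and the names (producer, worker, consumer) and the toy payload are placeholders for illustration. Like the script itself, it assumes a fork-based multiprocessing start (Linux), so the module-level queues are inherited by the pool workers.

# Sketch only: same pipeline shape as the new preprocess.py, with placeholder work.
import sys
import multiprocessing
from multiprocessing import Pool, Queue

num_workers = max(1, multiprocessing.cpu_count() - 2)
in_queue = Queue()
out_queue = Queue()


def producer():
    # Push a few toy batches, then one stop signal per worker.
    for batch_id in range(10):
        in_queue.put((batch_id, ["some text to process"] * 5))
    for _ in range(num_workers):
        in_queue.put((-1, None))


def worker():
    while True:
        batch_id, batch = in_queue.get()
        if batch_id == -1:  # producer finished
            out_queue.put((-1, None))
            break
        # Stand-in for the real tokenize() call.
        out_queue.put((batch_id, [s.upper() for s in batch]))


def consumer():
    finished = 0
    while finished < num_workers:
        batch_id, batch = out_queue.get()
        if batch_id == -1:  # one worker finished
            finished += 1
            continue
        sys.stderr.write("saved batch %d with %d items\n" % (batch_id, len(batch)))


if __name__ == '__main__':
    pool = Pool(processes=num_workers + 2)
    pool.apply_async(producer)
    for _ in range(num_workers):
        pool.apply_async(worker)
    pool.apply_async(consumer)
    pool.close()
    pool.join()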
-#!/bin/bash
+#!/bin/sh
 # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,10 +12,41 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+set -e
+
+# 1. size of pos : neg = 1:1.
+# 2. size of testing set = min(25k, len(all_data) * 0.1), the rest is the training set.
+# 3. distinct train set and test set.
+# 4. build dict
+
+mkdir data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
+
+# uniq and shuffle
+cd data/tmp
+cat pos_*|sort|uniq|shuf> pos.shuffed
+cat neg_*|sort|uniq|shuf> neg.shuffed
+min_len=`sed -n '$=' neg.shuffed`
+((test_num=$min_len/10))
+if [ $test_num -gt 12500 ];then
+    test_num=12500
+fi
+((train_num=$min_len-$test_num))
+
+head -n$train_num pos.shuffed >train.pos
+head -n$train_num neg.shuffed >train.neg
+tail -n$test_num pos.shuffed >test.pos
+tail -n$test_num neg.shuffed >test.neg
+
+cat train.pos train.neg|shuf>../train.txt
+cat test.pos test.neg|shuf>../test.txt
+
+cd -
+echo 'data/train.txt' > data/train.list
+echo 'data/test.txt' > data/test.list
+
 # use 30k dict
+rm -rf data/tmp
 mv data/dict.txt data/dict_all.txt
 cat data/dict_all.txt | head -n 30001 > data/dict.txt
+echo 'preprocess finished'
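As a worked example of the split rule in the comments above (illustrative numbers only): if neg.shuffed ends up with 200,000 reviews, test_num starts as 200,000 / 10 = 20,000, which exceeds the 12,500 cap and is reduced to 12,500; train_num is then 200,000 - 12,500 = 187,500, giving 187,500 training and 12,500 test reviews per class before the final shuffle into train.txt and test.txt.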