# -*- coding: UTF-8 -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
1. Tokenize the review texts (optionally removing HTML first; HTML removal
   is currently disabled, see the commented-out clean() below).
2. Positive samples: rating score 5; negative samples: rating score 1-2.

Usage: python preprocess.py -i data_file [random seed]
"""
import sys
import os
import operator
import gzip
from subprocess import Popen, PIPE
from optparse import OptionParser
import json
from bs4 import BeautifulSoup  # only needed if clean() is re-enabled
from multiprocessing import Queue
from multiprocessing import Pool
import multiprocessing

batch_size = 5000
word_count = {}
# Leave two cores free for the parse and save workers
# (total processes = parse + tokenize workers + save).
num_tokenize = max(1, multiprocessing.cpu_count() - 2)
max_queue_size = 8
parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)


def create_dict(data):
    """
    Accumulate word counts for the dictionary.
    The dictionary is written to data_dir/dict.txt by save_batch();
    its first line is unk \t -1.
    data: list, one batch of input sentences.
    """
    for seq in data:
        try:
            for w in seq.lower().split():
                if w not in word_count:
                    word_count[w] = 1
                else:
                    word_count[w] += 1
        except Exception:
            sys.stderr.write(seq + "\tERROR\n")


def parse(path):
    """
    Open a .gz file and yield one JSON object per line.
    """
    sys.stderr.write(path + "\n")
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
    g.close()


'''
def clean(review):
    """
    Clean an input review: remove HTML, convert words to lower case.
    """
    # Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
    return review_text
'''


def tokenize(sentences):
    """
    Use tokenizer.perl (a tool from Moses) to tokenize the input sentences.
    sentences: a list of input sentences.
    return: a list of tokenized texts.
    """
    tokenizer_path = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
    tokenizer_cmd = [tokenizer_path, '-l', 'en', '-q', '-']
    assert isinstance(sentences, list)
    text = "\n".join(sentences)
    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text)
    toks = tok_text.split('\n')[:-1]
    return toks


def save_data(instance, data_dir, pre_fix, batch_num):
    """
    Save one batch of labeled data to data_dir/<pre_fix>_<batch_num>.txt.
    """
    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
    with open(file_name, 'w') as f:
        f.write('\n'.join(lines) + '\n')


def tokenize_batch(id):
    """
    Tokenize data batch by batch.
    """
    while True:
        num_batch, instance, pre_fix = parse_queue.get()
        if num_batch == -1:  # parse_queue finished
            tokenize_queue.put((-1, None, None))
            sys.stderr.write("tokenize thread %s finished\n" % (id))
            break
        tokenize_instance = tokenize(instance)
        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
        sys.stderr.write('.')


def save_batch(data_dir, num_tokenize, data_dir_dict):
    """
    Save data batch by batch and build dict.txt.
    """
    token_count = 0
    while True:
        num_batch, instance, pre_fix = tokenize_queue.get()
        if num_batch == -1:
            token_count += 1
            if token_count == num_tokenize:  # all tokenize workers finished
                break
            else:
                continue
        save_data(instance, data_dir, pre_fix, num_batch)
        create_dict(instance)  # update dict
    sys.stderr.write("save files finished\n")
    f = open(data_dir_dict, 'w')
    f.write('%s\t%s\n' % ('unk', '-1'))
    for k, v in sorted(word_count.items(), key=operator.itemgetter(1),
                       reverse=True):
        f.write('%s\t%s\n' % (k, v))
    f.close()
    sys.stderr.write("build dict finished\n")


def parse_batch(data, num_tokenize):
    """
    Parse data batch by batch: parse -> clean -> tokenize -> save.
    """
    raw_txt = parse(data)
    neg, pos = [], []
    count = 0
    sys.stderr.write("extract raw data\n")
    for l in raw_txt:
        rating = l["overall"]
        # text = clean(l["reviewText"].lower())  # remove HTML
        text = l["reviewText"].lower()  # convert words to lower case
        if rating == 5.0 and text:
            pos.append(text)
        if rating < 3.0 and text:
            neg.append(text)
        if len(pos) == batch_size or len(neg) == batch_size:
            if len(pos) == batch_size:
                batch = pos
                pre_fix = 'pos'
            else:
                batch = neg
                pre_fix = 'neg'
            parse_queue.put((count, batch, pre_fix))
            count += 1
            if pre_fix == 'pos':
                pos = []
            else:
                neg = []
    if len(pos) > 0:
        parse_queue.put((count, pos, 'pos'))
        count += 1
    if len(neg) > 0:
        parse_queue.put((count, neg, 'neg'))
        count += 1
    for i in range(num_tokenize):
        parse_queue.put((-1, None, None))  # tell each tokenize worker the input is finished
    sys.stderr.write("parsing finished\n")


def option_parser():
    parser = OptionParser(usage="usage: python preprocess.py "
                          "-i data_path [options]")
    parser.add_option(
        "-i", "--data", action="store", dest="input", help="Input data path.")
    parser.add_option(
        "-s",
        "--seed",
        action="store",
        dest="seed",
        default=1024,
        help="Set random seed.")
    return parser.parse_args()


def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')
    options, args = option_parser()
    data = options.input
    seed = options.seed
    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
    data_dir = os.path.join(os.path.dirname(data), 'tmp')
    if not os.path.exists(data_dir):  # batch files are written here
        os.makedirs(data_dir)

    pool = Pool(processes=num_tokenize + 2)
    pool.apply_async(parse_batch, args=(data, num_tokenize))
    for i in range(num_tokenize):
        pool.apply_async(tokenize_batch, args=(str(i), ))
    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
    pool.close()
    pool.join()
    sys.stderr.write("clean data done.\n")

    with open(os.path.join(os.path.dirname(data), 'labels.list'), 'w') as f:
        f.write('neg\t0\npos\t1\n')


if __name__ == '__main__':
    main()
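
# ----------------------------------------------------------------------------
# Example invocation (illustrative sketch only, not executed by this script).
# The input path below is hypothetical; any Amazon-review-style .json.gz file
# with "overall" and "reviewText" fields should work the same way.
#
#   python preprocess.py -i ./data/reviews.json.gz
#
# Expected outputs, relative to the input file's directory (as produced by the
# code above):
#   tmp/pos_<batch>.txt, tmp/neg_<batch>.txt  -- one "<label>\t<tokenized review>" per line
#   dict.txt                                  -- "unk\t-1" first, then "<word>\t<count>" sorted by count
#   labels.list                               -- "neg\t0" and "pos\t1"
# ----------------------------------------------------------------------------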