# Copyright (c) 2016 Baidu, Inc. All Rights Reserved # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ''' 1. remove HTML before tokensizing 2. pos sample : rating score 5; neg sample: rating score 1-2. 3. size of pos : neg = 1:1. 4. size of testing set = min(25k, len(all_data) * 0.1), others is traning set. 5. distinct train set and test set. Usage: python preprocess.py -i data_file [random seed] ''' import sys,os import re import operator import gzip,math import random import numpy as np from bs4 import BeautifulSoup from subprocess import Popen, PIPE from optparse import OptionParser def parse(path): """ Open .gz file. """ g = gzip.open(path, 'r') for l in g: yield eval(l) def clean(review): """ Clean input review: remove HTML, convert words to lower cases. """ # Remove HTML review_text = BeautifulSoup(review, "html.parser").get_text() # Convert words to lower case review_text = review_text.lower() return review_text def tokenize(sentences): """ Use tokenizer.perl to tokenize input sentences. tokenizer.perl is tool of Moses. sentences : a list of input sentences. return: a list of processed text. """ dir = './data/mosesdecoder-master/scripts/tokenizer/tokenizer.perl' tokenizer_cmd = [dir, '-l', 'en', '-q', '-'] assert isinstance(sentences, list) text = "\n".join(sentences) tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE) tok_text, _ = tokenizer.communicate(text) toks = tok_text.split('\n')[:-1] return toks def create_dict(data, data_dir): """ Create dictionary based on data, and saved in data_dir/dict.txt. The first line is unk \t -1. data: list, input data. data_dir: path to save dict. """ word_count = {} for seq in data: try: for w in seq.lower().split(): if w not in word_count: word_count[w] = 1 else: word_count[w] += 1 except: sys.stderr.write(seq+"\tERROR\n") f = open(os.path.join(data_dir, 'dict.txt'), 'w') f.write('%s\t%s\n' % ('unk', '-1')) for k, v in sorted(word_count.items(), key=operator.itemgetter(1),\ reverse=True): f.write('%s\t%s\n' % (k, v)) f.close() def save_data(data, data_dir, prefix = ""): file_name = os.path.join(data_dir, "%s.txt" % (prefix)) file(file_name,'w').write('\n'.join(data)+'\n') file(os.path.join(data_dir, prefix+'.list'),'w').write('%s\n' % file_name) def split_data(raw_txt): """ Extract positive and negative sample. """ pos = [] neg = [] count = 0 dup_cnt = 0 sys.stderr.write("extract raw data") for l in raw_txt: rating = l["overall"] text = clean(l["reviewText"]) if rating == 5.0 and text: pos.append(text) if rating < 3.0 and text: neg.append(text) count += 1 if count % 20000==0: sys.stderr.write(".") sys.stderr.write("\n") return pos, neg def preprocess(pos_in, neg_in, data_dir, rand_seed): # tokenize sys.stderr.write("tokenize...\n") tmppos = tokenize(pos_in) tmpneg = tokenize(neg_in) cnt = len(tmppos) + len(tmpneg) # unique smaples tmppos = list(set(tmppos)) tmpneg = list(set(tmpneg)) dup_cnt = cnt - len(tmppos) - len(tmpneg) sys.stderr.write("\ntotal size of data set: %d, duplicate data: %d\n" % (cnt, dup_cnt)) # keep same size of positive and negative sample min_len = min(len(tmppos), len(tmpneg)) tmppos = tmppos[0:min_len] tmpneg = tmpneg[0:min_len] # creat dictionary sys.stderr.write("create dict with train and test data...\n") all_data = tmppos + tmpneg create_dict(all_data, data_dir) # split into train set and test set sys.stderr.write("split data...\n") pos = ["1\t"+i for i in tmppos] neg = ["0\t"+i for i in tmpneg] random.seed(rand_seed) random.shuffle(pos) random.shuffle(neg) # split into test set and train set test_len = min(12500, int(min_len * 0.1)) test = pos[0:test_len] + neg[0:test_len] train = pos[test_len:] + neg[test_len:] # save data sys.stderr.write("save data...\n") save_data(train, data_dir, prefix = 'train') save_data(test, data_dir, prefix = 'test') file(os.path.join(data_dir,'labels.list'),'w').write('neg\t0\npos\t1\n') def option_parser(): parser = OptionParser(usage="usage: python preprcoess.py "\ "-i data_path [options]") parser.add_option("-i", "--data", action="store", dest="input", help="Input data path.") parser.add_option("-s", "--seed", action="store", dest="seed", default=1024, help="Set random seed.") return parser.parse_args() def main(): reload(sys) sys.setdefaultencoding('utf-8') options, args = option_parser() data=options.input seed=options.seed data_dir = os.path.dirname(data) pos, neg = split_data(parse(data)) preprocess(pos, neg, data_dir, seed) sys.stderr.write("Done.\n") if __name__ == '__main__': main()