diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index a5ffe25a116e9be039bdebaaaad435685e23d372..fcf4437ffaf329f52cc5bc997eff45dee200873c 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -32,3 +32,10 @@ def download(url, module_name, md5sum):
         shutil.copyfileobj(r.raw, f)
 
     return filename
+
+
+def dict_add(a_dict, ele):
+    if ele in a_dict:
+        a_dict[ele] += 1
+    else:
+        a_dict[ele] = 1
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 7a191cb583d4ced8e92354f4158e646d57b601bd..433e37380f840f5b7ff619a5f64b99d2ad724b17 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -17,3 +17,104 @@
 """
 IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
 """
+import paddle.v2.dataset.common
+import tarfile
+import Queue
+import re
+import string
+import threading
+
+__all__ = ['build_dict', 'train', 'test']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+# Read files that match pattern.  Tokenize and yield each file.
+def tokenize(pattern):
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, rather than
+        # tarfile.extractfile, which does random access and might
+        # thrash hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # Newline and punctuation removal, plus ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    word_freq = {}
+    for doc in tokenize(pattern):
+        for word in doc:
+            paddle.v2.dataset.common.dict_add(word_freq, word)
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+    UNK = word_idx['<unk>']
+
+    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
+
+    def load(pattern, queue):
+        for doc in tokenize(pattern):
+            queue.put(doc)
+        queue.put(None)
+
+    def reader():
+        # Create two threads that load positive and negative samples
+        # into qs.
+        t0 = threading.Thread(
+            target=load, args=(
+                pos_pattern,
+                qs[0], ))
+        t0.daemon = True
+        t0.start()
+
+        t1 = threading.Thread(
+            target=load, args=(
+                neg_pattern,
+                qs[1], ))
+        t1.daemon = True
+        t1.start()
+
+        # Read alternately from qs[0] and qs[1].
+        i = 0
+        doc = qs[i].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            i += 1
+            doc = qs[i % 2].get()
+
+        # Once one queue is drained, read the remaining docs from the other queue.
+        i += 1
+        doc = qs[i % 2].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            doc = qs[i % 2].get()
+
+    return reader()
+
+
+def train(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+
+
+def test(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
index 5cfb6203e78ec14f2cc5474876a6ed7de7d491e9..e887af16634d2db04b8cf5fa0269a69991d8baac 100644
--- a/python/paddle/v2/dataset/tests/imdb_test.py
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -1,12 +1,43 @@
-import paddle.v2.dataset.common
-import tarfile
+import paddle.v2.dataset.imdb
+import unittest
+import re
 
-URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
-MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
 
-tarf = tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', MD5))
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
 
-tf = tarf.next()
-while tf != None:
-    print tf.name
-    tf = tarf.next()
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None
+
+    def test_build_dict(self):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                               150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if self.word_idx == None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                               150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
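
Reviewer note, not part of the patch: a minimal sketch of how the new readers could be exercised, reusing the train-set pattern and the cutoff of 150 from imdb_test.py; the number of samples printed is arbitrary.

    import re
    import paddle.v2.dataset.imdb as imdb

    # Build the word -> index dictionary from the training set, pruning words
    # that appear at most 150 times (same cutoff as in imdb_test.py).
    TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
    word_idx = imdb.build_dict(TRAIN_PATTERN, 150)

    # train(word_idx) yields (list of word ids, label) pairs; labels alternate
    # between 0 (positive queue) and 1 (negative queue) while both queues
    # still have documents.
    for i, (doc, label) in enumerate(imdb.train(word_idx)):
        print len(doc), label
        if i >= 5:
            break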