提交 26c9e8e7 编写于 作者: D dangqingqing

Speed data reader for imdb dataset.

上级 298dc895
......@@ -23,10 +23,8 @@ Besides, this module also provides API for building dictionary.
import paddle.v2.dataset.common
import collections
import tarfile
import Queue
import re
import string
import threading
__all__ = ['build_dict', 'train', 'test', 'convert']
......@@ -76,45 +74,19 @@ def build_dict(pattern, cutoff):
def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
UNK = word_idx['<unk>']
INS = []
qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
def load(pattern, queue):
def load(pattern, out, label):
for doc in tokenize(pattern):
queue.put(doc)
queue.put(None)
out.append(([word_idx.get(w, UNK) for w in doc], label))
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
random.shuffle(INS)
def reader():
# Creates two threads that loads positive and negative samples
# into qs.
t0 = threading.Thread(
target=load, args=(
pos_pattern,
qs[0], ))
t0.daemon = True
t0.start()
t1 = threading.Thread(
target=load, args=(
neg_pattern,
qs[1], ))
t1.daemon = True
t1.start()
# Read alternatively from qs[0] and qs[1].
i = 0
doc = qs[i].get()
while doc != None:
yield [word_idx.get(w, UNK) for w in doc], i % 2
i += 1
doc = qs[i % 2].get()
# If any queue is empty, reads from the other queue.
i += 1
doc = qs[i % 2].get()
while doc != None:
yield [word_idx.get(w, UNK) for w in doc], i % 2
doc = qs[i % 2].get()
for doc, label in INS:
yield doc, label
return reader
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册