提交 26c9e8e7 编写于 作者: D dangqingqing

Speed up the data reader for the IMDB dataset.

上级 298dc895
...@@ -23,10 +23,8 @@ Besides, this module also provides API for building dictionary. ...@@ -23,10 +23,8 @@ Besides, this module also provides API for building dictionary.
import paddle.v2.dataset.common import paddle.v2.dataset.common
import collections import collections
import tarfile import tarfile
import Queue
import re import re
import string import string
import threading
__all__ = ['build_dict', 'train', 'test', 'convert'] __all__ = ['build_dict', 'train', 'test', 'convert']
...@@ -76,45 +74,19 @@ def build_dict(pattern, cutoff): ...@@ -76,45 +74,19 @@ def build_dict(pattern, cutoff):
def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
UNK = word_idx['<unk>'] UNK = word_idx['<unk>']
INS = []
qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)] def load(pattern, out, label):
def load(pattern, queue):
for doc in tokenize(pattern): for doc in tokenize(pattern):
queue.put(doc) out.append(([word_idx.get(w, UNK) for w in doc], label))
queue.put(None)
load(pos_pattern, INS, 0)
load(neg_pattern, INS, 1)
random.shuffle(INS)
def reader(): def reader():
# Creates two threads that loads positive and negative samples for doc, label in INS:
# into qs. yield doc, label
t0 = threading.Thread(
target=load, args=(
pos_pattern,
qs[0], ))
t0.daemon = True
t0.start()
t1 = threading.Thread(
target=load, args=(
neg_pattern,
qs[1], ))
t1.daemon = True
t1.start()
# Read alternatively from qs[0] and qs[1].
i = 0
doc = qs[i].get()
while doc != None:
yield [word_idx.get(w, UNK) for w in doc], i % 2
i += 1
doc = qs[i % 2].get()
# If any queue is empty, reads from the other queue.
i += 1
doc = qs[i % 2].get()
while doc != None:
yield [word_idx.get(w, UNK) for w in doc], i % 2
doc = qs[i % 2].get()
return reader return reader
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册