From 26c9e8e70b9bfbf3f304286ac21ff6aa37917e8c Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 25 Dec 2017 19:32:12 +0800 Subject: [PATCH] Speed data reader for imdb dataset. --- python/paddle/v2/dataset/imdb.py | 46 +++++++------------------------- 1 file changed, 9 insertions(+), 37 deletions(-) diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index cfc1c886e13..63588c9b31f 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -23,10 +23,8 @@ Besides, this module also provides API for building dictionary. import paddle.v2.dataset.common import collections import tarfile -import Queue import re import string -import threading __all__ = ['build_dict', 'train', 'test', 'convert'] @@ -76,45 +74,19 @@ def build_dict(pattern, cutoff): def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size): UNK = word_idx[''] + INS = [] - qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)] - - def load(pattern, queue): + def load(pattern, out, label): for doc in tokenize(pattern): - queue.put(doc) - queue.put(None) + out.append(([word_idx.get(w, UNK) for w in doc], label)) + + load(pos_pattern, INS, 0) + load(neg_pattern, INS, 1) + random.shuffle(INS) def reader(): - # Creates two threads that loads positive and negative samples - # into qs. - t0 = threading.Thread( - target=load, args=( - pos_pattern, - qs[0], )) - t0.daemon = True - t0.start() - - t1 = threading.Thread( - target=load, args=( - neg_pattern, - qs[1], )) - t1.daemon = True - t1.start() - - # Read alternatively from qs[0] and qs[1]. - i = 0 - doc = qs[i].get() - while doc != None: - yield [word_idx.get(w, UNK) for w in doc], i % 2 - i += 1 - doc = qs[i % 2].get() - - # If any queue is empty, reads from the other queue. - i += 1 - doc = qs[i % 2].get() - while doc != None: - yield [word_idx.get(w, UNK) for w in doc], i % 2 - doc = qs[i % 2].get() + for doc, label in INS: + yield doc, label return reader -- GitLab