reader.py 891 字节
Newer Older
Z
zhaopu 已提交
1 2 3
import collections
import os

C
caoying03 已提交
4 5
# Sentence-length bounds, counted in whitespace-separated words: rnn_reader
# skips any line with fewer than MIN_LEN or more than MAX_LEN words.
MIN_LEN = 3
MAX_LEN = 100
Z
zhaopu 已提交
6

C
caoying03 已提交
7 8

def rnn_reader(file_name, word_dict):
    """
    Create a reader for RNN data; each line of the file is one sample.

    Every line is stripped, lower-cased and split on whitespace. Lines with
    fewer than MIN_LEN or more than MAX_LEN words are skipped. The remaining
    words are mapped to ids through ``word_dict`` (words missing from the
    vocab map to the '<unk>' id) and the '<e>' id is appended to the sequence.

    :param file_name: file name.
    :param word_dict: vocab with content of '{word, id}',
                      'word' is string type, 'id' is int type.
                      It must contain the '<unk>' and '<e>' entries.
    :return: data reader, a no-argument callable returning a generator that
             yields ``(ids[:-1], ids[1:])`` pairs, i.e. the id sequence
             without its last element and the same sequence shifted by one.
    """

    def reader():
        # Resolve both special ids once, before reading the file: they do not
        # change per line, and a vocab missing either entry fails right away.
        unk_id = word_dict['<unk>']
        end_id = word_dict['<e>']
        with open(file_name) as data_file:
            for line in data_file:
                words = line.strip().lower().split()
                if len(words) < MIN_LEN or len(words) > MAX_LEN:
                    continue
                ids = [word_dict.get(w, unk_id) for w in words]
                ids.append(end_id)
                yield ids[:-1], ids[1:]

    return reader