# coding=utf-8
import collections
import os


def rnn_reader(file_name, min_sentence_length, max_sentence_length,
               word_id_dict):
    """
    Create a data reader for the RNN language model; each line of the file
    is one sample.

    :param file_name: input file name.
    :param min_sentence_length: minimum sentence length; shorter lines are skipped.
    :param max_sentence_length: maximum sentence length; longer lines are skipped.
    :param word_id_dict: vocabulary dict mapping word (str) to id (int).
    :return: data reader.
    """

    def reader():
        # NOTE: the special-token keys were lost in extraction; '<unk>' and
        # '<e>' (unknown word / end of sentence) are assumed names and must
        # match the keys used when the vocabulary was built.
        UNK = word_id_dict['<unk>']
        with open(file_name) as input_file:
            for line in input_file:
                words = line.decode('utf-8', 'ignore').strip().split()
                if len(words) < min_sentence_length or \
                        len(words) > max_sentence_length:
                    continue
                # map words to ids, falling back to the unknown-word id
                ids = [word_id_dict.get(w, UNK) for w in words]
                ids.append(word_id_dict['<e>'])
                # the target is the input shifted left by one token; append the
                # end-of-sentence id again so it stays the same length as ids
                target = ids[1:]
                target.append(word_id_dict['<e>'])
                yield ids[:], target[:]

    return reader


def ngram_reader(file_name, N, word_id_dict):
    """
    Create a data reader for the N-gram model.

    :param file_name: input file name.
    :param N: the N in N-gram.
    :param word_id_dict: vocabulary dict mapping word (str) to id (int).
    :return: data reader.
    """
    assert N >= 2

    def reader():
        ids = []
        # assumed special-token key; see the note in rnn_reader
        UNK_ID = word_id_dict['<unk>']
        cache_size = 10000000
        with open(file_name) as input_file:
            for line in input_file:
                words = line.decode('utf-8', 'ignore').strip().split()
                ids += [word_id_dict.get(w, UNK_ID) for w in words]
                ids_len = len(ids)
                if ids_len > cache_size:
                    # flush the cache: emit every length-N window collected so far
                    # (ids_len - N + 1 windows in total)
                    for i in range(ids_len - N + 1):
                        yield tuple(ids[i:i + N])
                    ids = []
            # emit the N-grams remaining in the cache
            ids_len = len(ids)
            for i in range(ids_len - N + 1):
                yield tuple(ids[i:i + N])

    return reader
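

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): build a toy
    # vocabulary and consume both readers. The file name 'train.txt' and the
    # special-token names '<unk>' / '<e>' are assumptions for illustration;
    # they must match whatever the real vocabulary-building step produced.
    # Because the readers call line.decode(), this targets Python 2.
    sample_vocab = {'<unk>': 0, '<e>': 1, 'hello': 2, 'world': 3}

    # RNN reader: yields (input id sequence, shifted target id sequence) pairs
    rnn = rnn_reader('train.txt', 1, 100, sample_vocab)
    for source_ids, target_ids in rnn():
        print(source_ids, target_ids)

    # N-gram reader: yields tuples of N consecutive word ids (here trigrams)
    trigram = ngram_reader('train.txt', 3, sample_vocab)
    for gram in trigram():
        print(gram)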