From 13a5f2595443aecf733598bc6673a781f374eb25 Mon Sep 17 00:00:00 2001 From: zhaopu Date: Tue, 13 Jun 2017 23:36:54 +0800 Subject: [PATCH] delete old file --- language_model/data_util.py | 157 ---------------------- language_model/lm_ngram.py | 184 -------------------------- language_model/lm_rnn.py | 254 ------------------------------------ 3 files changed, 595 deletions(-) delete mode 100644 language_model/data_util.py delete mode 100644 language_model/lm_ngram.py delete mode 100644 language_model/lm_rnn.py diff --git a/language_model/data_util.py b/language_model/data_util.py deleted file mode 100644 index 21fe7c60..00000000 --- a/language_model/data_util.py +++ /dev/null @@ -1,157 +0,0 @@ -# coding=utf-8 -import collections -import os - -# -- function -- - - -def save_vocab(word_id_dict, vocab_file_name): - """ - save vocab. - :param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param vocab_file_name: vocab file name. - """ - f = open(vocab_file_name, 'w') - for (k, v) in word_id_dict.items(): - f.write(k.encode('utf-8') + '\t' + str(v) + '\n') - print('save vocab to ' + vocab_file_name) - f.close() - - -def load_vocab(vocab_file_name): - """ - load vocab from file - :param vocab_file_name: vocab file name. - :return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - """ - if not os.path.isfile(vocab_file_name): - raise Exception('vocab file does not exist!') - dict = {} - for line in open(vocab_file_name): - if len(line) < 2: - continue - kv = line.decode('utf-8').strip().split('\t') - dict[kv[0]] = int(kv[1]) - return dict - - -def build_vocab(file_name, vocab_max_size): - """ - build vacab. - - :param vocab_max_size: vocab's max size. - :return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - """ - words = [] - for line in open(file_name): - words += line.decode('utf-8', 'ignore').strip().split() - - counter = collections.Counter(words) - counter = sorted(counter.items(), key=lambda x: -x[1]) - if len(counter) > vocab_max_size: - counter = counter[:vocab_max_size] - words, counts = zip(*counter) - word_id_dict = dict(zip(words, range(2, len(words) + 2))) - word_id_dict[''] = 0 - word_id_dict[''] = 1 - return word_id_dict - - -def _read_by_fixed_length(file_name, word_id_dict, sentence_len=10): - """ - create reader, each sample with fixed length. - - :param file_name: file name. - :param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param sentence_len: each sample's length. - :return: data reader. - """ - - def reader(): - words = [] - UNK = word_id_dict[''] - for line in open(file_name): - words += line.decode('utf-8', 'ignore').strip().split() - ids = [word_id_dict.get(w, UNK) for w in words] - words_len = len(words) - sentence_num = (words_len - 1) // sentence_len - count = 0 - while count < sentence_num: - start = count * sentence_len - count += 1 - yield ids[start:start + sentence_len], ids[start + 1:start + - sentence_len + 1] - - return reader - - -def _read_by_line(file_name, min_sentence_length, max_sentence_length, - word_id_dict): - """ - create reader, each line is a sample. - - :param file_name: file name. - :param min_sentence_length: sentence's min length. - :param max_sentence_length: sentence's max length. - :param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :return: data reader. - """ - - def reader(): - UNK = word_id_dict[''] - for line in open(file_name): - words = line.decode('utf-8', 'ignore').strip().split() - if len(words) < min_sentence_length or len( - words) > max_sentence_length: - continue - ids = [word_id_dict.get(w, UNK) for w in words] - ids.append(word_id_dict['']) - target = ids[1:] - target.append(word_id_dict['']) - yield ids[:], target[:] - - return reader - - -def _reader_creator_for_NGram(file_name, N, word_id_dict): - """ - create reader for ngram. - - :param file_name: file name. - :param N: ngram's n. - :param word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :return: data reader. - """ - assert N >= 2 - - def reader(): - words = [] - UNK = word_id_dict[''] - for line in open(file_name): - words += line.decode('utf-8', 'ignore').strip().split() - ids = [word_id_dict.get(w, UNK) for w in words] - words_len = len(words) - for i in range(words_len - N - 1): - yield tuple(ids[i:i + N]) - - return reader - - -def train_data(train_file, min_sentence_length, max_sentence_length, - word_id_dict): - return _read_by_line(train_file, min_sentence_length, max_sentence_length, - word_id_dict) - - -def test_data(test_file, min_sentence_length, max_sentence_length, - word_id_dict): - return _read_by_line(test_file, min_sentence_length, max_sentence_length, - word_id_dict) - - -def train_data_for_NGram(train_file, N, word_id_dict): - return _reader_creator_for_NGram(train_file, N, word_id_dict) - - -def test_data_for_NGram(test_file, N, word_id_dict): - return _reader_creator_for_NGram(test_file, N, word_id_dict) diff --git a/language_model/lm_ngram.py b/language_model/lm_ngram.py deleted file mode 100644 index 5296cb8b..00000000 --- a/language_model/lm_ngram.py +++ /dev/null @@ -1,184 +0,0 @@ -# coding=utf-8 -import sys -import paddle.v2 as paddle -import data_util as reader -import gzip -import numpy as np - - -def lm(vocab_size, emb_dim, hidden_size, num_layer): - """ - ngram language model definition. - - :param vocab_size: size of vocab. - :param emb_dim: embedding vector's dimension. - :param hidden_size: size of unit. - :param num_layer: layer number. - :return: cost and output layer of model. - """ - - assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 - - def wordemb(inlayer): - wordemb = paddle.layer.table_projection( - input=inlayer, - size=emb_dim, - param_attr=paddle.attr.Param( - name="_proj", - initial_std=0.001, - learning_rate=1, - l2_rate=0, )) - return wordemb - - # input layers - firstword = paddle.layer.data( - name="firstw", type=paddle.data_type.integer_value(vocab_size)) - secondword = paddle.layer.data( - name="secondw", type=paddle.data_type.integer_value(vocab_size)) - thirdword = paddle.layer.data( - name="thirdw", type=paddle.data_type.integer_value(vocab_size)) - fourthword = paddle.layer.data( - name="fourthw", type=paddle.data_type.integer_value(vocab_size)) - - # embedding layer - Efirst = wordemb(firstword) - Esecond = wordemb(secondword) - Ethird = wordemb(thirdword) - Efourth = wordemb(fourthword) - - contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) - - # hidden layer - hidden = paddle.layer.fc( - input=contextemb, size=hidden_size, act=paddle.activation.Relu()) - for _ in range(num_layer - 1): - hidden = paddle.layer.fc( - input=hidden, size=hidden_size, act=paddle.activation.Relu()) - - # fc and output layer - predictword = paddle.layer.fc( - input=[hidden], size=vocab_size, act=paddle.activation.Softmax()) - - # loss - nextword = paddle.layer.data( - name="fifthw", type=paddle.data_type.integer_value(vocab_size)) - cost = paddle.layer.classification_cost(input=predictword, label=nextword) - - return cost, predictword - - -def train(): - """ - train ngram language model. - - :return: none, but this function will save the training model each epoch. - """ - - # prepare word dictionary - print('prepare vocab...') - word_id_dict = reader.build_vocab(train_file, vocab_max_size) # build vocab - reader.save_vocab(word_id_dict, vocab_file) # save vocab - - # define data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - reader.train_data_for_NGram(train_file, N, word_id_dict), - buf_size=65536), - batch_size=32) - - test_reader = paddle.batch( - paddle.reader.shuffle( - reader.test_data_for_NGram(train_file, N, word_id_dict), - buf_size=65536), - batch_size=8) - - # network config - print('prepare model...') - cost, _ = lm(len(word_id_dict), emb_dim, hidden_size, num_layer) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - regularization=paddle.optimizer.L2Regularization(rate=1e-3), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - - # create trainer - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=adam_optimizer) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print("\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics)) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # save model each pass - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader) - print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics)) - with gzip.open( - model_file_name_prefix + str(event.pass_id) + '.tar.gz', - 'w') as f: - parameters.to_tar(f) - - # start to train - print('start training...') - - trainer.train( - reader=train_reader, event_handler=event_handler, num_passes=num_passs) - - print("Training finished.") - - -if __name__ == '__main__': - # -- config : model -- - emb_dim = 200 - hidden_size = 200 - num_passs = 2 - num_layer = 2 - N = 5 - model_file_name_prefix = 'lm_ngram_pass_' - - # -- config : data -- - train_file = 'data/ptb.train.txt' - test_file = 'data/ptb.test.txt' - vocab_file = 'data/vocab_ptb.txt' # the file to save vocab - vocab_max_size = 3000 - min_sentence_length = 3 - max_sentence_length = 60 - - # -- train -- - paddle.init(use_gpu=False, trainer_count=1) - train() - - # -- predict -- - - text = 'the end of the' # use 4 words to predict the 5th word - - # prepare model - word_id_dict = reader.load_vocab(vocab_file) # load word dictionary - _, output_layer = lm(len(word_id_dict), emb_dim, hidden_size, - num_layer) # network config - model_file_name = model_file_name_prefix + str(num_passs - 1) + '.tar.gz' - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(model_file_name)) # load parameters - # generate - input = [[word_id_dict.get(w, word_id_dict['']) for w in text.split()]] - predictions = paddle.infer( - output_layer=output_layer, - parameters=parameters, - input=input, - field=['value']) - id_word_dict = dict( - [(v, k) - for k, v in word_id_dict.items()]) # dictionary with type {id : word} - predictions[-1][word_id_dict['']] = -1 # filter - next_word = id_word_dict[np.argmax(predictions[-1])] - print(next_word.encode('utf-8')) diff --git a/language_model/lm_rnn.py b/language_model/lm_rnn.py deleted file mode 100644 index e5eb23cf..00000000 --- a/language_model/lm_rnn.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding=utf-8 -import sys -import paddle.v2 as paddle -import data_util as reader -import gzip -import os -import numpy as np - - -def lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer): - """ - rnn language model definition. - - :param vocab_size: size of vocab. - :param emb_dim: embedding vector's dimension. - :param rnn_type: the type of RNN cell. - :param hidden_size: number of unit. - :param num_layer: layer number. - :return: cost and output layer of model. - """ - - assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 - - # input layers - data = paddle.layer.data( - name="word", type=paddle.data_type.integer_value_sequence(vocab_size)) - target = paddle.layer.data( - "label", paddle.data_type.integer_value_sequence(vocab_size)) - - # embedding layer - emb = paddle.layer.embedding(input=data, size=emb_dim) - - # rnn layer - if rnn_type == 'lstm': - rnn_cell = paddle.networks.simple_lstm(input=emb, size=hidden_size) - for _ in range(num_layer - 1): - rnn_cell = paddle.networks.simple_lstm( - input=rnn_cell, size=hidden_size) - elif rnn_type == 'gru': - rnn_cell = paddle.networks.simple_gru(input=emb, size=hidden_size) - for _ in range(num_layer - 1): - rnn_cell = paddle.networks.simple_gru( - input=rnn_cell, size=hidden_size) - else: - raise Exception('rnn_type error!') - - # fc(full connected) and output layer - output = paddle.layer.fc( - input=[rnn_cell], size=vocab_size, act=paddle.activation.Softmax()) - - # loss - cost = paddle.layer.classification_cost(input=output, label=target) - - return cost, output - - -def train(): - """ - train rnn language model. - - :return: none, but this function will save the training model each epoch. - """ - - # prepare word dictionary - print('prepare vocab...') - word_id_dict = reader.build_vocab(train_file, vocab_max_size) # build vocab - reader.save_vocab(word_id_dict, vocab_file) # save vocab - - # define data reader - train_reader = paddle.batch( - paddle.reader.shuffle( - reader.train_data(train_file, min_sentence_length, - max_sentence_length, word_id_dict), - buf_size=65536), - batch_size=32) - - test_reader = paddle.batch( - paddle.reader.shuffle( - reader.test_data(test_file, min_sentence_length, - max_sentence_length, word_id_dict), - buf_size=65536), - batch_size=8) - - # network config - print('prepare model...') - cost, _ = lm(len(word_id_dict), emb_dim, rnn_type, hidden_size, num_layer) - - # create parameters - parameters = paddle.parameters.create(cost) - - # create optimizer - adam_optimizer = paddle.optimizer.Adam( - learning_rate=1e-3, - regularization=paddle.optimizer.L2Regularization(rate=1e-3), - model_average=paddle.optimizer.ModelAverage(average_window=0.5)) - - # create trainer - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=adam_optimizer) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print("\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics)) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # save model each pass - if isinstance(event, paddle.event.EndPass): - result = trainer.test(reader=test_reader) - print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics)) - with gzip.open( - model_file_name_prefix + str(event.pass_id) + '.tar.gz', - 'w') as f: - parameters.to_tar(f) - - # start to train - print('start training...') - - trainer.train( - reader=train_reader, event_handler=event_handler, num_passes=num_passs) - - print("Training finished.") - - -def _generate_with_beamSearch(inferer, word_id_dict, input, num_words, - beam_size): - """ - Demo: generate 'num_words' words using "beam search" algorithm. - - :param inferer: paddle's inferer - :type inferer: paddle.inference.Inference - :param word_id_dict: vocab. - :type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type. - :param input: prefix text. - :type input: string. - :param num_words: the number of the words to generate. - :type num_words: int - :param beam_size: beam with. - :type beam_size: int - :return: text with generated words. dictionary with content of '{text, probability}' - """ - - assert beam_size > 0 and num_words > 0 - - # load word dictionary - id_word_dict = dict( - [(v, k) for k, v in word_id_dict.items()]) # {id : word} - - # tools - def str2ids(str): - return [[[ - word_id_dict.get(w, word_id_dict['']) for w in str.split() - ]]] - - def ids2str(ids): - return [[[id_word_dict.get(id, ' ') for id in ids]]] - - # generate - texts = {} # type: {text : prob} - texts[input] = 1 - for _ in range(num_words): - texts_new = {} - for (text, prob) in texts.items(): - # next word's prob distubution - predictions = inferer.infer(input=str2ids(text)) - predictions[-1][word_id_dict['']] = -1 # filter - # find next beam_size words - for _ in range(beam_size): - cur_maxProb_index = np.argmax(predictions[-1]) # next word's id - text_new = text + ' ' + id_word_dict[ - cur_maxProb_index] # text append nextWord - texts_new[text_new] = texts[text] * predictions[-1][ - cur_maxProb_index] - predictions[-1][cur_maxProb_index] = -1 - texts.clear() - if len(texts_new) <= beam_size: - texts = texts_new - else: # cutting - texts = dict( - sorted(texts_new.items(), key=lambda d: d[1], reverse=True) - [:beam_size]) - - return texts - - -def predict(): - """ - demo: use model to do prediction. - - :return: print result to console. - """ - - # prepare and cache vocab - if os.path.isfile(vocab_file): - word_id_dict = reader.load_vocab(vocab_file) # load word dictionary - else: - word_id_dict = reader.build_vocab(train_file, - vocab_max_size) # build vocab - reader.save_vocab(word_id_dict, vocab_file) # save vocab - - # prepare and cache model - _, output = lm( - len(word_id_dict), emb_dim, rnn_type, hidden_size, - num_layer) # network config - model_file_name = model_file_name_prefix + str(num_passs - 1) + '.tar.gz' - parameters = paddle.parameters.Parameters.from_tar( - gzip.open(model_file_name)) # load parameters - inferer = paddle.inference.Inference( - output_layer=output, parameters=parameters) - - # generate text - while True: - input_str = raw_input('input:') - input_str_uft8 = input_str.decode('utf-8') - generate_sentences = _generate_with_beamSearch( - inferer=inferer, - word_id_dict=word_id_dict, - input=input_str_uft8, - num_words=5, - beam_size=5) - # print result - for (sentence, prob) in generate_sentences.items(): - print(sentence.encode('utf-8', 'replace')) - print('prob: ', prob) - print('-------') - - -if __name__ == '__main__': - # -- config : model -- - rnn_type = 'gru' # or 'lstm' - emb_dim = 200 - hidden_size = 200 - num_passs = 2 - num_layer = 2 - model_file_name_prefix = 'lm_' + rnn_type + '_params_pass_' - - # -- config : data -- - train_file = 'data/ptb.train.txt' - test_file = 'data/ptb.test.txt' - vocab_file = 'data/vocab_ptb.txt' # the file to save vocab - vocab_max_size = 3000 - min_sentence_length = 3 - max_sentence_length = 60 - - # -- train -- - paddle.init(use_gpu=False, trainer_count=1) - train() - - # -- predict -- - predict() -- GitLab