Commit 798a8266, authored by zhaopu7, committed by GitHub

Add files via upload

Parent 6f06bb62
# ---- data_util.py (imported below as `import data_util as reader`) ----
# coding=utf-8
import numpy as np
import collections
# config
train_file = 'ptb/ptb.train.txt'
test_file = 'ptb/ptb.test.txt'
vocab_max_size = 3000
min_sentence_length = 3
max_sentence_length = 60
def build_vocab():
"""
build vacab.
:return: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
"""
words = []
for line in open(train_file):
words += line.decode('utf-8','ignore').strip().split()
counter = collections.Counter(words)
counter = sorted(counter.items(), key=lambda x: -x[1])
if len(counter) > vocab_max_size:
counter = counter[:vocab_max_size]
words, counts = zip(*counter)
word_id_dict = dict(zip(words, range(2, len(words) + 2)))
word_id_dict['<UNK>'] = 0
word_id_dict['<EOS>'] = 1
return word_id_dict
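
# Example (illustrative sketch; actual word ids depend on corpus frequencies):
#
#     vocab = build_vocab()
#     vocab['<UNK>']   # -> 0, reserved for out-of-vocabulary words
#     vocab['<EOS>']   # -> 1, reserved end-of-sentence marker
#     len(vocab)       # -> at most vocab_max_size + 2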
def _read_by_fixed_length(file_name, sentence_len=10):
"""
create reader, each sample with fixed length.
:param file_name: file name.
:param sentence_len: each sample's length.
:return: data reader.
"""
def reader():
word_id_dict = build_vocab()
words = []
UNK = word_id_dict['<UNK>']
for line in open(file_name):
words += line.decode('utf-8','ignore').strip().split()
ids = [word_id_dict.get(w, UNK) for w in words]
words_len = len(words)
sentence_num = (words_len-1) // sentence_len
count = 0
while count < sentence_num:
start = count * sentence_len
count += 1
yield ids[start:start+sentence_len], ids[start+1:start+sentence_len+1]
return reader
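
# Example (sketch): with sentence_len=3 and token ids [5, 6, 7, 8, 9, 10, 11],
# the reader yields ([5, 6, 7], [6, 7, 8]) and ([8, 9, 10], [9, 10, 11]);
# each target is the corresponding input shifted one token to the right.
#
#     reader = _read_by_fixed_length(train_file, sentence_len=3)
#     for src, tgt in reader():
#         assert len(src) == len(tgt) == 3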
def _read_by_line(file_name):
"""
create reader, each line is a sample.
:param file_name: file name.
:return: data reader.
"""
def reader():
word_id_dict = build_vocab()
UNK = word_id_dict['<UNK>']
for line in open(file_name):
words = line.decode('utf-8','ignore').strip().split()
if len(words) < min_sentence_length or len(words) > max_sentence_length:
continue
            ids = [word_id_dict.get(w, UNK) for w in words]
            ids.append(word_id_dict['<EOS>'])
            # The target is the input shifted left by one; the trailing
            # <EOS> has no successor, so drop it from the input rather
            # than emitting a spurious <EOS> -> <EOS> training pair.
            yield ids[:-1], ids[1:]
return reader
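
# Example (sketch): for a qualifying line "a b c" with ids [a, b, c], the
# reader yields ([a, b, c], [b, c, <EOS>]); input and target sequences
# always have equal length.
#
#     for src, tgt in _read_by_line(test_file)():
#         assert len(src) == len(tgt)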
def _reader_creator_for_NGram(file_name, N):
"""
create reader for ngram.
:param file_name: file name.
:param N: ngram's n.
:return: data reader.
"""
assert N >= 2
def reader():
word_id_dict = build_vocab()
words = []
UNK = word_id_dict['<UNK>']
for line in open(file_name):
words += line.decode('utf-8','ignore').strip().split()
ids = [word_id_dict.get(w, UNK) for w in words]
        words_len = len(words)
        # a corpus of words_len tokens contains words_len - N + 1 N-grams
        for i in range(words_len - N + 1):
            yield tuple(ids[i:i + N])
return reader
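
# Example (sketch): each yielded value is an N-tuple of word ids; the
# trainer pairs the first N - 1 ids with the context data layers and the
# last id with the label (see the n-gram training script below).
#
#     for ngram in _reader_creator_for_NGram(train_file, 5)():
#         assert len(ngram) == 5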
def train_data():
return _read_by_line(train_file)
def test_data():
return _read_by_line(test_file)
def train_data_for_NGram(N):
return _reader_creator_for_NGram(train_file, N)
def test_data_for_NGram(N):
return _reader_creator_for_NGram(test_file, N)
# ---- generate_text.py (imported below as `import generate_text as generator`) ----
# coding=utf-8
import paddle.v2 as paddle
import numpy as np
def next_word(model_struct, model_params, word_id_dict, input):
"""
Demo: generate the next word.
to show the simplest way using trained model to do prediction.
:param model_struct: model's structure, only the output layer will be used for prediction task.
:param model_params: parameters trained before.
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:param input: input.
:type input: integer sequence.
:return: predict word.
"""
predictions = paddle.infer(
output_layer=model_struct,
parameters=model_params,
input=input,
field=['value'])
id_word_dict = dict([(v, k) for k, v in word_id_dict.items()]) # dictionary with type {id : word}
predictions[-1][word_id_dict['<UNK>']] = -1 # filter <UNK>
return id_word_dict[np.argmax(predictions[-1])]
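
# Usage sketch (mirrors the prediction demo at the bottom of the n-gram
# training script below; `output` and `params` are hypothetical names for a
# trained output layer and its parameters):
#
#     ids = [[word_id_dict.get(w, word_id_dict['<UNK>'])
#             for w in 'the end of the'.split()]]
#     print(next_word(output, params, word_id_dict, ids))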
def generate_with_greedy(model_struct, model_params, word_id_dict, text, num_words):
"""
Demo: generate 'num_words' words using greedy algorithm.
:param model_struct: model's structure, only the output layer will be used for prediction task.
:param model_params: parameters trained before.
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:param text: prefix text.
:type text: string.
:param num_words: the number of the words to generate.
:return: text with generated words.
"""
assert num_words > 0
# prepare dictionary
id_word_dict = dict([(v, k) for k, v in word_id_dict.items()])
# generate
for _ in range(num_words):
text_ids = [[[word_id_dict.get(w, word_id_dict['<UNK>']) for w in text.split()]]]
        print('input: %s %s' % (text.encode('utf-8', 'replace'), text_ids))
predictions = paddle.infer(
output_layer=model_struct,
parameters=model_params,
input=text_ids,
field=['value'])
predictions[-1][word_id_dict['<UNK>']] = -1 # filter <UNK>
text += ' ' + id_word_dict[np.argmax(predictions[-1])]
return text
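
# Usage sketch (hypothetical `output`/`params`): greedy generation re-runs
# inference on the whole growing prefix, so it costs one paddle.infer call
# per generated word.
#
#     print(generate_with_greedy(output, params, word_id_dict,
#                                'the end of', num_words=5))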
def generate_with_beamSearch(model_struct, model_params, word_id_dict, text, num_words, beam_size):
"""
Demo: generate 'num_words' words using "beam search" algorithm.
:param model_struct: model's structure, only the output layer will be used for prediction task.
:param model_params: parameters trained before.
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of '{word, id}', 'word' is string type , 'id' is int type.
:param text: prefix text.
:type text: string.
:param num_words: the number of the words to generate.
:param beam_size: beam with.
:return: text with generated words.
"""
assert beam_size > 0 and num_words > 0
# load word dictionary
id_word_dict = dict([(v, k) for k, v in word_id_dict.items()]) # {id : word}
    # helpers (avoid shadowing the built-ins str/id)
    def str2ids(s):
        return [[[word_id_dict.get(w, word_id_dict['<UNK>']) for w in s.split()]]]
    def ids2str(ids):
        return [[[id_word_dict.get(i, ' ') for i in ids]]]
    # generate: candidate texts mapped to their running probability
    texts = {text: 1.0}
for _ in range(num_words):
texts_new = {}
for (text, prob) in texts.items():
            # probability distribution of the next word
predictions = paddle.infer(
output_layer=model_struct,
parameters=model_params,
input=str2ids(text),
field=['value'])
predictions[-1][word_id_dict['<UNK>']] = -1 # filter <UNK>
# find next beam_size words
for _ in range(beam_size):
cur_maxProb_index = np.argmax(predictions[-1]) # next word's id
text_new = text + ' ' + id_word_dict[cur_maxProb_index] # text append nextWord
                texts_new[text_new] = prob * predictions[-1][cur_maxProb_index]
predictions[-1][cur_maxProb_index] = -1
texts.clear()
if len(texts_new) <= beam_size:
texts = texts_new
else: # cutting
texts = dict(sorted(texts_new.items(), key=lambda d: d[1], reverse=True)[:beam_size])
return texts
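
# Usage sketch: the result maps each surviving candidate to the product of
# its per-step word probabilities, so scores are only comparable between
# candidates of the same length.
#
#     candidates = generate_with_beamSearch(output, params, word_id_dict,
#                                           'the end of', num_words=5,
#                                           beam_size=5)
#     best = max(candidates, key=candidates.get)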
# ---- n-gram language model: training and prediction script ----
# coding=utf-8
import sys
import paddle.v2 as paddle
import data_util as reader
import gzip
import generate_text as generator
def lm(vocab_size, emb_dim, hidden_size, num_layer):
"""
ngram language model definition.
:param vocab_size: size of vocab.
:param emb_dim: embedding vector's dimension.
:param hidden_size: size of unit.
:param num_layer: layer number.
:return: cost and output of model.
"""
assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
size=emb_dim,
param_attr=paddle.attr.Param(
name="_proj",
initial_std=0.001,
learning_rate=1,
l2_rate=0, ))
return wordemb
# input layers
firstword = paddle.layer.data(
name="firstw", type=paddle.data_type.integer_value(vocab_size))
secondword = paddle.layer.data(
name="secondw", type=paddle.data_type.integer_value(vocab_size))
thirdword = paddle.layer.data(
name="thirdw", type=paddle.data_type.integer_value(vocab_size))
fourthword = paddle.layer.data(
name="fourthw", type=paddle.data_type.integer_value(vocab_size))
# embedding layer
Efirst = wordemb(firstword)
Esecond = wordemb(secondword)
Ethird = wordemb(thirdword)
Efourth = wordemb(fourthword)
contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
# hidden layer
hidden = paddle.layer.fc(
input=contextemb, size=hidden_size, act=paddle.activation.Relu())
for _ in range(num_layer - 1):
hidden = paddle.layer.fc(
input=hidden, size=hidden_size, act=paddle.activation.Relu())
# fc and output layer
predictword = paddle.layer.fc(
input=[hidden], size=vocab_size, act=paddle.activation.Softmax())
# loss
nextword = paddle.layer.data(
name="fifthw", type=paddle.data_type.integer_value(vocab_size))
cost = paddle.layer.classification_cost(input=predictword, label=nextword)
return cost, predictword
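
# Shape sketch (assumed sizes): the concatenated context embedding is
# 4 * emb_dim wide and predictword is a vocab_size-way softmax. With the
# default vocab settings, len(word_id_dict) == vocab_max_size + 2 == 3002:
#
#     cost, predict = lm(vocab_size=3002, emb_dim=200,
#                        hidden_size=200, num_layer=2)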
def train():
"""
train ngram language model.
:return: none, but this function will save the training model each epoch.
"""
# load word dictionary
print('load dictionary...')
word_id_dict = reader.build_vocab()
# define data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_data_for_NGram(N), buf_size=65536),
batch_size=32)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.test_data_for_NGram(N), buf_size=65536),
batch_size=8)
# network config
print('prepare model...')
cost, _ = lm(len(word_id_dict), emb_dim, hidden_size, num_layer)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=adam_optimizer)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print("\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics))
else:
sys.stdout.write('.')
sys.stdout.flush()
# save model each pass
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader)
print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
with gzip.open(model_file_name_prefix + str(event.pass_id) + '.tar.gz',
'w') as f:
parameters.to_tar(f)
# start to train
print('start training...')
trainer.train(
        reader=train_reader, event_handler=event_handler, num_passes=num_passes)
print("Training finished.")
if __name__ == '__main__':
# -- config --
paddle.init(use_gpu=False, trainer_count=1)
emb_dim = 200
hidden_size = 200
    num_passes = 2
num_layer = 2
    N = 5  # lm() above is hard-wired for N = 5
model_file_name_prefix = 'lm_ngram_pass_'
# -- train --
train()
# -- predict --
# prepare model
word_id_dict = reader.build_vocab() # load word dictionary
_, output = lm(len(word_id_dict), emb_dim, hidden_size, num_layer) # network config
    model_file_name = model_file_name_prefix + str(num_passes - 1) + '.tar.gz'
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_file_name)) # load parameters
# generate
text = 'the end of the'
input = [[word_id_dict.get(w, word_id_dict['<UNK>']) for w in text.split()]]
print(generator.next_word(output, parameters, word_id_dict, input))
# ---- RNN language model: training and generation script ----
# coding=utf-8
import sys
import paddle.v2 as paddle
import data_util as reader
import gzip
import generate_text as generator
def lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
"""
rnn language model definition.
:param vocab_size: size of vocab.
:param emb_dim: embedding vector's dimension.
:param rnn_type: the type of RNN cell.
:param hidden_size: number of unit.
:param num_layer: layer number.
:return: cost and output layer of model.
"""
assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0
# input layers
data = paddle.layer.data(
name="word", type=paddle.data_type.integer_value_sequence(vocab_size))
    target = paddle.layer.data(
        name="label", type=paddle.data_type.integer_value_sequence(vocab_size))
# embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# rnn layer
if rnn_type == 'lstm':
rnn_cell = paddle.networks.simple_lstm(
input=emb, size=hidden_size)
for _ in range(num_layer - 1):
rnn_cell = paddle.networks.simple_lstm(
input=rnn_cell, size=hidden_size)
elif rnn_type == 'gru':
rnn_cell = paddle.networks.simple_gru(
input=emb, size=hidden_size)
for _ in range(num_layer - 1):
rnn_cell = paddle.networks.simple_gru(
input=rnn_cell, size=hidden_size)
    else:
        raise ValueError("rnn_type must be 'lstm' or 'gru', got %r" % rnn_type)
    # fully connected (fc) output layer
output = paddle.layer.fc(
input=[rnn_cell], size=vocab_size, act=paddle.activation.Softmax())
# loss
cost = paddle.layer.classification_cost(input=output, label=target)
return cost, output
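
# Sketch: 'word' and 'label' are equal-length integer sequences, as yielded
# by data_util.train_data(); the softmax is applied per time step, so
# classification_cost accumulates the cross entropy at every position.
#
#     cost, output = lm(vocab_size=3002, emb_dim=200, rnn_type='gru',
#                       hidden_size=200, num_layer=2)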
def train():
"""
train rnn language model.
:return: none, but this function will save the training model each epoch.
"""
# load word dictionary
print('load dictionary...')
word_id_dict = reader.build_vocab()
# define data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_data(), buf_size=65536),
batch_size=32)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.test_data(), buf_size=65536),
batch_size=8)
# network config
print('prepare model...')
cost, _ = lm(len(word_id_dict), emb_dim, rnn_type, hidden_size, num_layer)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=adam_optimizer)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print("\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics))
else:
sys.stdout.write('.')
sys.stdout.flush()
# save model each pass
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader)
print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
with gzip.open(model_file_name_prefix + str(event.pass_id) + '.tar.gz',
'w') as f:
parameters.to_tar(f)
# start to train
print('start training...')
trainer.train(
        reader=train_reader, event_handler=event_handler, num_passes=num_passes)
print("Training finished.")
if __name__ == '__main__':
# -- config --
paddle.init(use_gpu=False, trainer_count=1)
rnn_type = 'gru' # or 'lstm'
emb_dim = 200
hidden_size = 200
    num_passes = 2
num_layer = 2
model_file_name_prefix = 'lm_' + rnn_type + '_params_pass_'
# -- train --
train()
# -- predict --
# prepare model
word_id_dict = reader.build_vocab() # load word dictionary
_, output = lm(len(word_id_dict), emb_dim, rnn_type, hidden_size, num_layer) # network config
    model_file_name = model_file_name_prefix + str(num_passes - 1) + '.tar.gz'
parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_file_name)) # load parameters
# generate
text = 'the end of'
generate_sentences = generator.generate_with_beamSearch(output, parameters, word_id_dict, text, 5, 5)
# print result
for (sentence, prob) in generate_sentences.items():
print(sentence.encode('utf-8', 'replace'))
        print('prob: %s' % prob)
print('-------')