# 基于GRU的Text Generation
文本生成是NLP领域中的重要组成部分,基于GRU,我们可以快速构建文本生成模型。

In [74]:
import paddle
import numpy as np
import matplotlib.pyplot as plt

paddle.__version__

'2.0.0-alpha0'

# 复现过程
## 1.下载数据
文件路径:https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
保存为txt格式即可

## 2.读取数据

In [60]:
# Path to the downloaded Shakespeare corpus.
path_to_file = './shakespeare.txt'
# Read the raw bytes inside a context manager so the file handle is closed
# as soon as the content is loaded, then decode them as UTF-8.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# The length of the text is the number of characters it contains.
print('Length of text: {} characters'.format(len(text)))

Length of text: 1115394 characters


In [61]:
# Take a look at the first 250 characters of the text.
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [62]:
# The unique characters that occur in the text, in sorted order.
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

65 unique characters


## 3.向量化文本
在训练之前,我们需要将字符串映射到数字表示值。创建两个查找表格:一个将字符映射到数字,另一个将数字映射到字符。

In [63]:
# Forward lookup table: unique character -> integer index (position in vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup table: integer index -> character.
idx2char = np.array(vocab)
# The whole text encoded as an array of character indices.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))


In [64]:
# Show the character -> index lookup table.
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [65]:
# Show the index -> character lookup array.
print(idx2char)

['\n' ' ' '!' '$' '&' "'" ',' '-' '.' '3' ':' ';' '?' 'A' 'B' 'C' 'D' 'E'
 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W'
 'X' 'Y' 'Z' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']


现在,每个字符都有一个整数表示值。请注意,我们将字符映射至索引 0 至 len(vocab) - 1。

In [66]:
# Show the encoded text and the number of encoded characters.
print(text_as_int)
print(len(text_as_int))

[18 47 56 ... 45 8 0]
1115394


In [67]:
# Show the integer mapping of the first 13 characters of the text.
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58 1 15 47 58 47 64 43 52]


## 预测任务
给定一个字符或者一个字符序列,下一个最可能出现的字符是什么?这就是我们训练模型要执行的任务。输入进模型的是一个字符序列,我们训练这个模型来预测输出 -- 每个时间步(time step)预测下一个字符是什么。

## 创建训练样本和目标
接下来,将文本划分为样本序列。每个输入序列包含文本中的 seq_length 个字符。

对于每个输入序列,其对应的目标包含相同长度的文本,但是向右顺移一个字符。

将文本拆分为长度为 seq_length 的文本块。例如,假设 seq_length 为 4 而且文本为 “Hello”, 那么输入序列将为 “Hell”,目标序列将为 “ello”。

In [68]:
# Number of characters in each input (and each target) training sequence.
seq_length = 100
def load_data(data, seq_length):
    """Split ``data`` into aligned input/target training sequences.

    Sample ``i`` is ``data[i*seq_length:(i+1)*seq_length]`` and its target is
    the same window shifted right by one item, so every target needs one item
    beyond the end of its input.

    Args:
        data: a sliceable sequence (list, str, NumPy array, ...).
        seq_length: number of items per sequence; must be positive.

    Returns:
        A ``(train_data, train_label)`` tuple of equally long lists in which
        every entry has exactly ``seq_length`` items.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError('seq_length must be a positive integer')

    train_data = []
    train_label = []
    # Only len(data) - 1 items can act as inputs because the last item has no
    # successor. Counting windows on that basis keeps the final target from
    # coming out one item short when len(data) is a multiple of seq_length.
    num_samples = max(len(data) - 1, 0) // seq_length
    for start in range(0, num_samples * seq_length, seq_length):
        stop = start + seq_length
        train_data.append(data[start:stop])
        train_label.append(data[start + 1:stop + 1])
    return train_data, train_label
# Cut the encoded text into input sequences and their shifted-by-one targets.
train_data, train_label = load_data(text_as_int, seq_length)

In [69]:
# Decode the first input/target pair back into characters for a visual check.
char_list = [idx2char[char_id]
             for char_id, _ in zip(train_data[0], train_label[0])]
label_list = [idx2char[label_id]
              for _, label_id in zip(train_data[0], train_label[0])]

# One print call; sep='\n' puts every piece on its own line.
print(
    'training data is :',
    ''.join(char_list),
    "------------",
    'training_label is:',
    ''.join(label_list),
    sep='\n',
)

training data is :
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You
------------
training_label is:
irst Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


## 用`paddle.batch`完成数据的加载

In [70]:
import numpy as np
# Number of samples grouped into one mini-batch.
batch_size = 64


def train_reader():
    """Yield one ``(input_sequence, target_sequence)`` pair per training sample."""
    for sample_index, sample in enumerate(train_data):
        yield sample, train_label[sample_index]


# Wrap the per-sample reader so that it produces batches of ``batch_size`` samples.
batch_reader = paddle.batch(train_reader, batch_size=batch_size)

## 基于GRU构建文本生成模型

In [71]:
import paddle
import numpy as np

# Number of distinct characters; used as the embedding table size and as the
# width of the model's final layer.
vocab_size = len(vocab)
# Width of each character embedding vector.
embedding_dim = 256
# Hidden size passed to the GRU layer.
hidden_size = 1024
class GRUModel(paddle.nn.Layer):
    """Character model: embedding -> GRU -> linear -> ReLU -> linear -> softmax.

    Layer sizes are taken from the module-level ``vocab_size``,
    ``embedding_dim`` and ``hidden_size`` settings.
    """

    def __init__(self):
        super().__init__()
        self.embedding = paddle.nn.Embedding(size=[vocab_size, embedding_dim])
        self.gru = paddle.incubate.hapi.text.GRU(
            input_size=embedding_dim, hidden_size=hidden_size)
        self.linear1 = paddle.nn.Linear(hidden_size, hidden_size // 2)
        self.linear2 = paddle.nn.Linear(hidden_size // 2, vocab_size)

    def forward(self, x):
        """Return the softmax output computed for the character ids in ``x``.

        The result has ``vocab_size`` columns and one row per embedded id
        (the GRU output is flattened to ``[-1, hidden_size]``).
        """
        embedded = self.embedding(x)
        # NOTE(review): the middle dimension is forced to 1, which looks like
        # every id is fed to the GRU as its own length-1 sequence - confirm
        # against the GRU layer's expected input layout.
        steps = paddle.reshape(embedded, [-1, 1, embedding_dim])
        gru_out, _ = self.gru(steps)
        features = paddle.reshape(gru_out, [-1, hidden_size])
        hidden = paddle.nn.functional.relu(self.linear1(features))
        scores = self.linear2(hidden)
        return paddle.nn.functional.softmax(scores)

In [72]:
# Switch Paddle to imperative mode before the model is built and trained.
paddle.enable_imperative()
# Loss values recorded by train().
losses = []
def train(model):
    """Train ``model`` with SGD on the batches produced by ``batch_reader``.

    Runs 10 epochs with a learning rate of 0.001. Inside every batch the
    parameters are updated once per character position: the ids found at that
    position across the batch are the input and the ids at the same position
    of the target sequences are the labels. Every 50th batch the most recent
    average loss is printed and a loss value is appended to the module-level
    ``losses`` list.

    NOTE(review): the notebook export flattened the indentation of this cell.
    The nesting of the logging block was reconstructed from the recorded
    output (one log line per 50 batches) - confirm against the original
    notebook.

    Args:
        model: the network to optimise. It is called with a tensor of
            character ids and its output is passed to ``cross_entropy``.
    """
    model.train()
    optim = paddle.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters())
    for epoch in range(10):
        batch_id = 0
        for batch_data in batch_reader():
            batch_id += 1
            data = np.array(batch_data)
            # Index 0 of each sample is the input sequence, index 1 its target.
            x_data = data[:, 0]
            y_data = data[:, 1]
            # One optimisation step per character position of the sequences.
            for i in range(len(x_data[0])):
                x_char = x_data[:, i]
                y_char = y_data[:, i]
                x_char = paddle.imperative.to_variable(x_char)
                y_char = paddle.imperative.to_variable(y_char)
                predicts = model(x_char)
                loss = paddle.nn.functional.cross_entropy(predicts, y_char)
                avg_loss = paddle.mean(loss)
                avg_loss.backward()
                optim.minimize(avg_loss)
                # Reset the gradients so they do not carry over into the next step.
                model.clear_gradients()
            if batch_id % 50 == 0:
                print("epoch: {}, batch: {}, loss is: {}".format(epoch, batch_id, avg_loss.numpy()))
                # NOTE(review): this stores the un-averaged `loss` of the last
                # position while the line above prints `avg_loss` - confirm
                # which of the two is meant to be recorded.
                losses.append(loss.numpy())
# Build a fresh model and train it.
model = GRUModel()
train(model)

epoch: 0, batch: 50, loss is: [3.7835407]
epoch: 0, batch: 100, loss is: [3.2774005]
epoch: 0, batch: 150, loss is: [3.2576294]
epoch: 1, batch: 50, loss is: [3.3434656]
epoch: 1, batch: 100, loss is: [2.9948606]
epoch: 1, batch: 150, loss is: [3.0285468]
epoch: 2, batch: 50, loss is: [3.133882]
epoch: 2, batch: 100, loss is: [2.7811327]
epoch: 2, batch: 150, loss is: [2.8133557]
epoch: 3, batch: 50, loss is: [3.000814]
epoch: 3, batch: 100, loss is: [2.6404488]
epoch: 3, batch: 150, loss is: [2.7050896]
epoch: 4, batch: 50, loss is: [2.9289591]
epoch: 4, batch: 100, loss is: [2.5629177]
epoch: 4, batch: 150, loss is: [2.6438713]
epoch: 5, batch: 50, loss is: [2.8832304]
epoch: 5, batch: 100, loss is: [2.5137548]
epoch: 5, batch: 150, loss is: [2.5926144]
epoch: 6, batch: 50, loss is: [2.8562953]
epoch: 6, batch: 100, loss is: [2.4752126]
epoch: 6, batch: 150, loss is: [2.5510798]
epoch: 7, batch: 50, loss is: [2.8426895]
epoch: 7, batch: 100, loss is: [2.4442513]
epoch: 7, batch: 150,

## 模型预测
利用训练好的模型,输入初始文本 'ROMEO:',自动生成后续的 num_generate 个字符。

In [73]:
def generate_text(model, start_string, num_generate=100):
    """Greedily extend ``start_string`` with characters predicted by ``model``.

    The start string is encoded with ``char2idx`` and fed to the model. At
    every step the highest-scoring character is appended to the result and
    becomes the next model input.

    Args:
        model: a trained model; calling it on a column of character ids must
            return one row of per-character scores for each input id.
        start_string: non-empty seed text. Every character must be a key of
            ``char2idx`` (otherwise a ``KeyError`` is raised).
        num_generate: number of characters to generate (default 100).

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: if ``start_string`` is empty.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    model.eval()

    # Vectorize the start string: one character id per row.
    input_eval = [char2idx[s] for s in start_string]
    input_data = paddle.imperative.to_variable(np.array(input_eval))
    input_data = paddle.reshape(input_data, [-1, 1])
    text_generated = []

    for _ in range(num_generate):
        predicts = model(input_data).numpy()
        # The model returns one row per input character. The continuation has
        # to follow the LAST character of the input, so read the last row
        # (row 0 would be the prediction for the first seed character).
        # Greedy decoding: np.argmax picks the first highest-scoring id.
        predicts_id = int(np.argmax(predicts[-1]))
        # The predicted character is the only input of the next step.
        input_data = paddle.imperative.to_variable(np.array([predicts_id]))
        input_data = paddle.reshape(input_data, [-1, 1])
        text_generated.append(idx2char[predicts_id])
    return start_string + ''.join(text_generated)
# Generate text with the trained model, seeded with "ROMEO:".
print(generate_text(model, start_string=u"ROMEO:"))

ROMEO:I the the the the the the the the the the the the the the the the the the the the the the the the th
