## 数据准备
### 数据介绍
本教程使用Penn Tree Bank (PTB)数据集。PTB数据集较小,训练速度快,应用于Mikolov的公开语言模型训练工具\[[2](#参考文献)\]中。其统计情况如下:
+### 数据预处理
I have a dream
I have a dream that
have a dream that one
a dream that one day
dream that one day
## 编程实现
图5. 模型配置中的N-gram神经网络模型
+import math
+def main():
+ paddle.init(use_gpu=False, trainer_count=1) # 初始化PaddlePaddle
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+ # 每个输入层都接受整型数据,这些数据的范围是[0, dict_size)
+ firstword = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+ secondword = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+ thirdword = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+ fourthword = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+ nextword = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+ Efirst = wordemb(firstword)
+ Esecond = wordemb(secondword)
+ Ethird = wordemb(thirdword)
+ Efourth = wordemb(fourthword)
+- 将这n-1个词向量经过concat_layer连接成一个大向量作为历史文本特征。
+ contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+- 将历史文本特征经过一个全连接得到文本隐层特征。
+ hidden1 = paddle.layer.fc(input=contextemb,
+ size=hiddensize,
+ act=paddle.activation.Sigmoid(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5),
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embsize * 8),
+ learning_rate=1))
+- 将文本隐层特征,再经过一个全连接,映射成一个$|V|$维向量,同时通过softmax归一化得到这$|V|$个词的生成概率。
+ predictword = paddle.layer.fc(input=hidden1,
+ size=dict_size,
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ act=paddle.activation.Softmax())
+- 网络的损失函数为多分类交叉熵,可直接调用`classification_cost`函数。
+cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+- 训练方法(optimizer): 指训练过程中更新权重时采用的优化算法(如动量优化器),本教程使用Adam优化器。
+- 学习率(learning_rate): 每次迭代更新参数的步长,影响网络训练的收敛速度。
+- 正则化(regularization): 是防止网络过拟合的一种手段,此处采用L2正则化。
+ parameters = paddle.parameters.create(cost)
+ adam_optimizer = paddle.optimizer.Adam(
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+ trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+下一步,我们开始训练过程。`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`分别提供训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python generator。
+`paddle.batch`的输入是一个reader,输出是一个batched reader —— 在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ result = trainer.test(
+ paddle.batch(
+ paddle.dataset.imikolov.test(word_dict, N), 32))
+ print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ result.metrics)
+ trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+ num_passes=30,
+ event_handler=event_handler)
--- /dev/null
+++ b/word2vec/train.py
@@ -0,0 +1,79 @@
+import math
+import paddle.v2 as paddle
+# Width of each word embedding (the `size` passed to table_projection).
+embsize = 32
+# Number of units in the hidden fully connected layer.
+hiddensize = 256
+# Window length handed to the imikolov train/test readers. main() declares
+# five data layers (four context words plus the word to predict).
+N = 5
+def wordemb(inlayer):
+ wordemb = paddle.layer.table_projection(
+ input=inlayer,
+ size=embsize,
+ param_attr=paddle.attr.Param(
+ name="_proj",
+ initial_std=0.001,
+ learning_rate=1,
+ l2_rate=0, ))
+ return wordemb
+def main():
+ paddle.init(use_gpu=False, trainer_count=1)
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+ firstword = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+ secondword = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+ thirdword = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+ fourthword = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+ nextword = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+ Efirst = wordemb(firstword)
+ Esecond = wordemb(secondword)
+ Ethird = wordemb(thirdword)
+ Efourth = wordemb(fourthword)
+ contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+ hidden1 = paddle.layer.fc(input=contextemb,
+ size=hiddensize,
+ act=paddle.activation.Sigmoid(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5),
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embsize * 8),
+ learning_rate=1))
+ predictword = paddle.layer.fc(input=hidden1,
+ size=dict_size,
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ act=paddle.activation.Softmax())
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ result = trainer.test(
+ paddle.batch(
+ paddle.dataset.imikolov.test(word_dict, N), 32))
+ print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ result.metrics)
+ cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+ parameters = paddle.parameters.create(cost)
+ adam_optimizer = paddle.optimizer.Adam(
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+ trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+ trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+ num_passes=30,
+ event_handler=event_handler)
+# Start training only when this file is executed as a script.
+if __name__ == '__main__':
+ main()
