diff --git a/word2vec/README.md b/word2vec/README.md
index 3b44c2d0c326736eaea435f98dc7d833ccac1068..1a942f4ec26cf2763977a57b2b8e9232c95f521e 100644
--- a/word2vec/README.md
+++ b/word2vec/README.md
@@ -141,7 +141,7 @@ CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去
 ## Data Preparation
-### Dataset Introduction and Download
+### Dataset Introduction
 This tutorial uses the Penn Tree Bank (PTB) dataset. PTB is small and fast to train on, and it is used in Mikolov's public language-model training tool \[[2](#参考文献)\]. Its statistics are as follows:
@@ -165,109 +165,24 @@ CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去
-Run the following command to download the dataset and write the paths of the training and validation data into `train.list` and `test.list`, which PaddlePaddle reads during training.
-
-```bash
-./data/getdata.sh
-```
-
-### Providing Data to PaddlePaddle
-
-1. The initializer function initializes the dataprovider: it builds the dictionary (in the build_dict function) and defines the format of PaddlePaddle's input fields. Note: N here is the `n` of the n-gram model; the code in this chapter sets $N=5$, meaning that during training the first 4 words of each sample are used to predict the 5th. You may adjust N for your own data and needs, but you must add or remove the corresponding input fields in the model configuration accordingly.
-
- ```python
- from paddle.trainer.PyDataProvider2 import *
- import collections
- import logging
- import pdb
-
- logging.basicConfig(
- format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
- logger = logging.getLogger('paddle')
- logger.setLevel(logging.INFO)
-
- N = 5 # Ngram
- cutoff = 50 # select words with frequency > cutoff to dictionary
- def build_dict(ftrain, fdict):
- sentences = []
- with open(ftrain) as fin:
- for line in fin:
- line = ['&lt;s&gt;'] + line.strip().split() + ['&lt;e&gt;']
- sentences += line
- wordfreq = collections.Counter(sentences)
- wordfreq = filter(lambda x: x[1] > cutoff, wordfreq.items())
- dictionary = sorted(wordfreq, key = lambda x: (-x[1], x[0]))
- words, _ = list(zip(*dictionary))
- for word in words:
- print >> fdict, word
- word_idx = dict(zip(words, xrange(len(words))))
- logger.info("Dictionary size=%s" %len(words))
- return word_idx
-
- def initializer(settings, srcText, dictfile, **xargs):
- with open(dictfile, 'w') as fdict:
- settings.dicts = build_dict(srcText, fdict)
- input_types = []
- for i in xrange(N):
- input_types.append(integer_value(len(settings.dicts)))
- settings.input_types = input_types
- ```
-
-2. The process function feeds the data to PaddlePaddle one sample at a time. Specifically, each sentence is padded with N-1 start symbols `&lt;s&gt;` at the front and one end symbol `&lt;e&gt;` at the end; a window of size N is then slid from left to right over the sentence, generating one sample per position.
-
- ```python
- @provider(init_hook=initializer)
- def process(settings, filename):
- UNKID = settings.dicts['&lt;unk&gt;']
- with open(filename) as fin:
- for line in fin:
- line = ['&lt;s&gt;']*(N-1) + line.strip().split() + ['&lt;e&gt;']
- line = [settings.dicts.get(w, UNKID) for w in line]
- for i in range(N, len(line) + 1):
- yield line[i-N: i]
- ```
-
- 如"I have a dream" 一句提供了5条数据:
-
- > ` I`
- > ` I have`
- > ` I have a`
- > ` I have a dream`
- > `I have a dream `
-
-
-## Model Configuration
-
-### Data Definition
-
-Data is read from the dataprovider through the `define_py_data_sources2` function; its args specify the training text (srcText) and the vocabulary file (dictfile).
-
-```python
-from paddle.trainer_config_helpers import *
-import math
+### Data Preprocessing
-args = {'srcText': 'data/simple-examples/data/ptb.train.txt',
- 'dictfile': 'data/vocabulary.txt'}
-
-define_py_data_sources2(
- train_list="data/train.list",
- test_list="data/test.list",
- module="dataprovider",
- obj="process",
- args=args)
-```
+This chapter trains a 5-gram model: during training, the first 4 words of each sample are used to predict the 5th. PaddlePaddle provides the Python package `paddle.dataset.imikolov` for the PTB dataset, which downloads and preprocesses the data automatically.
-### Algorithm Configuration
+Preprocessing prepends the start symbol `&lt;s&gt;` and appends the end symbol `&lt;e&gt;` to every sentence in the dataset. A window of the configured size (5 in this tutorial) is then slid from left to right over each sentence, generating one sample per position.
-Here we specify the training hyper-parameters of the model: the L2 regularization coefficient, the learning rate, and the batch size.
+如"I have a dream that one day" 一句提供了5条数据:
-```python
-settings(
- batch_size=100, regularization=L2Regularization(8e-4), learning_rate=3e-3)
+```text
+&lt;s&gt; I have a dream
+I have a dream that
+have a dream that one
+a dream that one day
+dream that one day &lt;e&gt;
```
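+
+The following is a minimal sketch (for illustration only, not part of the training script) of how these preprocessed samples can be inspected; it relies only on `paddle.dataset.imikolov.build_dict()` and `paddle.dataset.imikolov.train()`, which the training code later in this chapter also uses, and assumes each sample from the reader is a tuple of 5 word indices, as implied by the five integer input layers below:
+
+```python
+import paddle.v2 as paddle
+
+word_dict = paddle.dataset.imikolov.build_dict()              # downloads PTB and builds the vocabulary
+idx_to_word = dict((idx, w) for w, idx in word_dict.items())  # reverse lookup, for display only
+
+train_reader = paddle.dataset.imikolov.train(word_dict, 5)    # reader of 5-gram training samples
+for i, sample in enumerate(train_reader()):                   # each sample holds 5 word indices
+    print [idx_to_word[idx] for idx in sample]
+    if i >= 2:                                                 # peek at the first 3 samples only
+        break
+```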
-### Model Structure
+## Implementation
 The structure of the model in this configuration is shown in the figure below:
@@ -276,94 +191,132 @@ settings(
 Figure 5. The N-gram neural network model in the configuration
-1. Define the parameter dimensions and the data inputs.
-
- ```python
- dictsize = 1953 # dictionary size
- embsize = 32 # word embedding dimension
- hiddensize = 256 # hidden layer dimension
-
- firstword = data_layer(name = "firstw", size = dictsize)
- secondword = data_layer(name = "secondw", size = dictsize)
- thirdword = data_layer(name = "thirdw", size = dictsize)
- fourthword = data_layer(name = "fourthw", size = dictsize)
- nextword = data_layer(name = "fifthw", size = dictsize)
- ```
-
-2. The $n-1$ words before $w_t$, i.e. $w_{t-n+1},...w_{t-1}$, are mapped to D-dimensional word vectors through a $|V|\times D$ matrix (D=32 in this example).
+First, load the required packages:
+
+```python
+import math
+import paddle.v2 as paddle
+```
+
+Then, define the parameters:
+```python
+embsize = 32 # word embedding dimension
+hiddensize = 256 # hidden layer dimension
+N = 5 # train a 5-gram model
+```
+
+Next, define the network structure:
+
+- Map the $n-1$ words before $w_t$, i.e. $w_{t-n+1},...w_{t-1}$, to D-dimensional word vectors through a $|V|\times D$ matrix (D=32 in this example).
- ```python
- def wordemb(inlayer):
- wordemb = table_projection(
- input = inlayer,
- size = embsize,
- param_attr=ParamAttr(name = "_proj",
- initial_std=0.001, # std used to initialize the parameters
- l2_rate= 0,)) # word embeddings need no sparsity penalty, so l2_rate is set to 0
+```python
+def wordemb(inlayer):
+ wordemb = paddle.layer.table_projection(
+ input=inlayer,
+ size=embsize,
+ param_attr=paddle.attr.Param(
+ name="_proj",
+ initial_std=0.001,
+ learning_rate=1,
+ l2_rate=0, ))
return wordemb
+```
- Efirst = wordemb(firstword)
- Esecond = wordemb(secondword)
- Ethird = wordemb(thirdword)
- Efourth = wordemb(fourthword)
- ```
-
-3. Next, concatenate these n-1 word vectors into one large vector via concat_layer, used as the history feature of the context.
-
- ```python
- contextemb = concat_layer(input = [Efirst, Esecond, Ethird, Efourth])
- ```
-4. Then, pass the context feature through a fully connected layer to obtain a hidden feature.
-
- ```python
- hidden1 = fc_layer(
- input = contextemb,
- size = hiddensize,
- act = SigmoidActivation(),
- layer_attr = ExtraAttr(drop_rate=0.5),
- bias_attr = ParamAttr(learning_rate = 2),
- param_attr = ParamAttr(
- initial_std = 1./math.sqrt(embsize*8),
- learning_rate = 1))
- ```
-
-5. Finally, pass the hidden feature through another fully connected layer that maps it to a $|V|$-dimensional vector, and apply softmax to obtain the generation probability of each of the `|V|` words.
-
- ```python
- # use context embedding to predict nextword
- predictword = fc_layer(
- input = hidden1,
- size = dictsize,
- bias_attr = ParamAttr(learning_rate = 2),
- act = SoftmaxActivation())
- ```
-
-6. The loss of the network is the multi-class cross-entropy, available directly through the `classification_cost` function.
-
- ```python
- cost = classification_cost(
- input = predictword,
- label = nextword)
- # network input and output
- outputs(cost)
- ```
+- Define the data type and name accepted by each input layer.
+
+```python
+def main():
+ paddle.init(use_gpu=False, trainer_count=1) # initialize PaddlePaddle
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+ # every input layer accepts integer values in the range [0, dict_size)
+ firstword = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+ secondword = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+ thirdword = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+ fourthword = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+ nextword = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+ Efirst = wordemb(firstword)
+ Esecond = wordemb(secondword)
+ Ethird = wordemb(thirdword)
+ Efourth = wordemb(fourthword)
+```
+
+- Concatenate these n-1 word vectors into one large vector via concat_layer, used as the history feature of the context.
+
+```python
+ contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+```
+
+- Pass the context feature through a fully connected layer to obtain a hidden feature.
+
+```python
+ hidden1 = paddle.layer.fc(input=contextemb,
+ size=hiddensize,
+ act=paddle.activation.Sigmoid(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5),
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embsize * 8),
+ learning_rate=1))
+```
-## Training the Model
+- Pass the hidden feature through another fully connected layer that maps it to a $|V|$-dimensional vector, and apply softmax to obtain the generation probability of each of the `|V|` words.
+
+```python
+ predictword = paddle.layer.fc(input=hidden1,
+ size=dict_size,
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ act=paddle.activation.Softmax())
+```
-The model is trained with `./train.sh`. The script is shown below; it specifies a total of 30 passes.
+- The loss of the network is the multi-class cross-entropy, available directly through the `classification_cost` function.
-```bash
-paddle train \
- --config ngram.py \
- --use_gpu=1 \
- --dot_period=100 \
- --log_period=3000 \
- --test_period=0 \
- --save_dir=model \
- --num_passes=30
+```python
+cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+```
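+
+For reference, this cost is the multi-class cross-entropy of the softmax output, i.e. the negative log-likelihood of the next word; averaged over the $T$ training samples it can be written in the standard form:
+
+$$J(\theta) = -\frac{1}{T}\sum_{t}\log P(w_t | w_{t-n+1}, ..., w_{t-1})$$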
+
+Then, specify the training-related settings:
+
+- Optimizer (optimizer): the algorithm used to update the weights during training; this tutorial uses the Adam optimizer.
+- Learning rate (learning_rate): the step size used in each iteration, which affects how fast the network converges.
+- Regularization (regularization): a technique to prevent overfitting; L2 regularization is used here.
+
+```python
+ parameters = paddle.parameters.create(cost)
+ adam_optimizer = paddle.optimizer.Adam(
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+ trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+```
+
+Next, we start the training process. `paddle.dataset.imikolov.train()` and `paddle.dataset.imikolov.test()` provide the training and test datasets respectively. Each returns a reader: in PaddlePaddle, a reader is a Python function that returns a Python generator each time it is called.
+
+`paddle.batch` takes a reader as input and produces a batched reader: in PaddlePaddle, a reader yields one training sample at a time, whereas a batched reader yields one minibatch at a time.
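+
+The snippet below is a minimal sketch of this reader protocol, using a hypothetical `toy_reader` instead of the real dataset, only to illustrate what `paddle.batch` does:
+
+```python
+# a reader: a function that, when called, returns a generator yielding one sample at a time
+def toy_reader():
+    for i in range(8):
+        yield [i, i + 1, i + 2, i + 3, i + 4]   # one fake 5-gram sample
+
+# a batched reader: yields lists of batch_size samples instead of single samples
+batched_reader = paddle.batch(toy_reader, 4)
+for minibatch in batched_reader():
+    print len(minibatch)                        # prints 4 twice (8 samples / batch size 4)
+```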
+
+```python
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ result = trainer.test(
+ paddle.batch(
+ paddle.dataset.imikolov.test(word_dict, N), 32))
+ print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ result.metrics)
+
+ trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+ num_passes=30,
+ event_handler=event_handler)
```
-The training log of one pass looks like this:
+Training runs fully automatically; the log printed in event_handler looks similar to the following:
```text
.............................
diff --git a/word2vec/data/getdata.sh b/word2vec/data/getdata.sh
deleted file mode 100755
index 7b9e938640add251df4b8f1c61277b1c1eed61c6..0000000000000000000000000000000000000000
--- a/word2vec/data/getdata.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-set -e
-
-wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
-tar -zxf simple-examples.tgz
-echo `pwd`/simple-examples/data/ptb.train.txt > train.list
-echo `pwd`/simple-examples/data/ptb.valid.txt > test.list
diff --git a/word2vec/dataprovider.py b/word2vec/dataprovider.py
deleted file mode 100644
index 2f48d4f0fb17b84696f85f1df4cc558082ea9eed..0000000000000000000000000000000000000000
--- a/word2vec/dataprovider.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import collections
-import logging
-import pdb
-
-logging.basicConfig(
- format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-N = 5 # Ngram
-cutoff = 50 # select words with frequency > cutoff to dictionary
-
-
-def build_dict(ftrain, fdict):
- sentences = []
- with open(ftrain) as fin:
- for line in fin:
- line = ['&lt;s&gt;'] + line.strip().split() + ['&lt;e&gt;']
- sentences += line
- wordfreq = collections.Counter(sentences)
- wordfreq = filter(lambda x: x[1] > cutoff, wordfreq.items())
- dictionary = sorted(wordfreq, key=lambda x: (-x[1], x[0]))
- words, _ = list(zip(*dictionary))
- for word in words:
- print >> fdict, word
- word_idx = dict(zip(words, xrange(len(words))))
- logger.info("Dictionary size=%s" % len(words))
- return word_idx
-
-
-def initializer(settings, srcText, dictfile, **xargs):
- with open(dictfile, 'w') as fdict:
- settings.dicts = build_dict(srcText, fdict)
- input_types = []
- for i in xrange(N):
- input_types.append(integer_value(len(settings.dicts)))
- settings.input_types = input_types
-
-
-@provider(init_hook=initializer)
-def process(settings, filename):
- UNKID = settings.dicts['&lt;unk&gt;']
- with open(filename) as fin:
- for line in fin:
- line = ['&lt;s&gt;'] * (N - 1) + line.strip().split() + ['&lt;e&gt;']
- line = [settings.dicts.get(w, UNKID) for w in line]
- for i in range(N, len(line) + 1):
- yield line[i - N:i]
diff --git a/word2vec/ngram.py b/word2vec/ngram.py
deleted file mode 100644
index 13d6291dc5b3886c42c18e80af413b36bdb724e1..0000000000000000000000000000000000000000
--- a/word2vec/ngram.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-import math
-
-#################### Data Configure ####################
-args = {
- 'srcText': 'data/simple-examples/data/ptb.train.txt',
- 'dictfile': 'data/vocabulary.txt'
-}
-define_py_data_sources2(
- train_list="data/train.list",
- test_list="data/test.list",
- module="dataprovider",
- obj="process",
- args=args)
-
-settings(
- batch_size=100, regularization=L2Regularization(8e-4), learning_rate=3e-3)
-
-dictsize = 1953
-embsize = 32
-hiddensize = 256
-
-firstword = data_layer(name="firstw", size=dictsize)
-secondword = data_layer(name="secondw", size=dictsize)
-thirdword = data_layer(name="thirdw", size=dictsize)
-fourthword = data_layer(name="fourthw", size=dictsize)
-nextword = data_layer(name="fifthw", size=dictsize)
-
-
-# construct word embedding for each datalayer
-def wordemb(inlayer):
- wordemb = table_projection(
- input=inlayer,
- size=embsize,
- param_attr=ParamAttr(
- name="_proj",
- initial_std=0.001,
- learning_rate=1,
- l2_rate=0, ))
- return wordemb
-
-
-Efirst = wordemb(firstword)
-Esecond = wordemb(secondword)
-Ethird = wordemb(thirdword)
-Efourth = wordemb(fourthword)
-
-# concatenate Ngram embeddings into context embedding
-contextemb = concat_layer(input=[Efirst, Esecond, Ethird, Efourth])
-hidden1 = fc_layer(
- input=contextemb,
- size=hiddensize,
- act=SigmoidActivation(),
- layer_attr=ExtraAttr(drop_rate=0.5),
- bias_attr=ParamAttr(learning_rate=2),
- param_attr=ParamAttr(
- initial_std=1. / math.sqrt(embsize * 8), learning_rate=1))
-
-# use context embedding to predict nextword
-predictword = fc_layer(
- input=hidden1,
- size=dictsize,
- bias_attr=ParamAttr(learning_rate=2),
- act=SoftmaxActivation())
-
-cost = classification_cost(input=predictword, label=nextword)
-
-# network input and output
-outputs(cost)
diff --git a/word2vec/train.py b/word2vec/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..15ad6a01cc2230ad1c8a6a44c1d3d828331a0d1d
--- /dev/null
+++ b/word2vec/train.py
@@ -0,0 +1,79 @@
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
+N = 5
+
+
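+# map each word index to its embedding through the shared |V| x embsize lookup table named "_proj"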
+def wordemb(inlayer):
+ wordemb = paddle.layer.table_projection(
+ input=inlayer,
+ size=embsize,
+ param_attr=paddle.attr.Param(
+ name="_proj",
+ initial_std=0.001,
+ learning_rate=1,
+ l2_rate=0, ))
+ return wordemb
+
+
+def main():
+ paddle.init(use_gpu=False, trainer_count=1)
+ word_dict = paddle.dataset.imikolov.build_dict()
+ dict_size = len(word_dict)
+ firstword = paddle.layer.data(
+ name="firstw", type=paddle.data_type.integer_value(dict_size))
+ secondword = paddle.layer.data(
+ name="secondw", type=paddle.data_type.integer_value(dict_size))
+ thirdword = paddle.layer.data(
+ name="thirdw", type=paddle.data_type.integer_value(dict_size))
+ fourthword = paddle.layer.data(
+ name="fourthw", type=paddle.data_type.integer_value(dict_size))
+ nextword = paddle.layer.data(
+ name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+ Efirst = wordemb(firstword)
+ Esecond = wordemb(secondword)
+ Ethird = wordemb(thirdword)
+ Efourth = wordemb(fourthword)
+
+ contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+ hidden1 = paddle.layer.fc(input=contextemb,
+ size=hiddensize,
+ act=paddle.activation.Sigmoid(),
+ layer_attr=paddle.attr.Extra(drop_rate=0.5),
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ param_attr=paddle.attr.Param(
+ initial_std=1. / math.sqrt(embsize * 8),
+ learning_rate=1))
+ predictword = paddle.layer.fc(input=hidden1,
+ size=dict_size,
+ bias_attr=paddle.attr.Param(learning_rate=2),
+ act=paddle.activation.Softmax())
+
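+ # every 100 batches, print the training cost and evaluate the current model on the test set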
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 100 == 0:
+ result = trainer.test(
+ paddle.batch(
+ paddle.dataset.imikolov.test(word_dict, N), 32))
+ print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+ event.pass_id, event.batch_id, event.cost, event.metrics,
+ result.metrics)
+
+ cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+ parameters = paddle.parameters.create(cost)
+ adam_optimizer = paddle.optimizer.Adam(
+ learning_rate=3e-3,
+ regularization=paddle.optimizer.L2Regularization(8e-4))
+ trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+ trainer.train(
+ paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+ num_passes=30,
+ event_handler=event_handler)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/word2vec/train.sh b/word2vec/train.sh
deleted file mode 100755
index 1e7a7753aeed45e34165539ab34c2792ec8e8196..0000000000000000000000000000000000000000
--- a/word2vec/train.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-set -e
-
-paddle train \
- --config ngram.py \
- --use_gpu=1 \
- --dot_period=100 \
- --log_period=3000 \
- --test_period=0 \
- --save_dir=model \
- --num_passes=30