diff --git a/README.md b/README.md index fb0e20bf42560748f1c9633f19eb0d77090d1b32..33d6f94f3a0de2dd82709af50fc9ec55663a1ffd 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ PaddlePaddle提供了丰富的运算单元,帮助大家以模块化的方式 在词向量的例子中,我们向大家展示如何使用Hierarchical-Sigmoid 和噪声对比估计(Noise Contrastive Estimation,NCE)来加速词向量的学习。 -- 1.1 [Hsigmoid加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/word_embedding) +- 1.1 [Hsigmoid加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid) - 1.2 [噪声对比估计加速词向量训练](https://github.com/PaddlePaddle/models/tree/develop/nce_cost) diff --git a/hsigmoid/.gitignore b/hsigmoid/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..29a9367f0e91889df8654ad4293f0649de2074f0 --- /dev/null +++ b/hsigmoid/.gitignore @@ -0,0 +1,3 @@ +*.pyc +models + diff --git a/hsigmoid/README.md b/hsigmoid/README.md index 66798f9a2fe8e7921dd819a444b19183bd70de67..b8af766ba3712e55c8447b5f0fcd5763209ff6b4 100644 --- a/hsigmoid/README.md +++ b/hsigmoid/README.md @@ -50,7 +50,7 @@ def train_data(filename, word_dict, n): ``` ## 网络结构 -本文通过训练N-gram语言模型来获得词向量,具体地使用前4个词来预测当前词。网络输入为词在字典中的id,然后查询词向量词表获取词向量,接着拼接4个词的词向量,然后接入一个全连接隐层,最后是Hsigmoid层。详细网络结构见图2: +本文通过训练N-gram语言模型来获得词向量,具体地使用前4个词来预测当前词。网络输入为词在字典中的id,然后查询词向量词表获取词向量,接着拼接4个词的词向量,然后接入一个全连接隐层,最后是`Hsigmoid`层。详细网络结构见图2:
@@ -60,41 +60,27 @@ def train_data(filename, word_dict, n):
代码实现如下:
```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
- first_word = paddle.layer.data(
- name='firstw', type=paddle.data_type.integer_value(dict_size))
- second_word = paddle.layer.data(
- name='secondw', type=paddle.data_type.integer_value(dict_size))
- third_word = paddle.layer.data(
- name='thirdw', type=paddle.data_type.integer_value(dict_size))
- fourth_word = paddle.layer.data(
- name='fourthw', type=paddle.data_type.integer_value(dict_size))
- target_word = paddle.layer.data(
- name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+ emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
- embed_first_word = paddle.layer.embedding(
- input=first_word, size=embed_size, param_attr=embed_param_attr)
- embed_second_word = paddle.layer.embedding(
- input=second_word, size=embed_size, param_attr=embed_param_attr)
- embed_third_word = paddle.layer.embedding(
- input=third_word, size=embed_size, param_attr=embed_param_attr)
- embed_fourth_word = paddle.layer.embedding(
- input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
- embed_context = paddle.layer.concat(input=[
- embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
- ])
+ for i in range(gram_num):
+ word = paddle.layer.data(
+ name="__word%02d__" % (i),
+ type=paddle.data_type.integer_value(dict_size))
+ emb_layers.append(
+ paddle.layer.embedding(
+ input=word, size=embed_size, param_attr=embed_param_attr))
+
+ target_word = paddle.layer.data(
+ name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+ embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
- act=paddle.activation.Sigmoid(),
+ act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
@@ -105,27 +91,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
- param_attr=paddle.attr.Param(name='sigmoid_w'),
- bias_attr=paddle.attr.Param(name='sigmoid_b'))
+ param_attr=paddle.attr.Param(name="sigmoid_w"),
+ bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
- with paddle.layer.mixed(
- size=dict_size - 1,
- act=paddle.activation.Sigmoid(),
- bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
- prediction += paddle.layer.trans_full_matrix_projection(
- input=hidden_layer,
- param_attr=paddle.attr.Param(name='sigmoid_w'))
+ prediction = paddle.layer.fc(
+ size=dict_size - 1,
+ input=hidden_layer,
+ act=paddle.activation.Sigmoid(),
+ bias_attr=paddle.attr.Param(name="sigmoid_b"),
+ param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
需要注意,在预测阶段,我们需要对hsigmoid参数做一次转置,这里输出的类别数为词典大小减1,对应非叶节点的数量。
## 训练阶段
-训练比较简单,直接运行``` python hsigmoid_train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含imikolov数据集,如果未包含,则自动下载。运行过程中,每100个iteration会打印模型训练信息,主要包含训练损失和测试损失,每个pass会保存一次模型。
+训练比较简单,直接运行``` python train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含imikolov数据集,如果未包含,则自动下载。运行过程中,每10个iteration会打印模型训练信息,主要包含训练损失和测试损失,每个pass会保存一次模型。
## 预测阶段
-预测时,直接运行``` python hsigmoid_predict.py ```,程序会首先load模型,然后按照batch方式进行预测,并打印预测结果。预测阶段最重要的就是根据概率得到编码路径,然后遍历路径获取最终的预测类别,这部分逻辑如下:
+预测时,直接运行``` python infer.py ```,程序会首先load模型,然后按照batch方式进行预测,并打印预测结果。预测阶段最重要的就是根据概率得到编码路径,然后遍历路径获取最终的预测类别,这部分逻辑如下:
```python
def decode_res(infer_res, dict_size):
diff --git a/hsigmoid/hsigmoid_conf.py b/hsigmoid/hsigmoid_conf.py
deleted file mode 100644
index be6b7462a1487e906278fa2682d65add256aaa2d..0000000000000000000000000000000000000000
--- a/hsigmoid/hsigmoid_conf.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
- first_word = paddle.layer.data(
- name='firstw', type=paddle.data_type.integer_value(dict_size))
- second_word = paddle.layer.data(
- name='secondw', type=paddle.data_type.integer_value(dict_size))
- third_word = paddle.layer.data(
- name='thirdw', type=paddle.data_type.integer_value(dict_size))
- fourth_word = paddle.layer.data(
- name='fourthw', type=paddle.data_type.integer_value(dict_size))
- target_word = paddle.layer.data(
- name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
- embed_param_attr = paddle.attr.Param(
- name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
- embed_first_word = paddle.layer.embedding(
- input=first_word, size=embed_size, param_attr=embed_param_attr)
- embed_second_word = paddle.layer.embedding(
- input=second_word, size=embed_size, param_attr=embed_param_attr)
- embed_third_word = paddle.layer.embedding(
- input=third_word, size=embed_size, param_attr=embed_param_attr)
- embed_fourth_word = paddle.layer.embedding(
- input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
- embed_context = paddle.layer.concat(input=[
- embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
- ])
-
- hidden_layer = paddle.layer.fc(
- input=embed_context,
- size=hidden_size,
- act=paddle.activation.Sigmoid(),
- layer_attr=paddle.attr.Extra(drop_rate=0.5),
- bias_attr=paddle.attr.Param(learning_rate=2),
- param_attr=paddle.attr.Param(
- initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
-
- if is_train == True:
- cost = paddle.layer.hsigmoid(
- input=hidden_layer,
- label=target_word,
- num_classes=dict_size,
- param_attr=paddle.attr.Param(name='sigmoid_w'),
- bias_attr=paddle.attr.Param(name='sigmoid_b'))
- return cost
- else:
- with paddle.layer.mixed(
- size=dict_size - 1,
- act=paddle.activation.Sigmoid(),
- bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
- prediction += paddle.layer.trans_full_matrix_projection(
- input=hidden_layer,
- param_attr=paddle.attr.Param(name='sigmoid_w'))
- return prediction
diff --git a/hsigmoid/index.html b/hsigmoid/index.html
index 83f6809d669d9ec6e0dd002f414ba8247068e270..c53e110fdb80fabe5d82865fc5d4eb3512007d4a 100644
--- a/hsigmoid/index.html
+++ b/hsigmoid/index.html
@@ -92,7 +92,7 @@ def train_data(filename, word_dict, n):
```
## 网络结构
-本文通过训练N-gram语言模型来获得词向量,具体地使用前4个词来预测当前词。网络输入为词在字典中的id,然后查询词向量词表获取词向量,接着拼接4个词的词向量,然后接入一个全连接隐层,最后是Hsigmoid层。详细网络结构见图2:
+本文通过训练N-gram语言模型来获得词向量,具体地使用前4个词来预测当前词。网络输入为词在字典中的id,然后查询词向量词表获取词向量,接着拼接4个词的词向量,然后接入一个全连接隐层,最后是`Hsigmoid`层。详细网络结构见图2:
@@ -102,41 +102,27 @@ def train_data(filename, word_dict, n):
代码实现如下:
```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
- first_word = paddle.layer.data(
- name='firstw', type=paddle.data_type.integer_value(dict_size))
- second_word = paddle.layer.data(
- name='secondw', type=paddle.data_type.integer_value(dict_size))
- third_word = paddle.layer.data(
- name='thirdw', type=paddle.data_type.integer_value(dict_size))
- fourth_word = paddle.layer.data(
- name='fourthw', type=paddle.data_type.integer_value(dict_size))
- target_word = paddle.layer.data(
- name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+ emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
- embed_first_word = paddle.layer.embedding(
- input=first_word, size=embed_size, param_attr=embed_param_attr)
- embed_second_word = paddle.layer.embedding(
- input=second_word, size=embed_size, param_attr=embed_param_attr)
- embed_third_word = paddle.layer.embedding(
- input=third_word, size=embed_size, param_attr=embed_param_attr)
- embed_fourth_word = paddle.layer.embedding(
- input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
- embed_context = paddle.layer.concat(input=[
- embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
- ])
+ for i in range(gram_num):
+ word = paddle.layer.data(
+ name="__word%02d__" % (i),
+ type=paddle.data_type.integer_value(dict_size))
+ emb_layers.append(
+ paddle.layer.embedding(
+ input=word, size=embed_size, param_attr=embed_param_attr))
+
+ target_word = paddle.layer.data(
+ name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+ embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
- act=paddle.activation.Sigmoid(),
+ act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
@@ -147,27 +133,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
input=hidden_layer,
label=target_word,
num_classes=dict_size,
- param_attr=paddle.attr.Param(name='sigmoid_w'),
- bias_attr=paddle.attr.Param(name='sigmoid_b'))
+ param_attr=paddle.attr.Param(name="sigmoid_w"),
+ bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
- with paddle.layer.mixed(
- size=dict_size - 1,
- act=paddle.activation.Sigmoid(),
- bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
- prediction += paddle.layer.trans_full_matrix_projection(
- input=hidden_layer,
- param_attr=paddle.attr.Param(name='sigmoid_w'))
+ prediction = paddle.layer.fc(
+ size=dict_size - 1,
+ input=hidden_layer,
+ act=paddle.activation.Sigmoid(),
+ bias_attr=paddle.attr.Param(name="sigmoid_b"),
+ param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
需要注意,在预测阶段,我们需要对hsigmoid参数做一次转置,这里输出的类别数为词典大小减1,对应非叶节点的数量。
## 训练阶段
-训练比较简单,直接运行``` python hsigmoid_train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含imikolov数据集,如果未包含,则自动下载。运行过程中,每100个iteration会打印模型训练信息,主要包含训练损失和测试损失,每个pass会保存一次模型。
+训练比较简单,直接运行``` python train.py ```。程序第一次运行会检测用户缓存文件夹中是否包含imikolov数据集,如果未包含,则自动下载。运行过程中,每10个iteration会打印模型训练信息,主要包含训练损失和测试损失,每个pass会保存一次模型。
## 预测阶段
-预测时,直接运行``` python hsigmoid_predict.py ```,程序会首先load模型,然后按照batch方式进行预测,并打印预测结果。预测阶段最重要的就是根据概率得到编码路径,然后遍历路径获取最终的预测类别,这部分逻辑如下:
+预测时,直接运行``` python infer.py ```,程序会首先load模型,然后按照batch方式进行预测,并打印预测结果。预测阶段最重要的就是根据概率得到编码路径,然后遍历路径获取最终的预测类别,这部分逻辑如下:
```python
def decode_res(infer_res, dict_size):
diff --git a/hsigmoid/hsigmoid_predict.py b/hsigmoid/infer.py
similarity index 82%
rename from hsigmoid/hsigmoid_predict.py
rename to hsigmoid/infer.py
index 210f87ee103a2ac145e3c42cea536cd00d2994bb..32000238ee715e6ad8fcb9cb2484e7c532974987 100644
--- a/hsigmoid/hsigmoid_predict.py
+++ b/hsigmoid/infer.py
@@ -1,9 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.WARNING)
def decode_res(infer_res, dict_size):
@@ -45,21 +50,20 @@ def predict(batch_ins, idx_word_dict, dict_size, prediction_layer, parameters):
# Ouput format: word1 word2 word3 word4 -> predict label
for i, ins in enumerate(batch_ins):
- print(idx_word_dict[ins[0]] + ' ' + \
- idx_word_dict[ins[1]] + ' ' + \
- idx_word_dict[ins[2]] + ' ' + \
- idx_word_dict[ins[3]] + ' ' + \
- ' -> ' + predict_words[i])
+ print(" ".join([idx_word_dict[w]
+ for w in ins]) + " -> " + predict_words[i])
+
+def main(model_path):
+ assert os.path.exists(model_path), "trained model does not exist."
-def main():
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
- prediction_layer = network_conf(
+ prediction_layer = ngram_lm(
is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
- with gzip.open('./models/model_pass_00000.tar.gz') as f:
+ with gzip.open(model_path, "r") as f:
parameters = paddle.parameters.Parameters.from_tar(f)
idx_word_dict = dict((v, k) for k, v in word_dict.items())
@@ -79,5 +83,5 @@ def main():
parameters)
-if __name__ == '__main__':
- main()
+if __name__ == "__main__":
+    main("models/hsigmoid_pass_00000.tar.gz")  # train.py saves hsigmoid_pass_%05d.tar.gz
diff --git a/hsigmoid/network_conf.py b/hsigmoid/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..494494788c015fd76ab5914ba6c2a8161bde5785
--- /dev/null
+++ b/hsigmoid/network_conf.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import paddle.v2 as paddle
+
+
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    """Build the N-gram language model used to learn word embeddings.
+
+    Args:
+        hidden_size: width of the fully connected hidden layer.
+        embed_size: width of each word embedding.
+        dict_size: vocabulary size; every word id input is declared as
+            ``integer_value(dict_size)``.
+        gram_num: number of context words fed to the network.
+        is_train: if true, return the hierarchical-sigmoid cost layer;
+            otherwise return a layer with one sigmoid score for each of
+            the ``dict_size - 1`` non-leaf nodes.
+
+    Returns:
+        The hsigmoid cost layer when training, else the prediction layer.
+    """
+    embed_param_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+
+    # Every context position uses the same "_proj" parameter attribute.
+    emb_layers = []
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
+
+    hidden_layer = paddle.layer.fc(
+        input=embed_context,
+        size=hidden_size,
+        act=paddle.activation.Sigmoid(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5),
+        bias_attr=paddle.attr.Param(learning_rate=2),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
+
+    if is_train:
+        return paddle.layer.hsigmoid(
+            input=hidden_layer,
+            label=target_word,
+            num_classes=dict_size,
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
+
+    # The trained hsigmoid weight "sigmoid_w" has to be transposed before it
+    # can score the non-leaf nodes, so project through its transpose instead
+    # of feeding it to a plain fc layer; "sigmoid_b" is reused as the bias.
+    with paddle.layer.mixed(
+            size=dict_size - 1,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b")) as prediction:
+        prediction += paddle.layer.trans_full_matrix_projection(
+            input=hidden_layer,
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
+    return prediction
diff --git a/hsigmoid/hsigmoid_train.py b/hsigmoid/train.py
similarity index 55%
rename from hsigmoid/hsigmoid_train.py
rename to hsigmoid/train.py
index 0c2e1b236b284c3dfb32988b0d917eb830f365be..809c842af55b22daff3428db9b674065a16f1700 100644
--- a/hsigmoid/hsigmoid_train.py
+++ b/hsigmoid/train.py
@@ -1,40 +1,41 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
-def main():
+def main(save_dir="models"):
+ if not os.path.exists(save_dir):
+ os.mkdir(save_dir)
+
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
- cost = network_conf(
- is_train=True, hidden_size=256, embed_size=32, dict_size=dict_size)
+ cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
def event_handler(event):
if isinstance(event, paddle.event.EndPass):
- model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
- print("Save model into %s ..." % model_name)
- with gzip.open(model_name, 'w') as f:
+ model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
+ event.pass_id)
+ logger.info("Save model into %s ..." % model_name)
+ with gzip.open(model_name, "w") as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndIteration):
- if event.batch_id % 100 == 0:
+ if event.batch_id and event.batch_id % 10 == 0:
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, 5), 32))
- print("Pass %d, Batch %d, Cost %f, Test Cost %f" %
- (event.pass_id, event.batch_id, event.cost, result.cost))
-
- feeding = {
- 'firstw': 0,
- 'secondw': 1,
- 'thirdw': 2,
- 'fourthw': 3,
- 'fifthw': 4
- }
+ logger.info(
+ "Pass %d, Batch %d, Cost %f, Test Cost %f" %
+ (event.pass_id, event.batch_id, event.cost, result.cost))
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
@@ -48,9 +49,8 @@ def main():
lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
buf_size=1000), 64),
num_passes=30,
- event_handler=event_handler,
- feeding=feeding)
+ event_handler=event_handler)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()