diff --git a/README.md b/README.md
index fb0e20bf42560748f1c9633f19eb0d77090d1b32..33d6f94f3a0de2dd82709af50fc9ec55663a1ffd 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ PaddlePaddle provides a rich collection of operators that help users build mode
 
 In the word embedding examples we show how to use Hierarchical Sigmoid and Noise Contrastive Estimation (NCE) to speed up the training of word embeddings.
 
-- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/word_embedding)
+- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
 - 1.2 [Accelerating word embedding training with NCE](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)
 
diff --git a/hsigmoid/.gitignore b/hsigmoid/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29a9367f0e91889df8654ad4293f0649de2074f0
--- /dev/null
+++ b/hsigmoid/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+models
+
diff --git a/hsigmoid/README.md b/hsigmoid/README.md
index 66798f9a2fe8e7921dd819a444b19183bd70de67..b8af766ba3712e55c8447b5f0fcd5763209ff6b4 100644
--- a/hsigmoid/README.md
+++ b/hsigmoid/README.md
@@ -50,7 +50,7 @@ def train_data(filename, word_dict, n):
 ```
 
 ## Network structure
-This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the Hsigmoid layer. Figure 2 shows the detailed network structure:
+This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the `Hsigmoid` layer. Figure 2 shows the detailed network structure:
 
 [Figure 2: network structure diagram — image embed omitted from this excerpt]
@@ -60,41 +60,27 @@ def train_data(filename, word_dict, n):
 The code is implemented as follows:
 
 ```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
     embed_param_attr = paddle.attr.Param(
         name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
 
     hidden_layer = paddle.layer.fc(
         input=embed_context,
         size=hidden_size,
         act=paddle.activation.Sigmoid(),
         layer_attr=paddle.attr.Extra(drop_rate=0.5),
         bias_attr=paddle.attr.Param(learning_rate=2),
         param_attr=paddle.attr.Param(
@@ -105,27 +91,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
             input=hidden_layer,
             label=target_word,
             num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
         return cost
     else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
         return prediction
 ```
 
 Note that at prediction time the hsigmoid parameters have to be transposed once. The number of output classes here is the dictionary size minus 1, matching the number of non-leaf nodes: a full binary coding tree with N leaves (words) has exactly N - 1 internal nodes.
 
 ## Training
-Training is straightforward: simply run `python hsigmoid_train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 100 iterations, and the model is saved once per pass.
+Training is straightforward: simply run `python train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 10 batches, and the model is saved once per pass.
 
 ## Prediction
-For prediction, simply run `python hsigmoid_predict.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
+For prediction, simply run `python infer.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
 
 ```python
 def decode_res(infer_res, dict_size):
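For reference, a minimal sketch of the path decoding that `decode_res` performs, reconstructed from the description above. It assumes the standard hsigmoid layout: each row of `infer_res` holds one sigmoid activation per internal node of a complete binary tree (`dict_size - 1` of them), and an activation above 0.5 selects the right child. The helper name and threshold handling are illustrative, not the patch's exact code:

```python
import numpy as np


def decode_res_sketch(infer_res, dict_size):
    """Map hsigmoid node activations back to word ids (illustrative)."""
    predict_lbls = []
    # Each row: dict_size - 1 activations, one per internal tree node;
    # > 0.5 is read as "take the right child".
    for bits in np.asarray(infer_res) > 0.5:
        node, code = 0, 1
        while node < len(bits):
            code = (code << 1) | int(bits[node])        # append one path bit
            node = node * 2 + (2 if bits[node] else 1)  # right / left child
        predict_lbls.append(code - dict_size)           # leaf code -> word id
    return predict_lbls
```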
diff --git a/hsigmoid/hsigmoid_conf.py b/hsigmoid/hsigmoid_conf.py
deleted file mode 100644
index be6b7462a1487e906278fa2682d65add256aaa2d..0000000000000000000000000000000000000000
--- a/hsigmoid/hsigmoid_conf.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
-    embed_param_attr = paddle.attr.Param(
-        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
-
-    hidden_layer = paddle.layer.fc(
-        input=embed_context,
-        size=hidden_size,
-        act=paddle.activation.Sigmoid(),
-        layer_attr=paddle.attr.Extra(drop_rate=0.5),
-        bias_attr=paddle.attr.Param(learning_rate=2),
-        param_attr=paddle.attr.Param(
-            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
-
-    if is_train == True:
-        cost = paddle.layer.hsigmoid(
-            input=hidden_layer,
-            label=target_word,
-            num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
-        return cost
-    else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
-        return prediction
diff --git a/hsigmoid/index.html b/hsigmoid/index.html
index 83f6809d669d9ec6e0dd002f414ba8247068e270..c53e110fdb80fabe5d82709af50fc9ec5d4eb3512007d4a 100644
--- a/hsigmoid/index.html
+++ b/hsigmoid/index.html
@@ -92,7 +92,7 @@ def train_data(filename, word_dict, n):
 ```
 
 ## Network structure
-This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the Hsigmoid layer. Figure 2 shows the detailed network structure:
+This example obtains word embeddings by training an N-gram language model; concretely, the previous 4 words are used to predict the current word. The network input is each word's id in the dictionary; the ids are used to look up the embedding table, the embeddings of the 4 words are concatenated and fed into a fully connected hidden layer, followed by the `Hsigmoid` layer. Figure 2 shows the detailed network structure:
 
 [Figure 2: network structure diagram — image embed omitted from this excerpt]
@@ -102,41 +102,27 @@ def train_data(filename, word_dict, n):
 The code is implemented as follows:
 
 ```python
-import math
-import paddle.v2 as paddle
-
-
-def network_conf(hidden_size, embed_size, dict_size, is_train=True):
-    first_word = paddle.layer.data(
-        name='firstw', type=paddle.data_type.integer_value(dict_size))
-    second_word = paddle.layer.data(
-        name='secondw', type=paddle.data_type.integer_value(dict_size))
-    third_word = paddle.layer.data(
-        name='thirdw', type=paddle.data_type.integer_value(dict_size))
-    fourth_word = paddle.layer.data(
-        name='fourthw', type=paddle.data_type.integer_value(dict_size))
-    target_word = paddle.layer.data(
-        name='fifthw', type=paddle.data_type.integer_value(dict_size))
-
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
     embed_param_attr = paddle.attr.Param(
         name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
-    embed_first_word = paddle.layer.embedding(
-        input=first_word, size=embed_size, param_attr=embed_param_attr)
-    embed_second_word = paddle.layer.embedding(
-        input=second_word, size=embed_size, param_attr=embed_param_attr)
-    embed_third_word = paddle.layer.embedding(
-        input=third_word, size=embed_size, param_attr=embed_param_attr)
-    embed_fourth_word = paddle.layer.embedding(
-        input=fourth_word, size=embed_size, param_attr=embed_param_attr)
-
-    embed_context = paddle.layer.concat(input=[
-        embed_first_word, embed_second_word, embed_third_word, embed_fourth_word
-    ])
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
 
     hidden_layer = paddle.layer.fc(
         input=embed_context,
         size=hidden_size,
         act=paddle.activation.Sigmoid(),
         layer_attr=paddle.attr.Extra(drop_rate=0.5),
         bias_attr=paddle.attr.Param(learning_rate=2),
         param_attr=paddle.attr.Param(
@@ -147,27 +133,26 @@ def network_conf(hidden_size, embed_size, dict_size, is_train=True):
             input=hidden_layer,
             label=target_word,
             num_classes=dict_size,
-            param_attr=paddle.attr.Param(name='sigmoid_w'),
-            bias_attr=paddle.attr.Param(name='sigmoid_b'))
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
         return cost
     else:
-        with paddle.layer.mixed(
-                size=dict_size - 1,
-                act=paddle.activation.Sigmoid(),
-                bias_attr=paddle.attr.Param(name='sigmoid_b')) as prediction:
-            prediction += paddle.layer.trans_full_matrix_projection(
-                input=hidden_layer,
-                param_attr=paddle.attr.Param(name='sigmoid_w'))
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
         return prediction
 ```
 
 Note that at prediction time the hsigmoid parameters have to be transposed once. The number of output classes here is the dictionary size minus 1, matching the number of non-leaf nodes: a full binary coding tree with N leaves (words) has exactly N - 1 internal nodes.
 
 ## Training
-Training is straightforward: simply run `python hsigmoid_train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 100 iterations, and the model is saved once per pass.
+Training is straightforward: simply run `python train.py`. On its first run the program checks whether the imikolov dataset is in the user's cache directory and downloads it automatically if not. While training, the training cost and test cost are logged every 10 batches, and the model is saved once per pass.
 
 ## Prediction
-For prediction, simply run `python hsigmoid_predict.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
+For prediction, simply run `python infer.py`. The program first loads the model, then predicts batch by batch and prints the results. The key step in prediction is to recover the coding path from the node probabilities and then traverse that path to obtain the final predicted class. This logic is implemented as follows:
 
 ```python
 def decode_res(infer_res, dict_size):
diff --git a/hsigmoid/hsigmoid_predict.py b/hsigmoid/infer.py
similarity index 82%
rename from hsigmoid/hsigmoid_predict.py
rename to hsigmoid/infer.py
index 210f87ee103a2ac145e3c42cea536cd00d2994bb..32000238ee715e6ad8fcb9cb2484e7c532974987 100644
--- a/hsigmoid/hsigmoid_predict.py
+++ b/hsigmoid/infer.py
@@ -1,9 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
 import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.WARNING)
 
 
 def decode_res(infer_res, dict_size):
@@ -45,21 +50,20 @@ def predict(batch_ins, idx_word_dict, dict_size, prediction_layer, parameters):
 
     # Output format: word1 word2 word3 word4 -> predict label
     for i, ins in enumerate(batch_ins):
-        print(idx_word_dict[ins[0]] + ' ' + \
-              idx_word_dict[ins[1]] + ' ' + \
-              idx_word_dict[ins[2]] + ' ' + \
-              idx_word_dict[ins[3]] + ' ' + \
-              ' -> ' + predict_words[i])
+        print(" ".join([idx_word_dict[w]
+                        for w in ins]) + " -> " + predict_words[i])
+
+
+def main(model_path):
+    assert os.path.exists(model_path), "trained model does not exist."
 
-def main():
     paddle.init(use_gpu=False, trainer_count=1)
     word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
     dict_size = len(word_dict)
-    prediction_layer = network_conf(
+    prediction_layer = ngram_lm(
         is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
 
-    with gzip.open('./models/model_pass_00000.tar.gz') as f:
+    with gzip.open(model_path, "r") as f:
         parameters = paddle.parameters.Parameters.from_tar(f)
 
     idx_word_dict = dict((v, k) for k, v in word_dict.items())
@@ -79,5 +83,5 @@ def main():
         parameters)
 
 
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main("models/hsigmoid_pass_00000.tar.gz")
diff --git a/hsigmoid/network_conf.py b/hsigmoid/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..494494788c015fd76ab5914ba6c2a8161bde5785
--- /dev/null
+++ b/hsigmoid/network_conf.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import math
+import paddle.v2 as paddle
+
+
+def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
+    emb_layers = []
+    embed_param_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+    for i in range(gram_num):
+        word = paddle.layer.data(
+            name="__word%02d__" % (i),
+            type=paddle.data_type.integer_value(dict_size))
+        emb_layers.append(
+            paddle.layer.embedding(
+                input=word, size=embed_size, param_attr=embed_param_attr))
+
+    target_word = paddle.layer.data(
+        name="__target_word__", type=paddle.data_type.integer_value(dict_size))
+
+    embed_context = paddle.layer.concat(input=emb_layers)
+
+    hidden_layer = paddle.layer.fc(
+        input=embed_context,
+        size=hidden_size,
+        act=paddle.activation.Sigmoid(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5),
+        bias_attr=paddle.attr.Param(learning_rate=2),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
+
+    if is_train:
+        cost = paddle.layer.hsigmoid(
+            input=hidden_layer,
+            label=target_word,
+            num_classes=dict_size,
+            param_attr=paddle.attr.Param(name="sigmoid_w"),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"))
+        return cost
+    else:
+        prediction = paddle.layer.fc(
+            size=dict_size - 1,
+            input=hidden_layer,
+            act=paddle.activation.Sigmoid(),
+            bias_attr=paddle.attr.Param(name="sigmoid_b"),
+            param_attr=paddle.attr.Param(name="sigmoid_w"))
+        return prediction
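Because the training branch (`paddle.layer.hsigmoid`) and the inference branch (`paddle.layer.fc`) of `ngram_lm` deliberately share the parameter names `"sigmoid_w"` and `"sigmoid_b"`, parameters trained through the first can be loaded into the second. A minimal sketch of that round trip, mirroring what `infer.py` does (the checkpoint path and word ids below are illustrative):

```python
import gzip

import paddle.v2 as paddle
from network_conf import ngram_lm

paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)

# Inference branch: dict_size - 1 sigmoid activations, one per internal
# node of the coding tree.
prediction = ngram_lm(
    hidden_size=256, embed_size=32, dict_size=len(word_dict), is_train=False)

# The fc layer picks up the weights learned through paddle.layer.hsigmoid
# because both branches name them "sigmoid_w" / "sigmoid_b".
# (Checkpoint path is illustrative -- use one produced by train.py.)
with gzip.open("models/hsigmoid_pass_00000.tar.gz", "r") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)

# Predict the word following one 4-gram of word ids (illustrative values).
probs = paddle.infer(
    output_layer=prediction, parameters=parameters, input=[(4, 42, 7, 93)])
```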
diff --git a/hsigmoid/hsigmoid_train.py b/hsigmoid/train.py
similarity index 55%
rename from hsigmoid/hsigmoid_train.py
rename to hsigmoid/train.py
index 0c2e1b236b284c3dfb32988b0d917eb830f365be..809c842af55b22daff3428db9b674065a16f1700 100644
--- a/hsigmoid/hsigmoid_train.py
+++ b/hsigmoid/train.py
@@ -1,40 +1,41 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+import os
+import logging
+import gzip
 import paddle.v2 as paddle
-from hsigmoid_conf import network_conf
-import gzip
+from network_conf import ngram_lm
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
 
 
-def main():
+def main(save_dir="models"):
+    if not os.path.exists(save_dir):
+        os.mkdir(save_dir)
+
     paddle.init(use_gpu=False, trainer_count=1)
     word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
     dict_size = len(word_dict)
-    cost = network_conf(
-        is_train=True, hidden_size=256, embed_size=32, dict_size=dict_size)
+    cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
 
     def event_handler(event):
         if isinstance(event, paddle.event.EndPass):
-            model_name = './models/model_pass_%05d.tar.gz' % event.pass_id
-            print("Save model into %s ..." % model_name)
-            with gzip.open(model_name, 'w') as f:
+            model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
+                                      event.pass_id)
+            logger.info("Save model into %s ..." % model_name)
+            with gzip.open(model_name, "w") as f:
                 parameters.to_tar(f)
 
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
+            if event.batch_id and event.batch_id % 10 == 0:
                 result = trainer.test(
                     paddle.batch(
                         paddle.dataset.imikolov.test(word_dict, 5), 32))
-                print("Pass %d, Batch %d, Cost %f, Test Cost %f" %
-                      (event.pass_id, event.batch_id, event.cost, result.cost))
-
-    feeding = {
-        'firstw': 0,
-        'secondw': 1,
-        'thirdw': 2,
-        'fourthw': 3,
-        'fifthw': 4
-    }
+                logger.info(
+                    "Pass %d, Batch %d, Cost %f, Test Cost %f" %
+                    (event.pass_id, event.batch_id, event.cost, result.cost))
 
     parameters = paddle.parameters.create(cost)
     adam_optimizer = paddle.optimizer.Adam(
@@ -48,9 +49,8 @@ def main():
             lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
             buf_size=1000), 64),
         num_passes=30,
-        event_handler=event_handler,
-        feeding=feeding)
+        event_handler=event_handler)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
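A note on the dropped `feeding` dict: `paddle.dataset.imikolov.train(word_dict, 5)` yields 5-tuples of word ids, and `ngram_lm` now declares its data layers in exactly that order (`__word00__` through `__word03__`, then `__target_word__`), so PaddlePaddle's default feeding order already matches the tuple order. A quick sketch to inspect the reader contract (assumes the imikolov dataset is already cached):

```python
import paddle.v2 as paddle

word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
# Each sample is five integer ids: four context words followed by the
# target word, matching the order of the data layers in ngram_lm.
sample = next(paddle.dataset.imikolov.train(word_dict, 5)())
print(len(sample))  # 5
```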