diff --git a/04.word2vec/README.cn.md b/04.word2vec/README.cn.md index 7c5e9bc1eb4dc89ebe2eb61cb250c0663008412c..f9aac91005f69f63dbe392b8c219f2a93a1c77df 100644 --- a/04.word2vec/README.cn.md +++ b/04.word2vec/README.cn.md @@ -207,6 +207,28 @@ hiddensize = 256 # 隐层维度 N = 5 # 训练5-Gram ``` +用于保存和加载word_dict和embedding table的函数 +```python +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings +``` + 接着,定义网络结构: - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$,通过$|V|\times D$的矩阵映射到D维词向量(本例中取D=32)。 @@ -333,6 +355,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te 经过30个pass,我们将得到平均错误率为classification_error_evaluator=0.735611。 +## 保存词典和embedding + +训练完成之后,我们可以把词典和embedding table单独保存下来,后面可以直接使用 + +```python +# save word dict and embedding table +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) +save_dict_and_embedding(word_dict, embeddings) +``` + ## 应用模型 训练模型后,我们可以加载模型参数,用训练出来的词向量初始化其他模型,也可以将模型查看参数用来做后续应用。 diff --git a/04.word2vec/README.md b/04.word2vec/README.md index 76dd29d52bfca527710b638ff22b24768e8a8e99..71444ef4a97b6b13dfc311da9c925d3230923560 100644 --- a/04.word2vec/README.md +++ b/04.word2vec/README.md @@ -224,6 +224,29 @@ hiddensize = 256 # hidden layer dimension N = 5 # train 5-gram ``` + +- Functions used to save and load the word dict and embedding table: +```python +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f:
+ for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings +``` + - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector though matrix of dimention $|V|\times D$ (D=32 in this example). ```python @@ -343,6 +366,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te After 30 passes, we can get average error rate around 0.735611. +## Save word dict and embedding table + +After training, we can save the word dict and embedding table for future use. + +```python +# save word dict and embedding table +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) +save_dict_and_embedding(word_dict, embeddings) +``` + ## Model Application diff --git a/04.word2vec/index.cn.html b/04.word2vec/index.cn.html index 5bb2b31efb61b1edb188778daada87a3dee1cdff..f8a4a0bfd9450f2007848db5c1faa6486b152497 100644 --- a/04.word2vec/index.cn.html +++ b/04.word2vec/index.cn.html @@ -249,6 +249,28 @@ hiddensize = 256 # 隐层维度 N = 5 # 训练5-Gram ``` +用于保存和加载word_dict和embedding table的函数 +```python +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") +
return word_dict, embeddings +``` + 接着,定义网络结构: - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$,通过$|V|\times D$的矩阵映射到D维词向量(本例中取D=32)。 @@ -375,6 +397,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te 经过30个pass,我们将得到平均错误率为classification_error_evaluator=0.735611。 +## 保存词典和embedding + +训练完成之后,我们可以把词典和embedding table单独保存下来,后面可以直接使用 + +```python +# save word dict and embedding table +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) +save_dict_and_embedding(word_dict, embeddings) +``` + ## 应用模型 训练模型后,我们可以加载模型参数,用训练出来的词向量初始化其他模型,也可以将模型查看参数用来做后续应用。 diff --git a/04.word2vec/index.html b/04.word2vec/index.html index aa2fdd184e45d64cd2c315ae544aa7407873dd17..8dce7e228382c076a9ea98c118888dea059117a7 100644 --- a/04.word2vec/index.html +++ b/04.word2vec/index.html @@ -266,6 +266,29 @@ hiddensize = 256 # hidden layer dimension N = 5 # train 5-gram ``` + +- Functions used to save and load the word dict and embedding table: +```python +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings +``` + - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector though matrix of dimention $|V|\times D$ (D=32 in this example). ```python @@ -385,6 +408,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te After 30 passes, we can get average error rate around 0.735611.
+## Save word dict and embedding table + +After training, we can save the word dict and embedding table for future use. + +```python +# save word dict and embedding table +embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) +save_dict_and_embedding(word_dict, embeddings) +``` + ## Model Application diff --git a/04.word2vec/train.py b/04.word2vec/train.py index 550ba0093d62f50b674eef3663f13323f57dbd3c..435af6a602c00aaf9cc2ac6c4a8084e265f1f2d2 100644 --- a/04.word2vec/train.py +++ b/04.word2vec/train.py @@ -1,5 +1,7 @@ -import math, os +import math +import os +import numpy import paddle.v2 as paddle with_gpu = os.getenv('WITH_GPU', '0') != '0' @@ -18,6 +20,26 @@ def wordemb(inlayer): return wordemb +# save and load word dict and embedding table +def save_dict_and_embedding(word_dict, embeddings): + with open("word_dict", "w") as f: + for key in word_dict: + f.write(key + " " + str(word_dict[key]) + "\n") + with open("embedding_table", "w") as f: + numpy.savetxt(f, embeddings, delimiter=',', newline='\n') + + +def load_dict_and_embedding(): + word_dict = dict() + with open("word_dict", "r") as f: + for line in f: + key, value = line.strip().split(" ") + word_dict[key] = int(value) # ids were written as text by save_dict_and_embedding; restore them as ints + + embeddings = numpy.loadtxt("embedding_table", delimiter=",") + return word_dict, embeddings + + def main(): paddle.init(use_gpu=with_gpu, trainer_count=3) word_dict = paddle.dataset.imikolov.build_dict() @@ -79,6 +101,10 @@ def main(): num_passes=100, event_handler=event_handler) + # save word dict and embedding table + embeddings = parameters.get("_proj").reshape(len(word_dict), embsize) + save_dict_and_embedding(word_dict, embeddings) + if __name__ == '__main__': main()