Merge pull request #410 from jacquesqiao/use-word2vec

add save/load dict_and_embedding for word2vector

Merge pull request #410 from jacquesqiao/use-word2vec
add save/load dict_and_embedding for word2vector
37a6556e · Qiao Longfei · GitHub · ceb3fc4e · da00779c · 37a6556e
5 changed files
--- a/04.word2vec/README.cn.md
+++ b/04.word2vec/README.cn.md
@@ -207,6 +207,28 @@ hiddensize = 256 # 隐层维度
 N = 5 # 训练5-Gram
 ```
+用于保存和加载word_dict和embedding table的函数
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write word_dict to "word_dict" and embeddings to "embedding_table".
+
+    The dict is stored as one "<key> <value>" line per entry; the embeddings
+    are stored as comma-separated text via numpy.savetxt.
+    """
+    with open("word_dict", "w") as dict_file:
+        for word, index in word_dict.items():
+            dict_file.write(word + " " + str(index) + "\n")
+    with open("embedding_table", "w") as table_file:
+        numpy.savetxt(table_file, embeddings, delimiter=',', newline='\n')
+def load_dict_and_embedding():
+    """Load the word dict and embedding table written by save_dict_and_embedding.
+
+    Reads "word_dict" (one "<word> <id>" pair per line) and "embedding_table"
+    (comma-separated rows) from the current working directory.
+
+    Returns:
+        A (word_dict, embeddings) tuple: word_dict maps each word to its
+        integer id, embeddings is the array parsed by numpy.loadtxt.
+    """
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only, so the id is always the final field.
+            key, value = line.rstrip("\r\n").rsplit(" ", 1)
+            # Ids were written with str(); convert back so the dict round-trips.
+            word_dict[key] = int(value)
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
 接着，定义网络结构：
 - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
@@ -333,6 +355,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 经过30个pass，我们将得到平均错误率为classification_error_evaluator=0.735611。
+## 保存词典和embedding
+训练完成之后，我们可以把词典和embedding table单独保存下来，后面可以直接使用
+```python
+# Reshape the "_proj" parameter to (len(word_dict), embsize) and save it together with word_dict
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
 ## 应用模型
 训练模型后，我们可以加载模型参数，用训练出来的词向量初始化其他模型，也可以将模型查看参数用来做后续应用。

--- a/04.word2vec/README.md
+++ b/04.word2vec/README.md
@@ -224,6 +224,29 @@ hiddensize = 256 # hidden layer dimension
 N = 5 # train 5-gram
 ```
+- Functions used to save and load the word dict and embedding table:
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write word_dict to "word_dict" and embeddings to "embedding_table".
+
+    The dict is stored as one "<key> <value>" line per entry; the embeddings
+    are stored as comma-separated text via numpy.savetxt.
+    """
+    with open("word_dict", "w") as dict_file:
+        for word, index in word_dict.items():
+            dict_file.write(word + " " + str(index) + "\n")
+    with open("embedding_table", "w") as table_file:
+        numpy.savetxt(table_file, embeddings, delimiter=',', newline='\n')
+def load_dict_and_embedding():
+    """Load the word dict and embedding table written by save_dict_and_embedding.
+
+    Reads "word_dict" (one "<word> <id>" pair per line) and "embedding_table"
+    (comma-separated rows) from the current working directory.
+
+    Returns:
+        A (word_dict, embeddings) tuple: word_dict maps each word to its
+        integer id, embeddings is the array parsed by numpy.loadtxt.
+    """
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only, so the id is always the final field.
+            key, value = line.rstrip("\r\n").rsplit(" ", 1)
+            # Ids were written with str(); convert back so the dict round-trips.
+            word_dict[key] = int(value)
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
 - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector through a matrix of dimension $|V|\times D$ (D=32 in this example).
 ```python
@@ -343,6 +366,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 After 30 passes, we can get average error rate around 0.735611.
+## Save word dict and embedding table
+After training, we can save the word dict and embedding table for future use.
+```python
+# Reshape the "_proj" parameter to (len(word_dict), embsize) and save it together with word_dict
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
 ## Model Application

--- a/04.word2vec/index.cn.html
+++ b/04.word2vec/index.cn.html
@@ -249,6 +249,28 @@ hiddensize = 256 # 隐层维度
 N = 5 # 训练5-Gram
 ```
+用于保存和加载word_dict和embedding table的函数
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write word_dict to "word_dict" and embeddings to "embedding_table".
+
+    The dict is stored as one "<key> <value>" line per entry; the embeddings
+    are stored as comma-separated text via numpy.savetxt.
+    """
+    with open("word_dict", "w") as dict_file:
+        for word, index in word_dict.items():
+            dict_file.write(word + " " + str(index) + "\n")
+    with open("embedding_table", "w") as table_file:
+        numpy.savetxt(table_file, embeddings, delimiter=',', newline='\n')
+def load_dict_and_embedding():
+    """Load the word dict and embedding table written by save_dict_and_embedding.
+
+    Reads "word_dict" (one "<word> <id>" pair per line) and "embedding_table"
+    (comma-separated rows) from the current working directory.
+
+    Returns:
+        A (word_dict, embeddings) tuple: word_dict maps each word to its
+        integer id, embeddings is the array parsed by numpy.loadtxt.
+    """
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only, so the id is always the final field.
+            key, value = line.rstrip("\r\n").rsplit(" ", 1)
+            # Ids were written with str(); convert back so the dict round-trips.
+            word_dict[key] = int(value)
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
 接着，定义网络结构：
 - 将$w_t$之前的$n-1$个词 $w_{t-n+1},...w_{t-1}$，通过$|V|\times D$的矩阵映射到D维词向量（本例中取D=32）。
@@ -375,6 +397,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 经过30个pass，我们将得到平均错误率为classification_error_evaluator=0.735611。
+## 保存词典和embedding
+训练完成之后，我们可以把词典和embedding table单独保存下来，后面可以直接使用
+```python
+# Reshape the "_proj" parameter to (len(word_dict), embsize) and save it together with word_dict
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
 ## 应用模型
 训练模型后，我们可以加载模型参数，用训练出来的词向量初始化其他模型，也可以将模型查看参数用来做后续应用。

--- a/04.word2vec/index.html
+++ b/04.word2vec/index.html
@@ -266,6 +266,29 @@ hiddensize = 256 # hidden layer dimension
 N = 5 # train 5-gram
 ```
+- Functions used to save and load the word dict and embedding table:
+```python
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write word_dict to "word_dict" and embeddings to "embedding_table".
+
+    The dict is stored as one "<key> <value>" line per entry; the embeddings
+    are stored as comma-separated text via numpy.savetxt.
+    """
+    with open("word_dict", "w") as dict_file:
+        for word, index in word_dict.items():
+            dict_file.write(word + " " + str(index) + "\n")
+    with open("embedding_table", "w") as table_file:
+        numpy.savetxt(table_file, embeddings, delimiter=',', newline='\n')
+def load_dict_and_embedding():
+    """Load the word dict and embedding table written by save_dict_and_embedding.
+
+    Reads "word_dict" (one "<word> <id>" pair per line) and "embedding_table"
+    (comma-separated rows) from the current working directory.
+
+    Returns:
+        A (word_dict, embeddings) tuple: word_dict maps each word to its
+        integer id, embeddings is the array parsed by numpy.loadtxt.
+    """
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only, so the id is always the final field.
+            key, value = line.rstrip("\r\n").rsplit(" ", 1)
+            # Ids were written with str(); convert back so the dict round-trips.
+            word_dict[key] = int(value)
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
+```
 - Map the $n-1$ words $w_{t-n+1},...w_{t-1}$ before $w_t$ to a D-dimensional vector through a matrix of dimension $|V|\times D$ (D=32 in this example).
 ```python
@@ -385,6 +408,16 @@ Pass 0, Batch 200, Cost 5.786797, {'classification_error_evaluator': 0.8125}, Te
 After 30 passes, we can get average error rate around 0.735611.
+## Save word dict and embedding table
+After training, we can save the word dict and embedding table for future use.
+```python
+# Reshape the "_proj" parameter to (len(word_dict), embsize) and save it together with word_dict
+embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+save_dict_and_embedding(word_dict, embeddings)
+```
 ## Model Application

--- a/04.word2vec/train.py
+++ b/04.word2vec/train.py
-import math, os
+import math
+import os
+import numpy
 import paddle.v2 as paddle
 with_gpu = os.getenv('WITH_GPU', '0') != '0'
@@ -18,6 +20,26 @@ def wordemb(inlayer):
    return wordemb
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write word_dict to "word_dict" and embeddings to "embedding_table".
+
+    The dict is stored as one "<key> <value>" line per entry; the embeddings
+    are stored as comma-separated text via numpy.savetxt.
+    """
+    with open("word_dict", "w") as dict_file:
+        for word, index in word_dict.items():
+            dict_file.write(word + " " + str(index) + "\n")
+    with open("embedding_table", "w") as table_file:
+        numpy.savetxt(table_file, embeddings, delimiter=',', newline='\n')
+def load_dict_and_embedding():
+    """Load the word dict and embedding table written by save_dict_and_embedding.
+
+    Reads "word_dict" (one "<word> <id>" pair per line) and "embedding_table"
+    (comma-separated rows) from the current working directory.
+
+    Returns:
+        A (word_dict, embeddings) tuple: word_dict maps each word to its
+        integer id, embeddings is the array parsed by numpy.loadtxt.
+    """
+    word_dict = dict()
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only, so the id is always the final field.
+            key, value = line.rstrip("\r\n").rsplit(" ", 1)
+            # Ids were written with str(); convert back so the dict round-trips.
+            word_dict[key] = int(value)
+    embeddings = numpy.loadtxt("embedding_table", delimiter=",")
+    return word_dict, embeddings
 def main():
    paddle.init(use_gpu=with_gpu, trainer_count=3)
    word_dict = paddle.dataset.imikolov.build_dict()
@@ -79,6 +101,10 @@ def main():
        num_passes=100,
        event_handler=event_handler)
+    # save word dict and embedding table
+    embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+    save_dict_and_embedding(word_dict, embeddings)
 if __name__ == '__main__':
    main()