From 66f866f178f62a9dafaa806042bd6de35b61866f Mon Sep 17 00:00:00 2001
From: qiaolongfei
Date: Fri, 15 Sep 2017 14:59:08 -0700
Subject: [PATCH] add save/load dict_and_embedding for word2vector

save_dict_and_embedding() writes the word dict and the "_proj" embedding
table to plain-text files after training; load_dict_and_embedding() reads
them back, restoring the values of the word dict as integers.
---
 04.word2vec/train.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/04.word2vec/train.py b/04.word2vec/train.py
index 550ba00..c19df00 100644
--- a/04.word2vec/train.py
+++ b/04.word2vec/train.py
@@ -1,4 +1,5 @@
 import math, os
+import numpy
 
 import paddle.v2 as paddle
 
@@ -18,6 +19,45 @@ def wordemb(inlayer):
     return wordemb
 
 
+# save and load word dict and embedding table
+def save_dict_and_embedding(word_dict, embeddings):
+    """Write the word dict and the embedding table to text files.
+
+    "word_dict" gets one "<key> <value>" line per dict entry and
+    "embedding_table" gets one comma-separated line per row of
+    `embeddings`.  Both files are opened relative to the working directory.
+    """
+    with open("word_dict", "w") as f:
+        for key in word_dict:
+            f.write(key + " " + str(word_dict[key]) + "\n")
+    with open("embedding_table", "w") as f:
+        for line in embeddings:
+            f.write(",".join([str(x) for x in line]) + "\n")
+
+
+def load_dict_and_embedding():
+    """Read back the files written by save_dict_and_embedding().
+
+    Returns a (word_dict, embeddings) tuple: word_dict maps each key to the
+    integer stored next to it, and embeddings is a list holding one numpy
+    array of floats per line of "embedding_table".
+    """
+    word_dict = dict()
+    embeddings = []
+
+    with open("word_dict", "r") as f:
+        for line in f:
+            # Split on the last space only and convert the value back to
+            # int, so the loaded dict matches the one that was saved.
+            key, value = line.rstrip("\n").rsplit(" ", 1)
+            word_dict[key] = int(value)
+    with open("embedding_table", "r") as f:
+        for line in f:
+            embeddings.append(
+                numpy.array([float(x) for x in line.strip().split(',')]))
+    return word_dict, embeddings
+
+
 def main():
     paddle.init(use_gpu=with_gpu, trainer_count=3)
     word_dict = paddle.dataset.imikolov.build_dict()
@@ -76,9 +116,13 @@ def main():
     trainer = paddle.trainer.SGD(cost, parameters, adagrad)
     trainer.train(
         paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
-        num_passes=100,
+        num_passes=1,
         event_handler=event_handler)
 
+    # save word dict and embedding table
+    embeddings = parameters.get("_proj").reshape(len(word_dict), embsize)
+    save_dict_and_embedding(word_dict, embeddings)
+
 
 if __name__ == '__main__':
     main()
-- 
GitLab