Unverified commit 62148b78, authored by J Jiabin Yang, committed by GitHub

Merge pull request #1545 from JiabinYang/add-word2vec

support no third_party vocab
@@ -25,6 +25,7 @@ cd data && ./download.sh && cd ..
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
+If you would like to use the supported third-party vocabulary, set --other_dict_path to where you keep the vocabulary file you will use, and turn on --with_other_dict to enable it.

## Training
The command-line options for training can be listed with `python train.py -h`.
...
@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
+If you would like to use our supported third-party vocab, please run:
+```bash
+wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
+```

## Model
This model implements a skip-gram model of word2vec.
@@ -24,8 +29,10 @@ This model implement a skip-gram model of word2vector.
Preprocess the training data to generate a word dict.
```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
```
+If you would like to use our supported third-party vocab, please set --other_dict_path to the directory where you saved the vocab you will use, and set the --with_other_dict flag to use it.
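For illustration only (not part of this patch), a full preprocessing call with the third-party vocab could look like the sketch below; the vocab filename comes from the wget step above, and the exact paths are assumptions:

```python
# Hypothetical invocation assembled in Python; adjust paths to your own layout.
import subprocess

subprocess.run(
    [
        "python", "preprocess.py",
        "--data_path",
        "./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled",
        "--is_local",
        "--dict_path", "data/1-billion_dict",
        "--with_other_dict",
        "--other_dict_path", "./vocab-2016-09-10.txt",  # file fetched by the wget step
    ],
    check=True)
```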
## Train
The command line options for training can be listed by `python train.py -h`.
...
@@ -2,11 +2,9 @@ import time
import os
import paddle.fluid as fluid
import numpy as np
-from Queue import PriorityQueue
import logging
import argparse
import preprocess
-from sklearn.metrics.pairwise import cosine_similarity

word_to_id = dict()
id_to_word = dict()
@@ -51,8 +49,8 @@ def parse_args():
        '--test_acc',
        action='store_true',
        required=False,
-        default=True,
-        help='if using test_files , (default: True)')
+        default=False,
+        help='if using test_files , (default: False)')
    parser.add_argument(
        '--test_files_dir',
        type=str,
@@ -85,14 +83,12 @@ def build_test_case_from_file(args, emb):
    logger.info("test files list: {}".format(current_list))
    test_cases = list()
    test_labels = list()
+    test_case_descs = list()
    exclude_lists = list()
    for file_dir in current_list:
        with open(args.test_files_dir + "/" + file_dir, 'r') as f:
-            count = 0
            for line in f:
-                if count == 0:
-                    pass
-                elif ':' in line:
+                if ':' in line:
                    logger.info("{}".format(line))
                    pass
                else:
@@ -102,14 +98,15 @@ def build_test_case_from_file(args, emb):
                        line.split()[2]]]
                    test_case_desc = line.split()[0] + " - " + line.split()[
                        1] + " + " + line.split()[2] + " = " + line.split()[3]
-                    test_cases.append([test_case, test_case_desc])
+                    test_cases.append(test_case)
+                    test_case_descs.append(test_case_desc)
                    test_labels.append(word_to_id[line.split()[3]])
                    exclude_lists.append([
                        word_to_id[line.split()[0]],
                        word_to_id[line.split()[1]], word_to_id[line.split()[2]]
                    ])
-                count += 1
+    test_cases = norm(np.array(test_cases))
-    return test_cases, test_labels, exclude_lists
+    return test_cases, test_case_descs, test_labels, exclude_lists
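The analogy files parsed above use one `a b c d` line per case ("a is to b as c is to d", e.g. "old - older + deeper = deep"); a minimal standalone sketch of how such a line becomes a test case (the helper name and signature are illustrative, not this file's code):

```python
def analogy_case(emb, word_to_id, line):
    """Turn an 'a b c d' analogy line into (query_vector, expected_id, excluded_ids).

    The query follows the convention used above: emb[a] - emb[b] + emb[c]
    should land near emb[d]; a, b and c are excluded from the search.
    """
    a, b, c, d = line.split()[:4]
    query = emb[word_to_id[a]] - emb[word_to_id[b]] + emb[word_to_id[c]]
    exclude = [word_to_id[a], word_to_id[b], word_to_id[c]]
    return query, word_to_id[d], exclude
```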
def build_small_test_case(emb):
@@ -133,8 +130,27 @@ def build_small_test_case(emb):
        'deeper']]
    desc5 = "old - older + deeper = deep"
    label5 = word_to_id["deep"]
-    return [[emb1, desc1], [emb2, desc2], [emb3, desc3], [emb4, desc4],
-            [emb5, desc5]], [label1, label2, label3, label4, label5]
+    emb6 = emb[word_to_id['boy']]
+    desc6 = "boy"
+    label6 = word_to_id["boy"]
+    emb7 = emb[word_to_id['king']]
+    desc7 = "king"
+    label7 = word_to_id["king"]
+    emb8 = emb[word_to_id['sun']]
+    desc8 = "sun"
+    label8 = word_to_id["sun"]
+    emb9 = emb[word_to_id['key']]
+    desc9 = "key"
+    label9 = word_to_id["key"]
+    test_cases = [emb1, emb2, emb3, emb4, emb5, emb6, emb7, emb8, emb9]
+    test_case_desc = [
+        desc1, desc2, desc3, desc4, desc5, desc6, desc7, desc8, desc9
+    ]
+    test_labels = [
+        label1, label2, label3, label4, label5, label6, label7, label8, label9
+    ]
+    return norm(np.array(test_cases)), test_case_desc, test_labels
def build_test_case(args, emb):
@@ -144,86 +160,80 @@ def build_test_case(args, emb):
        return build_small_test_case(emb)


+def norm(x):
+    y = np.linalg.norm(x, axis=1, keepdims=True)
+    return x / y


def inference_test(scope, model_dir, args):
    BuildWord_IdMap(args.dict_path)
    logger.info("model_dir is: {}".format(model_dir + "/"))
    emb = np.array(scope.find_var("embeding").get_tensor())
+    x = norm(emb)
    logger.info("inference result: ====================")
-    test_cases = list()
+    test_cases = None
+    test_case_desc = list()
    test_labels = list()
    exclude_lists = list()
    if args.test_acc:
-        test_cases, test_labels, exclude_lists = build_test_case(args, emb)
+        test_cases, test_case_desc, test_labels, exclude_lists = build_test_case(
+            args, emb)
    else:
-        test_cases, test_labels = build_test_case(args, emb)
+        test_cases, test_case_desc, test_labels = build_test_case(args, emb)
        exclude_lists = [[-1]]
    accual_rank = 1 if args.test_acc else args.rank_num
    correct_num = 0
+    cosine_similarity_matrix = np.dot(test_cases, x.T)
+    results = topKs(accual_rank, cosine_similarity_matrix, exclude_lists,
+                    args.test_acc)
    for i in range(len(test_labels)):
-        pq = None
-        if args.test_acc:
-            pq = topK(
-                accual_rank,
-                emb,
-                test_cases[i][0],
-                exclude_lists[i],
-                is_acc=True)
-        else:
-            pq = pq = topK(
-                accual_rank,
-                emb,
-                test_cases[i][0],
-                exclude_lists[0],
-                is_acc=False)
-        logger.info("Test result for {}".format(test_cases[i][1]))
+        logger.info("Test result for {}".format(test_case_desc[i]))
+        result = results[i]
        for j in range(accual_rank):
-            pq_tmps = pq.get()
-            if (j == accual_rank - 1) and (
-                    pq_tmps.id == test_labels[i]
-            ):  # if the nearest word is what we want
+            if result[j][1] == test_labels[
+                    i]:  # if the nearest word is what we want
                correct_num += 1
-            logger.info("{} nearest is {}, rate is {}".format(
-                accual_rank - j, id_to_word[pq_tmps.id], pq_tmps.priority))
+            logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
+                result[j][1]], result[j][0]))
-    acc = correct_num / len(test_labels)
-    logger.info("Test acc is: {}, there are {} / {}}".format(acc, correct_num,
-                                                             len(test_labels)))
+    logger.info("Test acc is: {}, there are {} / {}".format(correct_num / len(
+        test_labels), correct_num, len(test_labels)))
-class PQ_Entry(object):
-    def __init__(self, cos_similarity, id):
-        self.priority = cos_similarity
-        self.id = id
-
-    def __cmp__(self, other):
-        return cmp(self.priority, other.priority)
+def topK(k, cosine_similarity_list, exclude_list, is_acc=False):
+    if k == 1 and is_acc:  # accelerate acc calculate
+        max = cosine_similarity_list[0]
+        id = 0
+        for i in range(len(cosine_similarity_list)):
+            if cosine_similarity_list[i] >= max and (i not in exclude_list):
+                max = cosine_similarity_list[i]
+                id = i
+            else:
+                pass
+        return [[max, id]]
+    else:
+        result = list()
+        result_index = np.argpartition(cosine_similarity_list, -k)[-k:]
+        for index in result_index:
+            result.append([cosine_similarity_list[index], index])
+        result.sort(reverse=True)
+        return result


-def topK(k, emb, test_emb, exclude_list, is_acc=False):
-    pq = PriorityQueue(k + 1)
-    while not pq.empty():
-        try:
-            pq.get(False)
-        except Empty:
-            continue
-        pq.task_done()
-
-    if len(emb) <= k:
-        for i in range(len(emb)):
-            x = cosine_similarity([emb[i]], [test_emb])
-            pq.put(PQ_Entry(x, i))
-        return pq
-
-    for i in range(len(emb)):
-        if is_acc and (i in exclude_list):
-            pass
-        else:
-            x = cosine_similarity([emb[i]], [test_emb])
-            pq_e = PQ_Entry(x, i)
-            if pq.full():
-                pq.get()
-            pq.put(pq_e)
-            pq.get()
-    return pq
+def topKs(k, cosine_similarity_matrix, exclude_lists, is_acc=False):
+    results = list()
+    result_queues = list()
+    correct_num = 0
+
+    for i in range(cosine_similarity_matrix.shape[0]):
+        tmp_pq = None
+        if is_acc:
+            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[i],
+                          is_acc)
+        else:
+            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[0],
+                          is_acc)
+        result_queues.append(tmp_pq)
+    return result_queues
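The rewrite above trades the per-word PriorityQueue/sklearn loop for one matrix product over row-normalized embeddings plus `np.argpartition`; a minimal standalone sketch of the same idea (names are illustrative, not this file's code):

```python
import numpy as np

def nearest_words(emb, query, k=5, exclude=()):
    """Top-k cosine neighbours via a normalized dot product and argpartition."""
    emb_n = emb / np.linalg.norm(emb, axis=1, keepdims=True)  # same role as norm()
    query_n = query / np.linalg.norm(query)
    scores = emb_n @ query_n                  # cosine similarity against every word
    scores[list(exclude)] = -np.inf           # mask the analogy's own words
    top = np.argpartition(scores, -k)[-k:]    # unordered top-k in O(vocab)
    return top[np.argsort(-scores[top])]      # order only the k winners

# emb = np.random.rand(1000, 64); nearest_words(emb, emb[3], k=5, exclude=[3])
```

Masking excluded indices with -inf mirrors the exclude_lists handling above: those words can never win the ranking.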
def infer_during_train(args):
@@ -235,8 +245,6 @@ def infer_during_train(args):
    while True:
        time.sleep(60)
        current_list = os.listdir(args.model_output_dir)
-        # logger.info("current_list is : {}".format(current_list))
-        # logger.info("model_file_list is : {}".format(model_file_list))
        if set(model_file_list) == set(current_list):
            if solved_new:
                solved_new = False
@@ -271,6 +279,8 @@ def infer_once(args):
        fluid.io.load_persistables(
            executor=exe, dirname=args.model_output_dir + "/")
        inference_test(Scope, args.model_output_dir, args)
+    else:
+        logger.info("Wrong Directory or save model failed!")


if __name__ == '__main__':
...
@@ -95,8 +95,7 @@ def skip_gram_word2vec(dict_size,
        capacity=64, feed_list=datas, name='py_reader', use_double_buffer=True)

    words = fluid.layers.read_file(py_reader)
-    emb = fluid.layers.embedding(
+    target_emb = fluid.layers.embedding(
        input=words[0],
        is_sparse=is_sparse,
        size=[dict_size, embedding_size],
@@ -104,16 +103,23 @@ def skip_gram_word2vec(dict_size,
            name='embeding',
            initializer=fluid.initializer.Normal(scale=1 /
                                                 math.sqrt(dict_size))))
+    context_emb = fluid.layers.embedding(
+        input=words[1],
+        is_sparse=is_sparse,
+        size=[dict_size, embedding_size],
+        param_attr=fluid.ParamAttr(
+            name='embeding',
+            initializer=fluid.initializer.Normal(scale=1 /
+                                                 math.sqrt(dict_size))))
    cost, cost_nce, cost_hs = None, None, None

    if with_nce:
-        cost_nce = nce_layer(emb, words[1], embedding_size, dict_size, 5,
+        cost_nce = nce_layer(target_emb, words[1], embedding_size, dict_size, 5,
                             "uniform", word_frequencys, None)
        cost = cost_nce
    if with_hsigmoid:
-        cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
-                                 is_sparse)
+        cost_hs = hsigmoid_layer(context_emb, words[0], words[2], words[3],
+                                 dict_size, is_sparse)
        cost = cost_hs
    if with_nce and with_hsigmoid:
        cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
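As a rough, framework-free illustration of what the NCE branch above optimizes (a simplified negative-sampling stand-in, not the fluid graph itself; the 5 passed to nce_layer is presumably the noise-sample count):

```python
import numpy as np

def neg_sampling_loss(target_vec, context_vec, noise_vecs):
    """Pull (target, context) together, push sampled noise words away."""
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    pos = np.log(sigmoid(np.dot(target_vec, context_vec)))
    neg = sum(np.log(sigmoid(-np.dot(target_vec, n))) for n in noise_vecs)
    return -(pos + neg)

# toy usage: 64-d vectors, 5 noise samples
loss = neg_sampling_loss(np.random.rand(64), np.random.rand(64), np.random.rand(5, 64))
```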
...
@@ -3,6 +3,7 @@
import re
import six
import argparse
+import io

prog = re.compile("[^a-z ]", flags=0)
word_count = dict()
@@ -83,7 +84,6 @@ def native_to_unicode(s):
        return _to_unicode(s)
    except UnicodeDecodeError:
        res = _to_unicode(s, ignore_errors=True)
-        tf.logging.info("Ignoring Unicode error, outputting: %s" % res)
        return res
@@ -199,34 +199,30 @@ def preprocess(args):

    # word to count
    if args.with_other_dict:
-        with open(args.other_dict_path, 'r') as f:
+        with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                word_count[native_to_unicode(line.strip())] = 1

    if args.is_local:
        for i in range(1, 100):
-            with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
-                    i)) as f:
+            with io.open(
+                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                    encoding='utf-8') as f:
                for line in f:
                    line = strip_lines(line)
                    words = line.split()
+                    if args.with_other_dict:
                        for item in words:
                            if item in word_count:
                                word_count[item] = word_count[item] + 1
                            else:
                                word_count[native_to_unicode('<UNK>')] += 1
-    # with open(args.data_path + "/tmp.txt") as f:
-    #     for line in f:
-    #         print("line before strip is: {}".format(line))
-    #         line = strip_lines(line, word_count)
-    #         print("line after strip is: {}".format(line))
-    #         words = line.split()
-    #         print("words after split is: {}".format(words))
-    #         for item in words:
-    #             if item in word_count:
-    #                 word_count[item] = word_count[item] + 1
-    #             else:
-    #                 word_count[item] = 1
+                    else:
+                        for item in words:
+                            if item in word_count:
+                                word_count[item] = word_count[item] + 1
+                            else:
+                                word_count[item] = 1
    item_to_remove = []
    for item in word_count:
        if word_count[item] <= args.freq:
@@ -236,21 +232,17 @@ def preprocess(args):
    path_table, path_code, word_code_len = build_Huffman(word_count, 40)

-    with open(args.dict_path, 'w+') as f:
+    with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
        for k, v in word_count.items():
-            f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+            f.write(k + " " + str(v) + '\n')

-    with open(args.dict_path + "_ptable", 'w+') as f2:
+    with io.open(args.dict_path + "_ptable", 'w+', encoding='utf-8') as f2:
        for pk, pv in path_table.items():
-            f2.write(
-                pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
-                                                      for x in pv)) + '\n')
+            f2.write(pk + '\t' + ' '.join((str(x) for x in pv)) + '\n')

-    with open(args.dict_path + "_pcode", 'w+') as f3:
+    with io.open(args.dict_path + "_pcode", 'w+', encoding='utf-8') as f3:
        for pck, pcv in path_code.items():
-            f3.write(
-                pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
-                                                       for x in pcv)) + '\n')
+            f3.write(pck + '\t' + ' '.join((str(x) for x in pcv)) + '\n')
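The three files written above share a simple line format that reader.py later parses with `np.fromstring(..., sep=' ')`: the dict file stores `word count`, while `_ptable` and `_pcode` store `word<TAB>space-separated integers` (one Huffman path or code per word). A hedged round-trip sketch with made-up values, outside of preprocess.py itself:

```python
import io
import numpy as np

# write one entry the way the _ptable/_pcode files are written above
with io.open("demo_ptable", "w", encoding="utf-8") as f:
    path = [3, 1, 0, 4]  # made-up Huffman path (inner-node ids) for one word
    f.write(u"deep" + u"\t" + u" ".join(str(x) for x in path) + u"\n")

# read it back the way reader.py does
with io.open("demo_ptable", "r", encoding="utf-8") as f:
    for line in f:
        word = line.split("\t")[0]
        ints = np.fromstring(line.split("\t")[1], dtype=int, sep=" ")
        print(word, ints)  # deep [3 1 0 4]
```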
if __name__ == "__main__":
...
@@ -2,14 +2,32 @@
import numpy as np
import preprocess
import logging
+import io

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
+class NumpyRandomInt(object):
+    def __init__(self, a, b, buf_size=1000):
+        self.idx = 0
+        self.buffer = np.random.random_integers(a, b, buf_size)
+        self.a = a
+        self.b = b
+
+    def __call__(self):
+        if self.idx == len(self.buffer):
+            self.buffer = np.random.random_integers(self.a, self.b,
+                                                    len(self.buffer))
+            self.idx = 0
+        result = self.buffer[self.idx]
+        self.idx += 1
+        return result
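The buffered generator above exists because asking NumPy for a single random integer inside the hot reader loop is expensive; drawing a block of values up front and handing them out one by one amortizes that cost. A rough standalone timing sketch of the difference (numbers will vary by machine):

```python
import time
import numpy as np

N = 100000

start = time.time()
for _ in range(N):                         # one RNG call per draw
    np.random.randint(1, 6)
per_call = time.time() - start

start = time.time()
buf = np.random.random_integers(1, 5, N)   # one buffered draw (same call the class uses)
drawn = [int(x) for x in buf]              # hand the values out one by one
buffered = time.time() - start

print(per_call, buffered)  # the buffered variant is typically much faster
```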
class Word2VecReader(object):
    def __init__(self,
                 dict_path,
@@ -24,6 +42,7 @@ class Word2VecReader(object):
        self.num_non_leaf = 0
        self.word_to_id_ = dict()
        self.id_to_word = dict()
+        self.word_count = dict()
        self.word_to_path = dict()
        self.word_to_code = dict()
        self.trainer_id = trainer_id
@@ -33,20 +52,19 @@ class Word2VecReader(object):
        word_counts = []
        word_id = 0

-        with open(dict_path, 'r') as f:
+        with io.open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
-                line = line.decode(encoding='UTF-8')
                word, count = line.split()[0], int(line.split()[1])
+                self.word_count[word] = count
                self.word_to_id_[word] = word_id
                self.id_to_word[word_id] = word  #build id to word dict
                word_id += 1
                word_counts.append(count)
                word_all_count += count

-        with open(dict_path + "_word_to_id_", 'w+') as f6:
+        with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
            for k, v in self.word_to_id_.items():
-                f6.write(
-                    k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+                f6.write(k + " " + str(v) + '\n')

        self.dict_size = len(self.word_to_id_)
        self.word_frequencys = [
@@ -55,22 +73,22 @@ class Word2VecReader(object):
        print("dict_size = " + str(
            self.dict_size)) + " word_all_count = " + str(word_all_count)

-        with open(dict_path + "_ptable", 'r') as f2:
+        with io.open(dict_path + "_ptable", 'r', encoding='utf-8') as f2:
            for line in f2:
-                self.word_to_path[line.split("\t")[0]] = np.fromstring(
+                self.word_to_path[line.split('\t')[0]] = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')
                self.num_non_leaf = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')[0]
        print("word_ptable dict_size = " + str(len(self.word_to_path)))

-        with open(dict_path + "_pcode", 'r') as f3:
+        with io.open(dict_path + "_pcode", 'r', encoding='utf-8') as f3:
            for line in f3:
-                line = line.decode(encoding='UTF-8')
-                self.word_to_code[line.split("\t")[0]] = np.fromstring(
+                self.word_to_code[line.split('\t')[0]] = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')
        print("word_pcode dict_size = " + str(len(self.word_to_code)))
+        self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)

-    def get_context_words(self, words, idx, window_size):
+    def get_context_words(self, words, idx):
        """
        Get the context word list of target word.
@@ -78,31 +96,34 @@ class Word2VecReader(object):
        idx: input word index
        window_size: window size
        """
-        target_window = np.random.randint(1, window_size + 1)
+        target_window = self.random_generator()
-        # need to keep in mind that maybe there are no enough words before the target word.
-        start_point = idx - target_window if (idx - target_window) > 0 else 0
+        start_point = idx - target_window  # if (idx - target_window) > 0 else 0
+        if start_point < 0:
+            start_point = 0
        end_point = idx + target_window
-        # context words of the target word
-        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
-        return list(targets)
+        targets = words[start_point:idx] + words[idx + 1:end_point + 1]
+        return set(targets)
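Each call now draws its window radius from the buffered generator, so nearer neighbours are sampled more often than distant ones, which is the usual skip-gram window trick. (Note that np.random.randint excludes its upper bound while np.random.random_integers includes it, so NumpyRandomInt(1, window_size_ + 1) can return a radius of window_size_ + 1.) A standalone sketch of the same windowing logic, using plain randint for clarity:

```python
import numpy as np

def context_words(words, idx, window_size):
    """Words inside a randomly shrunk window around position idx."""
    radius = np.random.randint(1, window_size + 1)   # radius in [1, window_size]
    start = max(idx - radius, 0)
    end = idx + radius
    return set(words[start:idx] + words[idx + 1:end + 1])

print(context_words(["the", "quick", "brown", "fox", "jumps"], 2, 2))
# e.g. {'quick', 'fox'} or {'the', 'quick', 'fox', 'jumps'}
```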
    def train(self, with_hs):
        def _reader():
            for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                    logger.info("running data in {}".format(self.data_path_ +
                                                            "/" + file))
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_
                            ]
                            for idx, target_id in enumerate(word_ids):
                                context_word_ids = self.get_context_words(
-                                    word_ids, idx, self.window_size_)
+                                    word_ids, idx)
                                for context_id in context_word_ids:
                                    yield [target_id], [context_id]
                        else:
@@ -111,26 +132,28 @@ class Word2VecReader(object):
        def _reader_hs():
            for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                    logger.info("running data in {}".format(self.data_path_ +
                                                            "/" + file))
                    count = 1
                    for line in f:
                        if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                            word_ids = [
                                self.word_to_id_[word] for word in line.split()
                                if word in self.word_to_id_
                            ]
                            for idx, target_id in enumerate(word_ids):
                                context_word_ids = self.get_context_words(
-                                    word_ids, idx, self.window_size_)
+                                    word_ids, idx)
                                for context_id in context_word_ids:
                                    yield [target_id], [context_id], [
-                                        self.word_to_code[self.id_to_word[
+                                        self.word_to_path[self.id_to_word[
                                            target_id]]
                                    ], [
-                                        self.word_to_path[self.id_to_word[
+                                        self.word_to_code[self.id_to_word[
                                            target_id]]
                                    ]
                        else:
@@ -144,13 +167,20 @@ class Word2VecReader(object):

if __name__ == "__main__":
-    window_size = 10
+    window_size = 5

-    reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
+    reader = Word2VecReader(
+        "./data/1-billion_dict",
+        "./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/",
+        ["news.en-00001-of-00100"], 0, 1)

    i = 0
-    for x, y in reader.train()():
+    # print(reader.train(True))
+    for x, y, z, f in reader.train(True)():
        print("x: " + str(x))
        print("y: " + str(y))
+        print("path: " + str(z))
+        print("code: " + str(f))
        print("\n")
        if i == 10:
            exit(0)
...
@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
@@ -29,7 +29,7 @@ def parse_args():
        '--train_data_path',
        type=str,
        default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
-        help="The path of training dataset")
+        help="The path of taining dataset")
    parser.add_argument(
        '--dict_path',
        type=str,
@@ -43,7 +43,7 @@ def parse_args():
    parser.add_argument(
        '--batch_size',
        type=int,
-        default=100,
+        default=1000,
        help="The size of mini-batch (default:100)")
    parser.add_argument(
        '--num_passes',
@@ -125,14 +125,44 @@ def parse_args():
    return parser.parse_args()
+def convert_python_to_tensor(batch_size, sample_reader, is_hs):
+    def __reader__():
+        result = None
+        if is_hs:
+            result = [[], [], [], []]
+        else:
+            result = [[], []]
+        for sample in sample_reader():
+            for i, fea in enumerate(sample):
+                result[i].append(fea)
+            if len(result[0]) == batch_size:
+                tensor_result = []
+                for tensor in result:
+                    t = fluid.Tensor()
+                    dat = np.array(tensor, dtype='int64')
+                    if len(dat.shape) > 2:
+                        dat = dat.reshape((dat.shape[0], dat.shape[2]))
+                    elif len(dat.shape) == 1:
+                        dat = dat.reshape((-1, 1))
+                    t.set(dat, fluid.CPUPlace())
+                    tensor_result.append(t)
+                yield tensor_result
+                if is_hs:
+                    result = [[], [], [], []]
+                else:
+                    result = [[], []]
+
+    return __reader__
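The provider above packs `batch_size` Python samples into one int64 tensor per slot (two slots for NCE, four when hierarchical sigmoid adds the Huffman path and code) and feeds them to the py_reader directly, which is what replaces the paddle.batch pipeline below. A framework-free sketch of just the batching step (numpy only, no fluid types; names are illustrative):

```python
import numpy as np

def batch_slots(sample_reader, batch_size):
    """Group per-sample slot lists into per-slot (batch, width) int64 arrays."""
    slots = None
    for sample in sample_reader():
        if slots is None:
            slots = [[] for _ in sample]          # one bucket per slot
        for i, fea in enumerate(sample):
            slots[i].append(fea)
        if len(slots[0]) == batch_size:
            yield [np.array(s, dtype='int64').reshape(batch_size, -1) for s in slots]
            slots = [[] for _ in sample]

# toy two-slot samples, shaped like ([target_id], [context_id])
demo = lambda: (([i], [i + 1]) for i in range(6))
for batch in batch_slots(demo, batch_size=3):
    print([b.shape for b in batch])  # [(3, 1), (3, 1)] per batch
```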
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            reader.train((args.with_hs or (not args.with_nce))),
-            buf_size=args.batch_size * 100),
-        batch_size=args.batch_size)
-
-    py_reader.decorate_paddle_reader(train_reader)
+    py_reader.decorate_tensor_provider(
+        convert_python_to_tensor(args.batch_size,
+                                 reader.train((args.with_hs or (
+                                     not args.with_nce))), (args.with_hs or (
+                                         not args.with_nce))))

    place = fluid.CPUPlace()
@@ -140,6 +170,7 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.use_experimental_executor = True

    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
    exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
@@ -161,32 +192,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    profiler_step_end = 30

    for pass_id in range(args.num_passes):
+        epoch_start = time.time()
        py_reader.start()
-        time.sleep(10)
-        epoch_start = time.time()
        batch_id = 0
        start = time.clock()

        try:
            while True:
-                if profiler_step == profiler_step_start:
-                    fluid.profiler.start_profiler(profile_state)
-
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

-                if profiler_step == profiler_step_end:
-                    fluid.profiler.stop_profiler('total', 'trainer_profile.log')
-                    profiler_step += 1
-                else:
-                    profiler_step += 1
-
                if batch_id % 50 == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id,
-                               loss_val.mean() / args.batch_size,
-                               py_reader.queue.size()))
+                               loss_val.mean(), py_reader.queue.size()))
                if args.with_speed:
                    if batch_id % 1000 == 0 and batch_id != 0:
                        elapsed = (time.clock() - start)
@@ -256,7 +278,7 @@ def train(args):

    optimizer = None
    if args.with_Adam:
-        optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
+        optimizer = fluid.optimizer.Adam(learning_rate=1e-4, lazy_mode=True)
    else:
        optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
...