Commit ac301305 authored by JiabinYang

add custom random generator and refine code

Parent 4780f758
@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
If you would like to use our supported third-party vocab, please run:
```bash
wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
```
## Model
This model implements the skip-gram model of word2vec.
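In skip-gram, each center word is used to predict the words in a randomly shrunken window around it. The sketch below is illustrative only (`gen_pairs` is not part of this repo) and shows the core pair generation:

```python
# Illustrative skip-gram (target, context) pair generation.
import random

def gen_pairs(word_ids, window_size):
    for idx, target in enumerate(word_ids):
        win = random.randint(1, window_size)  # dynamic window, as in word2vec
        start = max(idx - win, 0)
        context = word_ids[start:idx] + word_ids[idx + 1:idx + win + 1]
        for ctx in set(context):
            yield target, ctx

print(list(gen_pairs([0, 1, 2, 3, 4], window_size=2)))
```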
@@ -26,6 +31,7 @@ Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
If you would like to use our supported third-party vocab, please set
## Train
The command line options for training can be listed by `python train.py -h`.
......
#!/bin/bash
wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
@@ -6,7 +6,6 @@ from Queue import PriorityQueue
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity
word_to_id = dict()
id_to_word = dict()
@@ -89,11 +88,8 @@ def build_test_case_from_file(args, emb):
exclude_lists = list()
for file_dir in current_list:
with open(args.test_files_dir + "/" + file_dir, 'r') as f:
count = 0
for line in f:
if count == 0:
pass
elif ':' in line:
if ':' in line:
logger.info("{}".format(line))
pass
else:
@@ -110,7 +106,6 @@ def build_test_case_from_file(args, emb):
word_to_id[line.split()[0]],
word_to_id[line.split()[1]], word_to_id[line.split()[2]]
])
count += 1
test_cases = norm(np.array(test_cases))
return test_cases, test_case_descs, test_labels, exclude_lists
@@ -151,8 +146,8 @@ def build_test_case(args, emb):
def norm(x):
emb = np.linalg.norm(x, axis=1, keepdims=True)
return x / emb
y = np.linalg.norm(x, axis=1, keepdims=True)
return x / y
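For reference, `norm` L2-normalizes each embedding row, so dot products between normalized rows are cosine similarities (presumably why the `sklearn` `cosine_similarity` import above was dropped). A quick sanity check with toy values:

```python
import numpy as np

x = np.array([[3.0, 4.0], [1.0, 0.0]])
y = np.linalg.norm(x, axis=1, keepdims=True)  # per-row L2 norms
print(np.linalg.norm(x / y, axis=1))  # -> [1. 1.]
```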
def inference_test(scope, model_dir, args):
@@ -180,9 +175,8 @@ def inference_test(scope, model_dir, args):
logger.info("Test result for {}".format(test_case_desc[i]))
result = results[i]
for j in range(accual_rank):
if (j == accual_rank - 1) and (
result[j][1] == test_labels[i]
): # if the nearest word is what we want
                if result[j][1] == test_labels[i]:  # a hit if the expected word appears among the top results
correct_num += 1
logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
result[j][1]], result[j][0]))
@@ -296,6 +290,8 @@ def infer_once(args):
fluid.io.load_persistables(
executor=exe, dirname=args.model_output_dir + "/")
inference_test(Scope, args.model_output_dir, args)
else:
logger.info("Wrong Directory or save model failed!")
if __name__ == '__main__':
......
@@ -238,13 +238,13 @@ def preprocess(args):
with open(args.dict_path + "_ptable", 'w+') as f2:
for pk, pv in path_table.items():
f2.write(
pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
pk.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
for x in pv)) + '\n')
with open(args.dict_path + "_pcode", 'w+') as f3:
for pck, pcv in path_code.items():
f3.write(
pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
pck.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
for x in pcv)) + '\n')
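Each written line is thus a UTF-8 word, a tab, then space-separated integers (the tree path and sign code used by hierarchical softmax). A hypothetical round-trip of one such line, mirroring the `np.fromstring` parsing in the reader below:

```python
import numpy as np

line = u"hello\t0 3 7 12"  # assumed sample _ptable line
word, ids = line.split('\t')
print(word, np.fromstring(ids, dtype=int, sep=' '))  # -> hello [ 0  3  7 12]
```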
......
@@ -10,6 +10,24 @@ logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
class NumpyRandomInt(object):
    """Buffered random-integer source: draws buf_size samples at once and
    hands them out one by one, refilling when exhausted, to amortize the
    per-call overhead of numpy's RNG."""

    def __init__(self, a, b, buf_size=1000):
        self.idx = 0
        # np.random.randint samples from [a, b), matching the semantics of
        # the np.random.randint(1, window_size + 1) call this replaces
        # (np.random.random_integers would include b and is deprecated).
        self.buffer = np.random.randint(a, b, buf_size)
        self.a = a
        self.b = b

    def __call__(self):
        if self.idx == len(self.buffer):
            self.buffer = np.random.randint(self.a, self.b,
                                            len(self.buffer))
            self.idx = 0
        result = self.buffer[self.idx]
        self.idx += 1
        return result
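A short usage sketch (assumed values): drawing a buffer of integers at once amortizes RNG overhead, which matters because a window size is sampled for every target word in the corpus:

```python
window_size = 5
rng = NumpyRandomInt(1, window_size + 1, buf_size=8)
samples = [int(rng()) for _ in range(20)]  # refills the buffer twice
assert all(1 <= s <= window_size for s in samples)
```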
class Word2VecReader(object):
def __init__(self,
dict_path,
@@ -57,7 +75,7 @@ class Word2VecReader(object):
with open(dict_path + "_ptable", 'r') as f2:
for line in f2:
self.word_to_path[line.split("\t")[0]] = np.fromstring(
self.word_to_path[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
self.num_non_leaf = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')[0]
@@ -66,11 +84,12 @@ class Word2VecReader(object):
with open(dict_path + "_pcode", 'r') as f3:
for line in f3:
line = line.decode(encoding='UTF-8')
self.word_to_code[line.split("\t")[0]] = np.fromstring(
self.word_to_code[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
print("word_pcode dict_size = " + str(len(self.word_to_code)))
self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)
def get_context_words(self, words, idx, window_size):
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
@@ -78,13 +97,14 @@
idx: input word index
"""
target_window = np.random.randint(1, window_size + 1)
        # keep in mind that there may not be enough words before the target word.
start_point = idx - target_window if (idx - target_window) > 0 else 0
target_window = self.random_generator()
        # there may not be enough words before the target word
        start_point = idx - target_window
        if start_point < 0:
            start_point = 0
end_point = idx + target_window
# context words of the target word
targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
return list(targets)
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return set(targets)
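For a toy input (illustrative, with the sampled window fixed at 2 instead of drawn from the generator):

```python
words, idx, target_window = [10, 11, 12, 13, 14], 2, 2
start_point = max(idx - target_window, 0)
end_point = idx + target_window
print(set(words[start_point:idx] + words[idx + 1:end_point + 1]))
# -> {10, 11, 13, 14}
```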
def train(self, with_hs):
def _reader():
@@ -102,7 +122,7 @@ class Word2VecReader(object):
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id]
else:
@@ -124,13 +144,13 @@
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id], [
self.word_to_code[self.id_to_word[
self.word_to_path[self.id_to_word[
target_id]]
], [
self.word_to_path[self.id_to_word[
self.word_to_code[self.id_to_word[
target_id]]
]
else:
......
@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
@@ -29,7 +29,7 @@ def parse_args():
'--train_data_path',
type=str,
default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
help="The path of training dataset")
help="The path of taining dataset")
parser.add_argument(
'--dict_path',
type=str,
@@ -43,7 +43,7 @@ def parse_args():
parser.add_argument(
'--batch_size',
type=int,
default=100,
default=1000,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--num_passes',
@@ -125,9 +125,13 @@ def parse_args():
return parser.parse_args()
def convert_python_to_tensor(batch_size, sample_reader):
def convert_python_to_tensor(batch_size, sample_reader, is_hs):
def __reader__():
result = [[], [], [], []]
        result = [[], [], [], []] if is_hs else [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
@@ -145,24 +149,21 @@ def convert_python_to_tensor(batch_size, sample_reader):
tensor_result.append(t)
yield tensor_result
result = [[], [], [], []]
            result = [[], [], [], []] if is_hs else [[], []]
return __reader__
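For intuition, here is a plain-Python sketch (illustrative only, without the LoDTensor conversion) of the column-wise batching `__reader__` performs: each sample's fields are appended to per-field columns, a batch is yielded once `batch_size` samples have accumulated, and the columns are reset:

```python
def batch_columns(samples, batch_size, num_fields):
    result = [[] for _ in range(num_fields)]
    for sample in samples:
        for i, fea in enumerate(sample):
            result[i].append(fea)
        if len(result[0]) == batch_size:
            yield result
            result = [[] for _ in range(num_fields)]

pairs = [([1], [2]), ([3], [4]), ([5], [6]), ([7], [8])]
for batch in batch_columns(pairs, batch_size=2, num_fields=2):
    print(batch)  # [[[1], [3]], [[2], [4]]] then [[[5], [7]], [[6], [8]]]
```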
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
# train_reader = paddle.batch(
# paddle.reader.shuffle(
# reader.train((args.with_hs or (not args.with_nce))),
# buf_size=args.batch_size * 100),
# batch_size=args.batch_size)
# py_reader.decorate_paddle_reader(train_reader)
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce)))))
not args.with_nce))), (args.with_hs or (
not args.with_nce))))
place = fluid.CPUPlace()
@@ -192,32 +193,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
profiler_step_end = 30
for pass_id in range(args.num_passes):
epoch_start = time.time()
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.clock()
try:
while True:
if profiler_step == profiler_step_start:
fluid.profiler.start_profiler(profile_state)
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if profiler_step == profiler_step_end:
fluid.profiler.stop_profiler('total', 'trainer_profile.log')
profiler_step += 1
else:
profiler_step += 1
if batch_id % 50 == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 1000 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
......