Commit ac301305 authored by: J JiabinYang

add customized random and refine code

Parent 4780f758
@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
+If you would like to use our supported third-party vocab, please run:
+```bash
+wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
+```
## Model
This model implements the skip-gram model of word2vec.
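For illustration, here is a minimal, self-contained sketch (all ids and names below are made up) of how skip-gram training pairs are formed: every word inside a randomly sized window around a target word becomes one (target, context) pair.

```python
import random

def skip_gram_pairs(word_ids, max_window=5):
    """Return (target, context) id pairs, one pair per context word."""
    pairs = []
    for idx, target in enumerate(word_ids):
        window = random.randint(1, max_window)       # per-target window size
        start = max(idx - window, 0)                  # clamp at sentence start
        context = word_ids[start:idx] + word_ids[idx + 1:idx + window + 1]
        pairs.extend((target, c) for c in context)
    return pairs

# Example: a toy "sentence" of word ids
print(skip_gram_pairs([3, 7, 1, 9, 4], max_window=2))
```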
@@ -26,6 +31,7 @@ Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
+If you would like to use our supported third-party vocab, please set the corresponding preprocessing option.
## Train
The command-line options for training can be listed by `python train.py -h`.
......
#!/bin/bash
wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
@@ -6,7 +6,6 @@ from Queue import PriorityQueue
import logging
import argparse
import preprocess
-from sklearn.metrics.pairwise import cosine_similarity

word_to_id = dict()
id_to_word = dict()
@@ -89,11 +88,8 @@ def build_test_case_from_file(args, emb):
    exclude_lists = list()
    for file_dir in current_list:
        with open(args.test_files_dir + "/" + file_dir, 'r') as f:
-            count = 0
            for line in f:
-                if count == 0:
-                    pass
-                elif ':' in line:
+                if ':' in line:
                    logger.info("{}".format(line))
                    pass
                else:
@@ -110,7 +106,6 @@ def build_test_case_from_file(args, emb):
                        word_to_id[line.split()[0]],
                        word_to_id[line.split()[1]], word_to_id[line.split()[2]]
                    ])
-                count += 1
    test_cases = norm(np.array(test_cases))
    return test_cases, test_case_descs, test_labels, exclude_lists
@@ -151,8 +146,8 @@ def build_test_case(args, emb):

def norm(x):
-    emb = np.linalg.norm(x, axis=1, keepdims=True)
-    return x / emb
+    y = np.linalg.norm(x, axis=1, keepdims=True)
+    return x / y


def inference_test(scope, model_dir, args):
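As a side note on the removed `cosine_similarity` import above: once the embedding rows are normalized by `norm`, a plain dot product already gives cosine similarity, so no sklearn dependency is needed. A small illustrative check (made-up numbers):

```python
import numpy as np

def norm(x):
    # Normalize each row to unit L2 length.
    return x / np.linalg.norm(x, axis=1, keepdims=True)

emb = np.array([[1.0, 2.0, 0.5],
                [0.3, 0.1, 2.0],
                [2.0, 4.0, 1.0]])      # made-up "embeddings"
unit = norm(emb)

# Cosine similarity of row 0 against all rows is now just a matrix-vector product.
print(unit.dot(unit[0]))               # row 2 is a scaled copy of row 0 -> similarity 1.0
```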
@@ -180,9 +175,8 @@ def inference_test(scope, model_dir, args):
            logger.info("Test result for {}".format(test_case_desc[i]))
            result = results[i]
            for j in range(accual_rank):
-                if (j == accual_rank - 1) and (
-                        result[j][1] == test_labels[i]
-                ):  # if the nearest word is what we want
+                if result[j][1] == test_labels[
+                        i]:  # if the nearest word is what we want
                    correct_num += 1
                logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
                    result[j][1]], result[j][0]))
@@ -296,6 +290,8 @@ def infer_once(args):
        fluid.io.load_persistables(
            executor=exe, dirname=args.model_output_dir + "/")
        inference_test(Scope, args.model_output_dir, args)
+    else:
+        logger.info("Wrong Directory or save model failed!")


if __name__ == '__main__':
......
@@ -238,13 +238,13 @@ def preprocess(args):
    with open(args.dict_path + "_ptable", 'w+') as f2:
        for pk, pv in path_table.items():
            f2.write(
-                pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
+                pk.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
                                                      for x in pv)) + '\n')
    with open(args.dict_path + "_pcode", 'w+') as f3:
        for pck, pcv in path_code.items():
            f3.write(
-                pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
+                pck.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
                                                       for x in pcv)) + '\n')
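Purely for orientation, a sketch of the on-disk format these two loops produce: each line is a word, a tab, and space-joined integers (assumed here to be tree-path node ids and code bits for hierarchical softmax). The words and numbers below are hypothetical:

```python
# Hypothetical example of the format written above.
path_table = {"apple": [5, 2, 0]}   # made-up non-leaf node ids on the tree path
path_code = {"apple": [1, 0, 1]}    # made-up left/right code bits along that path

with open("demo_ptable", "w") as f2:
    for pk, pv in path_table.items():
        f2.write(pk + "\t" + " ".join(str(x) for x in pv) + "\n")
# demo_ptable now contains the single line: "apple\t5 2 0"
```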
......
@@ -10,6 +10,24 @@ logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)

+class NumpyRandomInt(object):
+    def __init__(self, a, b, buf_size=1000):
+        self.idx = 0
+        self.buffer = np.random.random_integers(a, b, buf_size)
+        self.a = a
+        self.b = b
+
+    def __call__(self):
+        if self.idx == len(self.buffer):
+            self.buffer = np.random.random_integers(self.a, self.b,
+                                                     len(self.buffer))
+            self.idx = 0
+        result = self.buffer[self.idx]
+        self.idx += 1
+        return result
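A rough, self-contained sketch (not from the repo; timings are machine-dependent) of why a buffered generator like `NumpyRandomInt` is used: drawing one NumPy random integer per context window pays per-call overhead, while refilling a small buffer amortizes it. Note that `np.random.random_integers(a, b, n)` samples inclusively from `[a, b]`; newer NumPy deprecates it in favor of `np.random.randint(a, b + 1, n)`.

```python
import time
import numpy as np

N = 200000

# One random integer per call (what get_context_words did before this commit).
t0 = time.time()
vals_slow = [np.random.randint(1, 6) for _ in range(N)]
t_slow = time.time() - t0

# Buffered draws, refilled in chunks of 1000 (what NumpyRandomInt does).
t0 = time.time()
buf, idx, vals_fast = np.random.randint(1, 6, 1000), 0, []
for _ in range(N):
    if idx == len(buf):
        buf, idx = np.random.randint(1, 6, 1000), 0
    vals_fast.append(buf[idx])
    idx += 1
t_fast = time.time() - t0

print("per-call: %.2fs, buffered: %.2fs" % (t_slow, t_fast))
```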
class Word2VecReader(object):
    def __init__(self,
                 dict_path,
@@ -57,7 +75,7 @@ class Word2VecReader(object):
        with open(dict_path + "_ptable", 'r') as f2:
            for line in f2:
-                self.word_to_path[line.split("\t")[0]] = np.fromstring(
+                self.word_to_path[line.split('\t')[0]] = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')
                self.num_non_leaf = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')[0]
@@ -66,11 +84,12 @@ class Word2VecReader(object):
        with open(dict_path + "_pcode", 'r') as f3:
            for line in f3:
                line = line.decode(encoding='UTF-8')
-                self.word_to_code[line.split("\t")[0]] = np.fromstring(
+                self.word_to_code[line.split('\t')[0]] = np.fromstring(
                    line.split('\t')[1], dtype=int, sep=' ')
        print("word_pcode dict_size = " + str(len(self.word_to_code)))
+        self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)

-    def get_context_words(self, words, idx, window_size):
+    def get_context_words(self, words, idx):
        """
        Get the context word list of target word.
@@ -78,13 +97,14 @@ class Word2VecReader(object):
        idx: input word index
        window_size: window size
        """
-        target_window = np.random.randint(1, window_size + 1)
-        # need to keep in mind that maybe there are no enough words before the target word.
-        start_point = idx - target_window if (idx - target_window) > 0 else 0
+        target_window = self.random_generator()
+        start_point = idx - target_window  # if (idx - target_window) > 0 else 0
+        if start_point < 0:
+            start_point = 0
        end_point = idx + target_window
-        # context words of the target word
-        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
-        return list(targets)
+        targets = words[start_point:idx] + words[idx + 1:end_point + 1]
+        return set(targets)

    def train(self, with_hs):
        def _reader():
@@ -102,7 +122,7 @@ class Word2VecReader(object):
                        ]
                        for idx, target_id in enumerate(word_ids):
                            context_word_ids = self.get_context_words(
-                                word_ids, idx, self.window_size_)
+                                word_ids, idx)
                            for context_id in context_word_ids:
                                yield [target_id], [context_id]
                else:
@@ -124,13 +144,13 @@ class Word2VecReader(object):
                        ]
                        for idx, target_id in enumerate(word_ids):
                            context_word_ids = self.get_context_words(
-                                word_ids, idx, self.window_size_)
+                                word_ids, idx)
                            for context_id in context_word_ids:
                                yield [target_id], [context_id], [
-                                    self.word_to_code[self.id_to_word[
+                                    self.word_to_path[self.id_to_word[
                                        target_id]]
                                ], [
-                                    self.word_to_path[self.id_to_word[
+                                    self.word_to_code[self.id_to_word[
                                        target_id]]
                                ]
                else:
......
@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
@@ -29,7 +29,7 @@ def parse_args():
        '--train_data_path',
        type=str,
        default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
-        help="The path of training dataset")
+        help="The path of taining dataset")
    parser.add_argument(
        '--dict_path',
        type=str,
@@ -43,7 +43,7 @@ def parse_args():
    parser.add_argument(
        '--batch_size',
        type=int,
-        default=100,
+        default=1000,
        help="The size of mini-batch (default:100)")
    parser.add_argument(
        '--num_passes',
@@ -125,9 +125,13 @@ def parse_args():
    return parser.parse_args()


-def convert_python_to_tensor(batch_size, sample_reader):
+def convert_python_to_tensor(batch_size, sample_reader, is_hs):
    def __reader__():
-        result = [[], [], [], []]
+        result = None
+        if is_hs:
+            result = [[], [], [], []]
+        else:
+            result = [[], []]
        for sample in sample_reader():
            for i, fea in enumerate(sample):
                result[i].append(fea)
@@ -145,24 +149,21 @@ def convert_python_to_tensor(batch_size, sample_reader):
                    tensor_result.append(t)
                yield tensor_result
-               result = [[], [], [], []]
+               if is_hs:
+                   result = [[], [], [], []]
+               else:
+                   result = [[], []]

    return __reader__
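For orientation, an illustrative plain-Python sketch (no fluid tensors) of the batching pattern used by `convert_python_to_tensor`: samples are accumulated column-wise, with four slots (target, context, path, code) when hierarchical softmax is enabled and two otherwise, and a batch is emitted every `batch_size` samples:

```python
def batch_columns(sample_reader, batch_size, num_slots):
    """Group per-sample fields into column-wise batches of batch_size samples."""
    result = [[] for _ in range(num_slots)]
    for sample in sample_reader():
        for i, fea in enumerate(sample):
            result[i].append(fea)
        if len(result[0]) == batch_size:
            yield result                 # train.py converts each column to a Tensor here
            result = [[] for _ in range(num_slots)]

# Toy usage: two slots (no hierarchical softmax), batch size 3.
def toy_samples():
    for t, c in [(1, 2), (1, 3), (4, 2), (4, 5), (6, 1), (6, 7)]:
        yield [t], [c]

for batch in batch_columns(toy_samples, 3, 2):
    print(batch)
```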
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
-    # train_reader = paddle.batch(
-    #     paddle.reader.shuffle(
-    #         reader.train((args.with_hs or (not args.with_nce))),
-    #         buf_size=args.batch_size * 100),
-    #     batch_size=args.batch_size)
-    # py_reader.decorate_paddle_reader(train_reader)
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(args.batch_size,
                                 reader.train((args.with_hs or (
-                                     not args.with_nce)))))
+                                     not args.with_nce))), (args.with_hs or (
+                                         not args.with_nce))))

    place = fluid.CPUPlace()
@@ -192,32 +193,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
    profiler_step_end = 30

    for pass_id in range(args.num_passes):
-        epoch_start = time.time()
        py_reader.start()
-        time.sleep(10)
+        epoch_start = time.time()
        batch_id = 0
        start = time.clock()
        try:
            while True:
-                if profiler_step == profiler_step_start:
-                    fluid.profiler.start_profiler(profile_state)
                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)
-                if profiler_step == profiler_step_end:
-                    fluid.profiler.stop_profiler('total', 'trainer_profile.log')
-                    profiler_step += 1
-                else:
-                    profiler_step += 1
                if batch_id % 50 == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                        format(pass_id, batch_id,
-                               loss_val.mean() / args.batch_size,
-                               py_reader.queue.size()))
+                               loss_val.mean(), py_reader.queue.size()))
                if args.with_speed:
                    if batch_id % 1000 == 0 and batch_id != 0:
                        elapsed = (time.clock() - start)
......