Commit ac301305 authored by JiabinYang

add custom random generator and refine code

Parent 4780f758
@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
If you would like to use our supported third-party vocab, please run:
```bash
wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
```
## Model
This model implements the skip-gram model of word2vec.
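In skip-gram, each center word is used to predict the words in a randomly shrunken window around it. The sketch below is illustrative only (`gen_pairs` is not part of this repo) and shows the core pair generation:

```python
# Illustrative skip-gram (target, context) pair generation.
import random

def gen_pairs(word_ids, window_size):
    for idx, target in enumerate(word_ids):
        win = random.randint(1, window_size)  # dynamic window, as in word2vec
        start = max(idx - win, 0)
        context = word_ids[start:idx] + word_ids[idx + 1:idx + win + 1]
        for ctx in set(context):
            yield target, ctx

print(list(gen_pairs([0, 1, 2, 3, 4], window_size=2)))
```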
@@ -26,6 +31,7 @@ Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
If you would like to use our supported third-party vocab, please set
## Train
The command line options for training can be listed by `python train.py -h`.
......
#!/bin/bash
wget http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar -zxvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
@@ -6,7 +6,6 @@ from Queue import PriorityQueue
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity
word_to_id = dict()
id_to_word = dict()
@@ -89,11 +88,8 @@ def build_test_case_from_file(args, emb):
exclude_lists = list()
for file_dir in current_list:
with open(args.test_files_dir + "/" + file_dir, 'r') as f:
count = 0
for line in f:
if count == 0:
pass
elif ':' in line:
if ':' in line:
logger.info("{}".format(line))
pass
else:
@@ -110,7 +106,6 @@ def build_test_case_from_file(args, emb):
word_to_id[line.split()[0]],
word_to_id[line.split()[1]], word_to_id[line.split()[2]]
])
count += 1
test_cases = norm(np.array(test_cases))
return test_cases, test_case_descs, test_labels, exclude_lists
@@ -151,8 +146,8 @@ def build_test_case(args, emb):
def norm(x):
emb = np.linalg.norm(x, axis=1, keepdims=True)
return x / emb
y = np.linalg.norm(x, axis=1, keepdims=True)
return x / y
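For reference, `norm` L2-normalizes each embedding row, so dot products between normalized rows are cosine similarities (presumably why the `sklearn` `cosine_similarity` import above was dropped). A quick sanity check with toy values:

```python
import numpy as np

x = np.array([[3.0, 4.0], [1.0, 0.0]])
y = np.linalg.norm(x, axis=1, keepdims=True)  # per-row L2 norms
print(np.linalg.norm(x / y, axis=1))  # -> [1. 1.]
```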
def inference_test(scope, model_dir, args):
@@ -180,9 +175,8 @@ def inference_test(scope, model_dir, args):
logger.info("Test result for {}".format(test_case_desc[i]))
result = results[i]
for j in range(accual_rank):
if (j == accual_rank - 1) and (
result[j][1] == test_labels[i]
): # if the nearest word is what we want
                if result[j][1] == test_labels[i]:  # a hit if the expected word appears among the top results
correct_num += 1
logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
result[j][1]], result[j][0]))
@@ -296,6 +290,8 @@ def infer_once(args):
fluid.io.load_persistables(
executor=exe, dirname=args.model_output_dir + "/")
inference_test(Scope, args.model_output_dir, args)
else:
logger.info("Wrong Directory or save model failed!")
if __name__ == '__main__':
......
@@ -238,13 +238,13 @@ def preprocess(args):
with open(args.dict_path + "_ptable", 'w+') as f2:
for pk, pv in path_table.items():
f2.write(
pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
pk.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
for x in pv)) + '\n')
with open(args.dict_path + "_pcode", 'w+') as f3:
for pck, pcv in path_code.items():
f3.write(
pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
pck.encode("utf-8") + '\t' + ' '.join((str(x).encode("utf-8")
for x in pcv)) + '\n')
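Each written line is thus a UTF-8 word, a tab, then space-separated integers (the tree path and sign code used by hierarchical softmax). A hypothetical round-trip of one such line, mirroring the `np.fromstring` parsing in the reader below:

```python
import numpy as np

line = u"hello\t0 3 7 12"  # assumed sample _ptable line
word, ids = line.split('\t')
print(word, np.fromstring(ids, dtype=int, sep=' '))  # -> hello [ 0  3  7 12]
```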
......
@@ -10,6 +10,24 @@ logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
class NumpyRandomInt(object):
    """Buffered random-integer source: draws buf_size samples at once and
    hands them out one by one, refilling when exhausted, to amortize the
    per-call overhead of numpy's RNG."""

    def __init__(self, a, b, buf_size=1000):
        self.idx = 0
        # np.random.randint samples from [a, b), matching the semantics of
        # the np.random.randint(1, window_size + 1) call this replaces
        # (np.random.random_integers would include b and is deprecated).
        self.buffer = np.random.randint(a, b, buf_size)
        self.a = a
        self.b = b

    def __call__(self):
        if self.idx == len(self.buffer):
            self.buffer = np.random.randint(self.a, self.b,
                                            len(self.buffer))
            self.idx = 0
        result = self.buffer[self.idx]
        self.idx += 1
        return result
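A short usage sketch (assumed values): drawing a buffer of integers at once amortizes RNG overhead, which matters because a window size is sampled for every target word in the corpus:

```python
window_size = 5
rng = NumpyRandomInt(1, window_size + 1, buf_size=8)
samples = [int(rng()) for _ in range(20)]  # refills the buffer twice
assert all(1 <= s <= window_size for s in samples)
```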
class Word2VecReader(object):
def __init__(self,
dict_path,
@@ -57,7 +75,7 @@ class Word2VecReader(object):
with open(dict_path + "_ptable", 'r') as f2:
for line in f2:
self.word_to_path[line.split("\t")[0]] = np.fromstring(
self.word_to_path[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
self.num_non_leaf = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')[0]
@@ -66,11 +84,12 @@ class Word2VecReader(object):
with open(dict_path + "_pcode", 'r') as f3:
for line in f3:
line = line.decode(encoding='UTF-8')
self.word_to_code[line.split("\t")[0]] = np.fromstring(
self.word_to_code[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
print("word_pcode dict_size = " + str(len(self.word_to_code)))
self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)
def get_context_words(self, words, idx, window_size):
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
@@ -78,13 +97,14 @@
idx: input word index
"""
target_window = np.random.randint(1, window_size + 1)
        # keep in mind that there may not be enough words before the target word.
start_point = idx - target_window if (idx - target_window) > 0 else 0
target_window = self.random_generator()
        # there may not be enough words before the target word
        start_point = idx - target_window
        if start_point < 0:
            start_point = 0
end_point = idx + target_window
# context words of the target word
targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
return list(targets)
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return set(targets)
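For a toy input (illustrative, with the sampled window fixed at 2 instead of drawn from the generator):

```python
words, idx, target_window = [10, 11, 12, 13, 14], 2, 2
start_point = max(idx - target_window, 0)
end_point = idx + target_window
print(set(words[start_point:idx] + words[idx + 1:end_point + 1]))
# -> {10, 11, 13, 14}
```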
def train(self, with_hs):
def _reader():
@@ -102,7 +122,7 @@ class Word2VecReader(object):
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id]
else:
@@ -124,13 +144,13 @@
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id], [
self.word_to_code[self.id_to_word[
self.word_to_path[self.id_to_word[
target_id]]
], [
self.word_to_path[self.id_to_word[
self.word_to_code[self.id_to_word[
target_id]]
]
else:
......
@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
@@ -29,7 +29,7 @@ def parse_args():
'--train_data_path',
type=str,
default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
help="The path of training dataset")
help="The path of taining dataset")
parser.add_argument(
'--dict_path',
type=str,
@@ -43,7 +43,7 @@ def parse_args():
parser.add_argument(
'--batch_size',
type=int,
default=100,
default=1000,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--num_passes',
@@ -125,9 +125,13 @@ def parse_args():
return parser.parse_args()
def convert_python_to_tensor(batch_size, sample_reader):
def convert_python_to_tensor(batch_size, sample_reader, is_hs):
def __reader__():
result = [[], [], [], []]
        result = [[], [], [], []] if is_hs else [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
@@ -145,24 +149,21 @@ def convert_python_to_tensor(batch_size, sample_reader):
tensor_result.append(t)
yield tensor_result
result = [[], [], [], []]
            result = [[], [], [], []] if is_hs else [[], []]
return __reader__
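For intuition, here is a plain-Python sketch (illustrative only, without the LoDTensor conversion) of the column-wise batching `__reader__` performs: each sample's fields are appended to per-field columns, a batch is yielded once `batch_size` samples have accumulated, and the columns are reset:

```python
def batch_columns(samples, batch_size, num_fields):
    result = [[] for _ in range(num_fields)]
    for sample in samples:
        for i, fea in enumerate(sample):
            result[i].append(fea)
        if len(result[0]) == batch_size:
            yield result
            result = [[] for _ in range(num_fields)]

pairs = [([1], [2]), ([3], [4]), ([5], [6]), ([7], [8])]
for batch in batch_columns(pairs, batch_size=2, num_fields=2):
    print(batch)  # [[[1], [3]], [[2], [4]]] then [[[5], [7]], [[6], [8]]]
```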
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
# train_reader = paddle.batch(
# paddle.reader.shuffle(
# reader.train((args.with_hs or (not args.with_nce))),
# buf_size=args.batch_size * 100),
# batch_size=args.batch_size)
# py_reader.decorate_paddle_reader(train_reader)
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce)))))
not args.with_nce))), (args.with_hs or (
not args.with_nce))))
place = fluid.CPUPlace()
@@ -192,32 +193,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
profiler_step_end = 30
for pass_id in range(args.num_passes):
epoch_start = time.time()
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.clock()
try:
while True:
if profiler_step == profiler_step_start:
fluid.profiler.start_profiler(profile_state)
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if profiler_step == profiler_step_end:
fluid.profiler.stop_profiler('total', 'trainer_profile.log')
profiler_step += 1
else:
profiler_step += 1
if batch_id % 50 == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 1000 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
......