Unverified commit 62148b78, authored by Jiabin Yang, committed by GitHub

Merge pull request #1545 from JiabinYang/add-word2vec

support no third_party vocab
......@@ -25,6 +25,7 @@ cd data && ./download.sh && cd ..
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
```
If you would like to use our supported third-party vocabulary, set --other_dict_path to the directory where you store the vocabulary you intend to use, and set --with_other_dict to enable it.
## Train
The command line options for training can be listed by `python train.py -h`.
......
......@@ -14,6 +14,11 @@ Download dataset:
```bash
cd data && ./download.sh && cd ..
```
If you would like to use our supported third-party vocab, please run:
```bash
wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
```
## Model
This model implements the skip-gram model of word2vec.
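To make the skip-gram objective concrete, here is a toy sketch of how each word is paired with the words inside a small window around it; the sentence and window size below are illustrative only and not part of the actual data pipeline.
```python
# Toy skip-gram pair generation: every word predicts its nearby words.
sentence = ['the', 'quick', 'brown', 'fox', 'jumps']
window = 2
pairs = []
for idx, target in enumerate(sentence):
    context = sentence[max(idx - window, 0):idx] + sentence[idx + 1:idx + window + 1]
    pairs.extend((target, c) for c in context)
print(pairs[:4])
# [('the', 'quick'), ('the', 'brown'), ('quick', 'the'), ('quick', 'brown')]
```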
......@@ -24,8 +29,10 @@ This model implement a skip-gram model of word2vector.
Preprocess the training data to generate a word dict.
```bash
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
```
If you would like to use our supported third-party vocab, set --other_dict_path to the directory where you saved the vocab you will use, and turn on the --with_other_dict flag to use it.
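The generated dict file stores one `<word> <count>` pair per line, which is the format `reader.py` expects. A minimal sketch for inspecting it, assuming the `data/1-billion_dict` path used in the command above:
```python
# Sketch: load the word dict written by preprocess.py and assign ids in file order,
# the same way reader.py builds its word_to_id_ table.
import io

word_to_id = {}
with io.open('data/1-billion_dict', 'r', encoding='utf-8') as f:
    for word_id, line in enumerate(f):
        word, count = line.split()[0], int(line.split()[1])
        word_to_id[word] = word_id
print('vocabulary size:', len(word_to_id))
```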
## Train
The command line options for training can be listed by `python train.py -h`.
......
......@@ -2,11 +2,9 @@ import time
import os
import paddle.fluid as fluid
import numpy as np
from Queue import PriorityQueue
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity
word_to_id = dict()
id_to_word = dict()
......@@ -51,8 +49,8 @@ def parse_args():
'--test_acc',
action='store_true',
required=False,
default=True,
help='if using test_files , (default: True)')
default=False,
help='whether to use test_files (default: False)')
parser.add_argument(
'--test_files_dir',
type=str,
......@@ -85,14 +83,12 @@ def build_test_case_from_file(args, emb):
logger.info("test files list: {}".format(current_list))
test_cases = list()
test_labels = list()
test_case_descs = list()
exclude_lists = list()
for file_dir in current_list:
with open(args.test_files_dir + "/" + file_dir, 'r') as f:
count = 0
for line in f:
if count == 0:
pass
elif ':' in line:
if ':' in line:
logger.info("{}".format(line))
pass
else:
......@@ -102,14 +98,15 @@ def build_test_case_from_file(args, emb):
line.split()[2]]]
test_case_desc = line.split()[0] + " - " + line.split()[
1] + " + " + line.split()[2] + " = " + line.split()[3]
test_cases.append([test_case, test_case_desc])
test_cases.append(test_case)
test_case_descs.append(test_case_desc)
test_labels.append(word_to_id[line.split()[3]])
exclude_lists.append([
word_to_id[line.split()[0]],
word_to_id[line.split()[1]], word_to_id[line.split()[2]]
])
count += 1
return test_cases, test_labels, exclude_lists
test_cases = norm(np.array(test_cases))
return test_cases, test_case_descs, test_labels, exclude_lists
def build_small_test_case(emb):
......@@ -133,8 +130,27 @@ def build_small_test_case(emb):
'deeper']]
desc5 = "old - older + deeper = deep"
label5 = word_to_id["deep"]
return [[emb1, desc1], [emb2, desc2], [emb3, desc3], [emb4, desc4],
[emb5, desc5]], [label1, label2, label3, label4, label5]
emb6 = emb[word_to_id['boy']]
desc6 = "boy"
label6 = word_to_id["boy"]
emb7 = emb[word_to_id['king']]
desc7 = "king"
label7 = word_to_id["king"]
emb8 = emb[word_to_id['sun']]
desc8 = "sun"
label8 = word_to_id["sun"]
emb9 = emb[word_to_id['key']]
desc9 = "key"
label9 = word_to_id["key"]
test_cases = [emb1, emb2, emb3, emb4, emb5, emb6, emb7, emb8, emb9]
test_case_desc = [
desc1, desc2, desc3, desc4, desc5, desc6, desc7, desc8, desc9
]
test_labels = [
label1, label2, label3, label4, label5, label6, label7, label8, label9
]
return norm(np.array(test_cases)), test_case_desc, test_labels
def build_test_case(args, emb):
......@@ -144,86 +160,80 @@ def build_test_case(args, emb):
return build_small_test_case(emb)
def norm(x):
y = np.linalg.norm(x, axis=1, keepdims=True)
return x / y
def inference_test(scope, model_dir, args):
BuildWord_IdMap(args.dict_path)
logger.info("model_dir is: {}".format(model_dir + "/"))
emb = np.array(scope.find_var("embeding").get_tensor())
x = norm(emb)
logger.info("inference result: ====================")
test_cases = list()
test_cases = None
test_case_desc = list()
test_labels = list()
exclude_lists = list()
if args.test_acc:
test_cases, test_labels, exclude_lists = build_test_case(args, emb)
test_cases, test_case_desc, test_labels, exclude_lists = build_test_case(
args, emb)
else:
test_cases, test_labels = build_test_case(args, emb)
test_cases, test_case_desc, test_labels = build_test_case(args, emb)
exclude_lists = [[-1]]
accual_rank = 1 if args.test_acc else args.rank_num
correct_num = 0
cosine_similarity_matrix = np.dot(test_cases, x.T)
results = topKs(accual_rank, cosine_similarity_matrix, exclude_lists,
args.test_acc)
for i in range(len(test_labels)):
pq = None
if args.test_acc:
pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[i],
is_acc=True)
else:
pq = pq = topK(
accual_rank,
emb,
test_cases[i][0],
exclude_lists[0],
is_acc=False)
logger.info("Test result for {}".format(test_cases[i][1]))
logger.info("Test result for {}".format(test_case_desc[i]))
result = results[i]
for j in range(accual_rank):
pq_tmps = pq.get()
if (j == accual_rank - 1) and (
pq_tmps.id == test_labels[i]
): # if the nearest word is what we want
if result[j][1] == test_labels[
i]: # if the nearest word is what we want
correct_num += 1
logger.info("{} nearest is {}, rate is {}".format(
accual_rank - j, id_to_word[pq_tmps.id], pq_tmps.priority))
acc = correct_num / len(test_labels)
logger.info("Test acc is: {}, there are {} / {}}".format(acc, correct_num,
len(test_labels)))
logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
result[j][1]], result[j][0]))
logger.info("Test acc is: {}, there are {} / {}".format(correct_num / len(
test_labels), correct_num, len(test_labels)))
class PQ_Entry(object):
def __init__(self, cos_similarity, id):
self.priority = cos_similarity
self.id = id
def __cmp__(self, other):
return cmp(self.priority, other.priority)
def topK(k, cosine_similarity_list, exclude_list, is_acc=False):
if k == 1 and is_acc: # accelerate acc calculate
max = cosine_similarity_list[0]
id = 0
for i in range(len(cosine_similarity_list)):
if cosine_similarity_list[i] >= max and (i not in exclude_list):
max = cosine_similarity_list[i]
id = i
else:
pass
return [[max, id]]
else:
result = list()
result_index = np.argpartition(cosine_similarity_list, -k)[-k:]
for index in result_index:
result.append([cosine_similarity_list[index], index])
result.sort(reverse=True)
return result
def topK(k, emb, test_emb, exclude_list, is_acc=False):
pq = PriorityQueue(k + 1)
while not pq.empty():
try:
pq.get(False)
except Empty:
continue
pq.task_done()
if len(emb) <= k:
for i in range(len(emb)):
x = cosine_similarity([emb[i]], [test_emb])
pq.put(PQ_Entry(x, i))
return pq
def topKs(k, cosine_similarity_matrix, exclude_lists, is_acc=False):
results = list()
result_queues = list()
correct_num = 0
for i in range(len(emb)):
if is_acc and (i in exclude_list):
pass
for i in range(cosine_similarity_matrix.shape[0]):
tmp_pq = None
if is_acc:
tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[i],
is_acc)
else:
x = cosine_similarity([emb[i]], [test_emb])
pq_e = PQ_Entry(x, i)
if pq.full():
pq.get()
pq.put(pq_e)
pq.get()
return pq
tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[0],
is_acc)
result_queues.append(tmp_pq)
return result_queues
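For reference, the refactor above replaces the per-pair `sklearn` cosine similarity and `PriorityQueue` with one matrix product over L2-normalized embeddings plus `np.argpartition`. A numpy-only sketch of that retrieval path on random stand-in data (the shapes and `k` are illustrative):
```python
# Vectorized nearest-neighbor lookup: normalized rows turn cosine similarity
# into a dot product; argpartition picks the k best without a full sort.
import numpy as np

emb = np.random.rand(1000, 64)        # stand-in for the trained embedding table
queries = np.random.rand(3, 64)       # stand-in for the test-case vectors
emb_n = emb / np.linalg.norm(emb, axis=1, keepdims=True)          # norm(emb)
queries_n = queries / np.linalg.norm(queries, axis=1, keepdims=True)

sims = np.dot(queries_n, emb_n.T)     # cosine_similarity_matrix, one row per query
k = 4
for row in sims:
    top = np.argpartition(row, -k)[-k:]        # unordered indices of the k largest
    top = top[np.argsort(row[top])[::-1]]      # order them by similarity, descending
    print([(float(row[i]), int(i)) for i in top])
```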
def infer_during_train(args):
......@@ -235,8 +245,6 @@ def infer_during_train(args):
while True:
time.sleep(60)
current_list = os.listdir(args.model_output_dir)
# logger.info("current_list is : {}".format(current_list))
# logger.info("model_file_list is : {}".format(model_file_list))
if set(model_file_list) == set(current_list):
if solved_new:
solved_new = False
......@@ -271,6 +279,8 @@ def infer_once(args):
fluid.io.load_persistables(
executor=exe, dirname=args.model_output_dir + "/")
inference_test(Scope, args.model_output_dir, args)
else:
logger.info("Wrong Directory or save model failed!")
if __name__ == '__main__':
......
......@@ -95,8 +95,7 @@ def skip_gram_word2vec(dict_size,
capacity=64, feed_list=datas, name='py_reader', use_double_buffer=True)
words = fluid.layers.read_file(py_reader)
emb = fluid.layers.embedding(
target_emb = fluid.layers.embedding(
input=words[0],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
......@@ -104,16 +103,23 @@ def skip_gram_word2vec(dict_size,
name='embeding',
initializer=fluid.initializer.Normal(scale=1 /
math.sqrt(dict_size))))
context_emb = fluid.layers.embedding(
input=words[1],
is_sparse=is_sparse,
size=[dict_size, embedding_size],
param_attr=fluid.ParamAttr(
name='embeding',
initializer=fluid.initializer.Normal(scale=1 /
math.sqrt(dict_size))))
cost, cost_nce, cost_hs = None, None, None
if with_nce:
cost_nce = nce_layer(emb, words[1], embedding_size, dict_size, 5,
cost_nce = nce_layer(target_emb, words[1], embedding_size, dict_size, 5,
"uniform", word_frequencys, None)
cost = cost_nce
if with_hsigmoid:
cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
is_sparse)
cost_hs = hsigmoid_layer(context_emb, words[0], words[2], words[3],
dict_size, is_sparse)
cost = cost_hs
if with_nce and with_hsigmoid:
cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
......
......@@ -3,6 +3,7 @@
import re
import six
import argparse
import io
prog = re.compile("[^a-z ]", flags=0)
word_count = dict()
......@@ -83,7 +84,6 @@ def native_to_unicode(s):
return _to_unicode(s)
except UnicodeDecodeError:
res = _to_unicode(s, ignore_errors=True)
tf.logging.info("Ignoring Unicode error, outputting: %s" % res)
return res
......@@ -199,34 +199,30 @@ def preprocess(args):
# word to count
if args.with_other_dict:
with open(args.other_dict_path, 'r') as f:
with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
for line in f:
word_count[native_to_unicode(line.strip())] = 1
if args.is_local:
for i in range(1, 100):
with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
i)) as f:
with io.open(
args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
encoding='utf-8') as f:
for line in f:
line = strip_lines(line)
words = line.split()
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
# with open(args.data_path + "/tmp.txt") as f:
# for line in f:
# print("line before strip is: {}".format(line))
# line = strip_lines(line, word_count)
# print("line after strip is: {}".format(line))
# words = line.split()
# print("words after split is: {}".format(words))
# for item in words:
# if item in word_count:
# word_count[item] = word_count[item] + 1
# else:
# word_count[item] = 1
if args.with_other_dict:
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[native_to_unicode('<UNK>')] += 1
else:
for item in words:
if item in word_count:
word_count[item] = word_count[item] + 1
else:
word_count[item] = 1
item_to_remove = []
for item in word_count:
if word_count[item] <= args.freq:
......@@ -236,21 +232,17 @@ def preprocess(args):
path_table, path_code, word_code_len = build_Huffman(word_count, 40)
with open(args.dict_path, 'w+') as f:
with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
for k, v in word_count.items():
f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
f.write(k + " " + str(v) + '\n')
with open(args.dict_path + "_ptable", 'w+') as f2:
with io.open(args.dict_path + "_ptable", 'w+', encoding='utf-8') as f2:
for pk, pv in path_table.items():
f2.write(
pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pv)) + '\n')
f2.write(pk + '\t' + ' '.join((str(x) for x in pv)) + '\n')
with open(args.dict_path + "_pcode", 'w+') as f3:
with io.open(args.dict_path + "_pcode", 'w+', encoding='utf-8') as f3:
for pck, pcv in path_code.items():
f3.write(
pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
for x in pcv)) + '\n')
f3.write(pck + '\t' + ' '.join((str(x) for x in pcv)) + '\n')
if __name__ == "__main__":
......
......@@ -2,14 +2,32 @@
import numpy as np
import preprocess
import logging
import io
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)
class NumpyRandomInt(object):
def __init__(self, a, b, buf_size=1000):
self.idx = 0
self.buffer = np.random.random_integers(a, b, buf_size)
self.a = a
self.b = b
def __call__(self):
if self.idx == len(self.buffer):
self.buffer = np.random.random_integers(self.a, self.b,
len(self.buffer))
self.idx = 0
result = self.buffer[self.idx]
self.idx += 1
return result
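A minimal usage sketch of `NumpyRandomInt` (the range and `buf_size` below are illustrative): it pre-draws a buffer of integers in `[a, b]` and refills it only when exhausted, instead of calling `np.random` once per sample.
```python
# Usage sketch, assuming the NumpyRandomInt class above is importable from reader.py.
from reader import NumpyRandomInt

rng = NumpyRandomInt(1, 5, buf_size=8)       # draws integers in the inclusive range [1, 5]
window_sizes = [rng() for _ in range(20)]    # the buffer is refilled after every 8 draws
print(window_sizes)
```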
class Word2VecReader(object):
def __init__(self,
dict_path,
......@@ -24,6 +42,7 @@ class Word2VecReader(object):
self.num_non_leaf = 0
self.word_to_id_ = dict()
self.id_to_word = dict()
self.word_count = dict()
self.word_to_path = dict()
self.word_to_code = dict()
self.trainer_id = trainer_id
......@@ -33,20 +52,19 @@ class Word2VecReader(object):
word_counts = []
word_id = 0
with open(dict_path, 'r') as f:
with io.open(dict_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.decode(encoding='UTF-8')
word, count = line.split()[0], int(line.split()[1])
self.word_count[word] = count
self.word_to_id_[word] = word_id
self.id_to_word[word_id] = word #build id to word dict
word_id += 1
word_counts.append(count)
word_all_count += count
with open(dict_path + "_word_to_id_", 'w+') as f6:
with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
for k, v in self.word_to_id_.items():
f6.write(
k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
f6.write(k + " " + str(v) + '\n')
self.dict_size = len(self.word_to_id_)
self.word_frequencys = [
......@@ -55,22 +73,22 @@ class Word2VecReader(object):
print("dict_size = " + str(
self.dict_size)) + " word_all_count = " + str(word_all_count)
with open(dict_path + "_ptable", 'r') as f2:
with io.open(dict_path + "_ptable", 'r', encoding='utf-8') as f2:
for line in f2:
self.word_to_path[line.split("\t")[0]] = np.fromstring(
self.word_to_path[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
self.num_non_leaf = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')[0]
print("word_ptable dict_size = " + str(len(self.word_to_path)))
with open(dict_path + "_pcode", 'r') as f3:
with io.open(dict_path + "_pcode", 'r', encoding='utf-8') as f3:
for line in f3:
line = line.decode(encoding='UTF-8')
self.word_to_code[line.split("\t")[0]] = np.fromstring(
self.word_to_code[line.split('\t')[0]] = np.fromstring(
line.split('\t')[1], dtype=int, sep=' ')
print("word_pcode dict_size = " + str(len(self.word_to_code)))
self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)
def get_context_words(self, words, idx, window_size):
def get_context_words(self, words, idx):
"""
Get the context word list of target word.
......@@ -78,31 +96,34 @@ class Word2VecReader(object):
idx: input word index
window_size: window size
"""
target_window = np.random.randint(1, window_size + 1)
# need to keep in mind that there may not be enough words before the target word.
start_point = idx - target_window if (idx - target_window) > 0 else 0
target_window = self.random_generator()
start_point = idx - target_window
if start_point < 0:
start_point = 0
end_point = idx + target_window
# context words of the target word
targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
return list(targets)
targets = words[start_point:idx] + words[idx + 1:end_point + 1]
return set(targets)
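A deterministic, standalone sketch of the dynamic-window selection above, with a fixed `target_window` instead of `NumpyRandomInt` and toy word ids:
```python
# Context selection: take up to target_window ids on each side of the target index.
def context_words(words, idx, target_window):
    start = max(idx - target_window, 0)
    end = idx + target_window
    return set(words[start:idx] + words[idx + 1:end + 1])

print(context_words([10, 11, 12, 13, 14], idx=2, target_window=2))
# -> {10, 11, 13, 14}: every id within 2 positions of the target id 12
```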
def train(self, with_hs):
def _reader():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
with io.open(
self.data_path_ + "/" + file, 'r',
encoding='utf-8') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
line = preprocess.strip_lines(line, self.word_count)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id]
else:
......@@ -111,26 +132,28 @@ class Word2VecReader(object):
def _reader_hs():
for file in self.filelist:
with open(self.data_path_ + "/" + file, 'r') as f:
with io.open(
self.data_path_ + "/" + file, 'r',
encoding='utf-8') as f:
logger.info("running data in {}".format(self.data_path_ +
"/" + file))
count = 1
for line in f:
if self.trainer_id == count % self.trainer_num:
line = preprocess.strip_lines(line)
line = preprocess.strip_lines(line, self.word_count)
word_ids = [
self.word_to_id_[word] for word in line.split()
if word in self.word_to_id_
]
for idx, target_id in enumerate(word_ids):
context_word_ids = self.get_context_words(
word_ids, idx, self.window_size_)
word_ids, idx)
for context_id in context_word_ids:
yield [target_id], [context_id], [
self.word_to_code[self.id_to_word[
self.word_to_path[self.id_to_word[
target_id]]
], [
self.word_to_path[self.id_to_word[
self.word_to_code[self.id_to_word[
target_id]]
]
else:
......@@ -144,13 +167,20 @@ class Word2VecReader(object):
if __name__ == "__main__":
window_size = 10
window_size = 5
reader = Word2VecReader(
"./data/1-billion_dict",
"./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/",
["news.en-00001-of-00100"], 0, 1)
reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
i = 0
for x, y in reader.train()():
# print(reader.train(True))
for x, y, z, f in reader.train(True)():
print("x: " + str(x))
print("y: " + str(y))
print("path: " + str(z))
print("code: " + str(f))
print("\n")
if i == 10:
exit(0)
......
......@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
import paddle
import paddle.fluid as fluid
from paddle.fluid.executor import global_scope
import six
import reader
from network_conf import skip_gram_word2vec
from infer import inference_test
......@@ -29,7 +29,7 @@ def parse_args():
'--train_data_path',
type=str,
default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
help="The path of training dataset")
help="The path of taining dataset")
parser.add_argument(
'--dict_path',
type=str,
......@@ -43,7 +43,7 @@ def parse_args():
parser.add_argument(
'--batch_size',
type=int,
default=100,
default=1000,
help="The size of mini-batch (default:100)")
parser.add_argument(
'--num_passes',
......@@ -125,14 +125,44 @@ def parse_args():
return parser.parse_args()
def convert_python_to_tensor(batch_size, sample_reader, is_hs):
def __reader__():
result = None
if is_hs:
result = [[], [], [], []]
else:
result = [[], []]
for sample in sample_reader():
for i, fea in enumerate(sample):
result[i].append(fea)
if len(result[0]) == batch_size:
tensor_result = []
for tensor in result:
t = fluid.Tensor()
dat = np.array(tensor, dtype='int64')
if len(dat.shape) > 2:
dat = dat.reshape((dat.shape[0], dat.shape[2]))
elif len(dat.shape) == 1:
dat = dat.reshape((-1, 1))
t.set(dat, fluid.CPUPlace())
tensor_result.append(t)
yield tensor_result
if is_hs:
result = [[], [], [], []]
else:
result = [[], []]
return __reader__
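A simplified, numpy-only sketch of the column-wise batching that `convert_python_to_tensor` performs, leaving out the `fluid.Tensor` conversion; the two-slot toy reader below is hypothetical and mirrors the NCE case:
```python
import numpy as np

def toy_sample_reader():
    # hypothetical ([target_id], [context_id]) samples, as yielded by reader.train()
    for sample in ([1], [2]), ([3], [4]), ([5], [6]), ([7], [8]):
        yield sample

def batch_columns(batch_size, sample_reader):
    result = [[], []]                        # one list per feature slot
    for sample in sample_reader():
        for i, fea in enumerate(sample):
            result[i].append(fea)
        if len(result[0]) == batch_size:     # a full batch: emit int64 columns
            yield [np.array(col, dtype='int64').reshape((-1, 1)) for col in result]
            result = [[], []]

for targets, contexts in batch_columns(2, toy_sample_reader):
    print(targets.shape, contexts.shape)     # (2, 1) (2, 1)
```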
def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train((args.with_hs or (not args.with_nce))),
buf_size=args.batch_size * 100),
batch_size=args.batch_size)
py_reader.decorate_paddle_reader(train_reader)
py_reader.decorate_tensor_provider(
convert_python_to_tensor(args.batch_size,
reader.train((args.with_hs or (
not args.with_nce))), (args.with_hs or (
not args.with_nce))))
place = fluid.CPUPlace()
......@@ -140,6 +170,7 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.use_experimental_executor = True
print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
......@@ -161,32 +192,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
profiler_step_end = 30
for pass_id in range(args.num_passes):
epoch_start = time.time()
py_reader.start()
time.sleep(10)
epoch_start = time.time()
batch_id = 0
start = time.clock()
try:
while True:
if profiler_step == profiler_step_start:
fluid.profiler.start_profiler(profile_state)
loss_val = train_exe.run(fetch_list=[loss.name])
loss_val = np.mean(loss_val)
if profiler_step == profiler_step_end:
fluid.profiler.stop_profiler('total', 'trainer_profile.log')
profiler_step += 1
else:
profiler_step += 1
if batch_id % 50 == 0:
logger.info(
"TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
format(pass_id, batch_id,
loss_val.mean() / args.batch_size,
py_reader.queue.size()))
loss_val.mean(), py_reader.queue.size()))
if args.with_speed:
if batch_id % 1000 == 0 and batch_id != 0:
elapsed = (time.clock() - start)
......@@ -256,7 +278,7 @@ def train(args):
optimizer = None
if args.with_Adam:
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
optimizer = fluid.optimizer.Adam(learning_rate=1e-4, lazy_mode=True)
else:
optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
......