# infer.py: PaddlePaddle Word2vec inference example.
import paddle
import time
import os
import paddle.fluid as fluid
import numpy as np
from Queue import PriorityQueue
import logging
import argparse
from sklearn.metrics.pairwise import cosine_similarity

# Vocabulary lookup tables shared by the functions below; both start empty
# and are filled in place by BuildWord_IdMap().
word_to_id = {}
id_to_word = {}

# Timestamped log lines; this script logs through the "fluid" logger at INFO.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
    """Parse the command-line options of the word2vec inference script.

    Returns:
        argparse.Namespace with the attributes ``dict_path``,
        ``model_output_dir``, ``rank_num``, ``infer_once`` and
        ``infer_during_train``.
    """
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Word2vec infer example")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./data/1-billion_dict',
        help="The path prefix of the word dictionary; the '_word_to_id_' "
        "suffix is appended to it (default: ./data/1-billion_dict)")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help="The path for model to store (with infer_once please set specify dir to models) (default: models)"
    )
    parser.add_argument(
        '--rank_num',
        type=int,
        default=4,
        help="find rank_num-nearest result for test (default: 4)")
    parser.add_argument(
        '--infer_once',
        action='store_true',
        required=False,
        default=False,
        help='if using infer_once, (default: False)')
    parser.add_argument(
        '--infer_during_train',
        action='store_true',
        required=False,
        default=True,
        help='if using infer_during_train, (default: True)')
    # A store_true flag whose default is already True can never be switched
    # off, so provide the explicit negation writing to the same attribute.
    parser.add_argument(
        '--no_infer_during_train',
        dest='infer_during_train',
        action='store_false',
        required=False,
        help='disable infer_during_train')

    return parser.parse_args()


def BuildWord_IdMap(dict_path):
    """Fill the module-level ``word_to_id`` / ``id_to_word`` tables.

    Reads the file ``dict_path + "_word_to_id_"``. Each line is split on a
    single space; the first field is the word and the second is its integer
    id. Existing entries in the two tables are overwritten, never cleared.

    Args:
        dict_path: path prefix of the dictionary file.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if the second field of a line is not an integer.
    """
    with open(dict_path + "_word_to_id_", 'r') as f:
        for line in f:
            # Split once per line instead of once per field access.
            fields = line.split(' ')
            if len(fields) < 2:
                # Blank or truncated line (e.g. a trailing newline at the end
                # of the file): nothing to record, and indexing would fail.
                continue
            word = fields[0]
            # int() tolerates the trailing newline left on the last field.
            word_id = int(fields[1])
            word_to_id[word] = word_id
            id_to_word[word_id] = word


def inference_prog():
    """Declare the ``embeding`` parameter used for inference.

    Calls ``fluid.layers.create_parameter`` for a float32 parameter named
    "embeding" with shape [1, 1]. The callers in this file invoke it before
    loading persistables, and ``inference_test`` later reads the variable of
    the same name from the scope.
    """
    # NOTE(review): the [1, 1] shape looks like a placeholder that is replaced
    # by the loaded tensor -- confirm against the training script.
    fluid.layers.create_parameter(
        name="embeding", shape=[1, 1], dtype='float32')


def build_test_case(emb):
    """Build the five fixed word-analogy queries.

    For every analogy ``a - b + c`` the query vector is
    ``emb[id(a)] - emb[id(b)] + emb[id(c)]``, with ids looked up in the
    module-level ``word_to_id`` table.

    Args:
        emb: embedding table indexable by word id.

    Returns:
        A list of five ``[query_vector, description]`` pairs, in fixed order.

    Raises:
        KeyError: if one of the analogy words is missing from ``word_to_id``.
    """
    # (minuend, subtrahend, addend, human-readable description)
    analogies = (
        ('boy', 'girl', 'aunt', "boy - girl + aunt = uncle"),
        ('brother', 'sister', 'sisters',
         "brother - sister + sisters = brothers"),
        ('king', 'queen', 'woman', "king - queen + woman = man"),
        ('reluctant', 'reluctantly', 'slowly',
         "reluctant - reluctantly + slowly = slow"),
        ('old', 'older', 'deeper', "old - older + deeper = deep"),
    )
    cases = []
    for minuend, subtrahend, addend, desc in analogies:
        query = (emb[word_to_id[minuend]] - emb[word_to_id[subtrahend]] +
                 emb[word_to_id[addend]])
        cases.append([query, desc])
    return cases


def inference_test(scope, model_dir, args):
    """Run the fixed analogy queries against the loaded embedding and log them.

    Args:
        scope: object whose ``find_var("embeding").get_tensor()`` yields the
            embedding table; it is converted with ``np.array``.
        model_dir: directory the model came from; only used in a log line.
        args: parsed options; ``dict_path`` and ``rank_num`` are read.
    """
    BuildWord_IdMap(args.dict_path)
    logger.info("model_dir is: {}".format(model_dir + "/"))
    emb = np.array(scope.find_var("embeding").get_tensor())
    test_cases = build_test_case(emb)
    logger.info("inference result: ====================")
    for test_emb, desc in test_cases:
        pq = topK(args.rank_num, emb, test_emb)
        logger.info("Test result for {}".format(desc))
        # Drain at most rank_num entries, but stop when the queue runs dry:
        # topK returns fewer entries when the vocabulary has no more than
        # rank_num words, and a blocking get() on an empty queue never returns.
        nearest = []
        while len(nearest) < args.rank_num and not pq.empty():
            nearest.append(pq.get())
        # The queue pops the lowest similarity first; report the best first.
        for rank, entry in enumerate(reversed(nearest)):
            logger.info("{} nearest is {}, rate is {}".format(
                rank, id_to_word[entry.id], entry.priority))


class PQ_Entry(object):
    """Priority-queue item pairing a similarity score with a word id.

    Entries are ordered by ``priority`` only, so a min-heap pops the entry
    with the lowest similarity first.
    """

    def __init__(self, cos_similarity, id):
        # priority: the similarity value used for ordering.
        self.priority = cos_similarity
        # id: the word id the similarity belongs to.
        self.id = id

    def __lt__(self, other):
        # Python 3 ignores __cmp__; heap-based queues only need "<".
        return self.priority < other.priority

    def __cmp__(self, other):
        # Kept for Python 2 callers; written without the cmp() builtin,
        # which no longer exists in Python 3.
        if self.priority < other.priority:
            return -1
        if self.priority > other.priority:
            return 1
        return 0


def topK(k, emb, test_emb):
    """Collect the rows of ``emb`` most similar to ``test_emb``.

    Every row is scored with ``cosine_similarity([row], [test_emb])`` and
    wrapped in a ``PQ_Entry`` carrying the row index.

    Args:
        k: number of entries to keep.
        emb: sequence of embedding rows.
        test_emb: the query vector.

    Returns:
        A ``PriorityQueue`` (capacity ``k + 1``) of ``PQ_Entry`` objects. When
        ``emb`` has more than ``k`` rows it holds the ``k`` highest-scoring
        ones; otherwise it holds one entry per row.
    """
    heap = PriorityQueue(k + 1)
    total = len(emb)
    for row_id, row in enumerate(emb):
        score = cosine_similarity([row], [test_emb])
        # Make room by dropping the current minimum. With at most k rows the
        # queue never fills up, so nothing is evicted in that case.
        if heap.full():
            heap.get()
        heap.put(PQ_Entry(score, row_id))
    if total > k:
        # The queue now holds k + 1 entries; discard the weakest one.
        heap.get()
    return heap


def infer_during_train(args):
    """Poll ``args.model_output_dir`` forever and evaluate finished models.

    Every 60 seconds the directory is listed. Each entry that contains a
    ``_success`` file and has not been evaluated yet is loaded with
    ``fluid.io.load_persistables`` and passed to ``inference_test``. This
    function never returns.

    Args:
        args: parsed options; ``model_output_dir`` is read here, and the
            whole object is forwarded to ``inference_test``.
    """
    exe = fluid.Executor(fluid.CPUPlace())
    scope = fluid.Scope()
    inference_prog()
    # Only models that were actually evaluated are remembered. An entry seen
    # before its "_success" marker exists must stay eligible for a later
    # poll, otherwise it would be skipped for good.
    evaluated = set()
    # Ensures the "no new models" message is logged once per idle period.
    solved_new = True
    while True:
        time.sleep(60)
        current_list = os.listdir(args.model_output_dir)
        increment_models = [
            name for name in current_list
            if name not in evaluated and os.path.exists(
                args.model_output_dir + "/" + name + "/_success")
        ]
        if not increment_models:
            if solved_new:
                solved_new = False
                logger.info("No New models created")
            continue

        solved_new = True
        logger.info("increment_models is : {}".format(increment_models))
        for model in increment_models:
            model_dir = args.model_output_dir + "/" + model
            logger.info("using models from " + model_dir)
            with fluid.scope_guard(scope):
                fluid.io.load_persistables(
                    executor=exe, dirname=model_dir + "/")
                inference_test(scope, model_dir, args)
            evaluated.add(model)


def infer_once(args):
    """Evaluate the single model stored in ``args.model_output_dir``.

    The model is only loaded when the directory contains a ``_success``
    file; otherwise a warning is logged and nothing else happens.

    Args:
        args: parsed options; ``model_output_dir`` is read here, and the
            whole object is forwarded to ``inference_test``.
    """
    # check models file has already been finished
    if not os.path.exists(args.model_output_dir + "/_success"):
        # Report the skip instead of returning silently.
        logger.warning("no _success file found in {}, skip inference".format(
            args.model_output_dir))
        return

    logger.info("using models from " + args.model_output_dir)
    exe = fluid.Executor(fluid.CPUPlace())
    scope = fluid.Scope()
    inference_prog()
    with fluid.scope_guard(scope):
        fluid.io.load_persistables(
            executor=exe, dirname=args.model_output_dir + "/")
        inference_test(scope, args.model_output_dir, args)


if __name__ == '__main__':
    args = parse_args()
    # while setting infer_once please specify the dir to models file with --model_output_dir
    # The two modes are mutually exclusive: infer_during_train is on by
    # default and never returns, so a one-shot run must not fall through
    # into the polling loop.
    if args.infer_once:
        infer_once(args)
    elif args.infer_during_train:
        infer_during_train(args)