# infer.py -- word2vec inference / analogy-evaluation utilities
import time
import os
import paddle.fluid as fluid
import numpy as np
from Queue import PriorityQueue
import logging
import argparse
import preprocess
from sklearn.metrics.pairwise import cosine_similarity

word_to_id = dict()
id_to_word = dict()

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("fluid")
logger.setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle Word2vec infer example")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./data/1-billion_dict',
        help="The path of training dataset")
    parser.add_argument(
        '--model_output_dir',
        type=str,
        default='models',
        help="The path for model to store (with infer_once please set specify dir to models) (default: models)"
    )
    parser.add_argument(
        '--rank_num',
        type=int,
        default=4,
        help="find rank_num-nearest result for test (default: 4)")
    parser.add_argument(
        '--infer_once',
        action='store_true',
        required=False,
        default=False,
        help='if using infer_once, (default: False)')
J
JiabinYang 已提交
44 45 46 47 48 49
    parser.add_argument(
        '--infer_during_train',
        action='store_true',
        required=False,
        default=True,
        help='if using infer_during_train, (default: True)')
50 51 52 53
    parser.add_argument(
        '--test_acc',
        action='store_true',
        required=False,
J
JiabinYang 已提交
54 55
        default=False,
        help='if using test_files , (default: False)')
56 57 58 59 60 61 62 63 64 65
    parser.add_argument(
        '--test_files_dir',
        type=str,
        default='test',
        help="The path for test_files) (default: test)")
    parser.add_argument(
        '--test_batch_size',
        type=int,
        default=1000,
        help="test used batch size (default: 1000)")
66 67 68 69 70 71 72 73 74 75 76

    return parser.parse_args()


def BuildWord_IdMap(dict_path):
    with open(dict_path + "_word_to_id_", 'r') as f:
        for line in f:
            word_to_id[line.split(' ')[0]] = int(line.split(' ')[1])
            id_to_word[int(line.split(' ')[1])] = line.split(' ')[0]


77
def inference_prog():  # just to create program for test
78 79 80 81
    fluid.layers.create_parameter(
        shape=[1, 1], dtype='float32', name="embeding")


82 83 84 85 86 87
def build_test_case_from_file(args, emb):
    logger.info("test files dir: {}".format(args.test_files_dir))
    current_list = os.listdir(args.test_files_dir)
    logger.info("test files list: {}".format(current_list))
    test_cases = list()
    test_labels = list()
J
JiabinYang 已提交
88
    test_case_descs = list()
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
    exclude_lists = list()
    for file_dir in current_list:
        with open(args.test_files_dir + "/" + file_dir, 'r') as f:
            count = 0
            for line in f:
                if count == 0:
                    pass
                elif ':' in line:
                    logger.info("{}".format(line))
                    pass
                else:
                    line = preprocess.strip_lines(line, word_to_id)
                    test_case = emb[word_to_id[line.split()[0]]] - emb[
                        word_to_id[line.split()[1]]] + emb[word_to_id[
                            line.split()[2]]]
                    test_case_desc = line.split()[0] + " - " + line.split()[
                        1] + " + " + line.split()[2] + " = " + line.split()[3]
J
JiabinYang 已提交
106 107
                    test_cases.append(test_case)
                    test_case_descs.append(test_case_desc)
108 109 110 111 112 113
                    test_labels.append(word_to_id[line.split()[3]])
                    exclude_lists.append([
                        word_to_id[line.split()[0]],
                        word_to_id[line.split()[1]], word_to_id[line.split()[2]]
                    ])
                count += 1
J
JiabinYang 已提交
114 115
            test_cases = norm(np.array(test_cases))
    return test_cases, test_case_descs, test_labels, exclude_lists
116 117 118


def build_small_test_case(emb):
119 120 121
    emb1 = emb[word_to_id['boy']] - emb[word_to_id['girl']] + emb[word_to_id[
        'aunt']]
    desc1 = "boy - girl + aunt = uncle"
122
    label1 = word_to_id["uncle"]
123 124 125
    emb2 = emb[word_to_id['brother']] - emb[word_to_id['sister']] + emb[
        word_to_id['sisters']]
    desc2 = "brother - sister + sisters = brothers"
126
    label2 = word_to_id["brothers"]
127 128 129
    emb3 = emb[word_to_id['king']] - emb[word_to_id['queen']] + emb[word_to_id[
        'woman']]
    desc3 = "king - queen + woman = man"
130
    label3 = word_to_id["man"]
131 132 133
    emb4 = emb[word_to_id['reluctant']] - emb[word_to_id['reluctantly']] + emb[
        word_to_id['slowly']]
    desc4 = "reluctant - reluctantly + slowly = slow"
134
    label4 = word_to_id["slow"]
135 136 137
    emb5 = emb[word_to_id['old']] - emb[word_to_id['older']] + emb[word_to_id[
        'deeper']]
    desc5 = "old - older + deeper = deep"
138
    label5 = word_to_id["deep"]
J
JiabinYang 已提交
139 140 141 142 143

    test_cases = [emb1, emb2, emb3, emb4, emb5]
    test_case_desc = [desc1, desc2, desc3, desc4, desc5]
    test_labels = [label1, label2, label3, label4, label5]
    return norm(np.array(test_cases)), test_case_desc, test_labels
144 145 146 147 148 149 150


def build_test_case(args, emb):
    if args.test_acc:
        return build_test_case_from_file(args, emb)
    else:
        return build_small_test_case(emb)
151 152


J
JiabinYang 已提交
153 154 155 156 157
def norm(x):
    emb = np.linalg.norm(x, axis=1, keepdims=True)
    return x / emb


J
JiabinYang 已提交
158
def inference_test(scope, model_dir, args):
159 160
    BuildWord_IdMap(args.dict_path)
    logger.info("model_dir is: {}".format(model_dir + "/"))
J
JiabinYang 已提交
161
    emb = np.array(scope.find_var("embeding").get_tensor())
J
JiabinYang 已提交
162
    x = norm(emb)
163
    logger.info("inference result: ====================")
J
JiabinYang 已提交
164 165
    test_cases = None
    test_case_desc = list()
166 167 168
    test_labels = list()
    exclude_lists = list()
    if args.test_acc:
J
JiabinYang 已提交
169 170
        test_cases, test_case_desc, test_labels, exclude_lists = build_test_case(
            args, emb)
171
    else:
J
JiabinYang 已提交
172
        test_cases, test_case_desc, test_labels = build_test_case(args, emb)
173 174 175
        exclude_lists = [[-1]]
    accual_rank = 1 if args.test_acc else args.rank_num
    correct_num = 0
J
JiabinYang 已提交
176 177 178
    cosine_similarity_matrix = np.dot(test_cases, x.T)
    results = topKs(accual_rank, cosine_similarity_matrix, exclude_lists,
                    args.test_acc)
179
    for i in range(len(test_labels)):
J
JiabinYang 已提交
180 181
        logger.info("Test result for {}".format(test_case_desc[i]))
        result = results[i]
182 183
        for j in range(accual_rank):
            if (j == accual_rank - 1) and (
J
JiabinYang 已提交
184
                    result[j][1] == test_labels[i]
185 186
            ):  # if the nearest word is what we want 
                correct_num += 1
J
JiabinYang 已提交
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
            logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
                result[j][1]], result[j][0]))
    logger.info("Test acc is: {}, there are {} / {}".format(correct_num / len(
        test_labels), correct_num, len(test_labels)))


def topK(k, cosine_similarity_list, exclude_list, is_acc=False):
    if k == 1 and is_acc:  # accelerate acc calculate
        max = cosine_similarity_list[0]
        id = 0
        for i in range(len(cosine_similarity_list)):
            if cosine_similarity_list[i] >= max and (i not in exclude_list):
                max = cosine_similarity_list[i]
                id = i
            else:
                pass
        return [[max, id]]
    else:
        pq = PriorityQueue(k + 1)
        while not pq.empty():
            try:
                pq.get(False)
            except Empty:
                continue
            pq.task_done()
        if len(cosine_similarity_list) <= k:
            for i in range(len(cosine_similarity_list)):
                pq.put([cosine_similarity_list[i], i])
            return pq

        for i in range(len(cosine_similarity_list)):
            if is_acc and (i in exclude_list):
                pass
            else:
                if pq.full():
                    pq.get()
                pq.put([cosine_similarity_list[i], i])
        pq.get()
225 226
        return pq

J
JiabinYang 已提交
227 228 229 230 231 232 233 234 235 236 237

def topKs(k, cosine_similarity_matrix, exclude_lists, is_acc=False):
    results = list()
    result_queues = list()
    correct_num = 0

    for i in range(cosine_similarity_matrix.shape[0]):
        tmp_pq = None
        if is_acc:
            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[i],
                          is_acc)
238
        else:
J
JiabinYang 已提交
239 240 241 242 243 244 245 246 247 248 249 250 251
            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[0],
                          is_acc)
        result_queues.append(tmp_pq)
    if is_acc and k == 1:  # accelerate acc calculate
        return result_queues
    else:
        for pq in result_queues:
            tmp_result = list()
            for i in range(k):
                tmp_result.append(pq.get())
            tmp_result.reverse()
            results.append(tmp_result)
        return results
252 253 254 255


def infer_during_train(args):
    model_file_list = list()
J
JiabinYang 已提交
256 257 258
    exe = fluid.Executor(fluid.CPUPlace())
    Scope = fluid.Scope()
    inference_prog()
259
    solved_new = True
260
    while True:
J
JiabinYang 已提交
261
        time.sleep(60)
262
        current_list = os.listdir(args.model_output_dir)
J
JiabinYang 已提交
263 264
        # logger.info("current_list is : {}".format(current_list))
        # logger.info("model_file_list is : {}".format(model_file_list))
265
        if set(model_file_list) == set(current_list):
266 267 268
            if solved_new:
                solved_new = False
                logger.info("No New models created")
269 270
            pass
        else:
271
            solved_new = True
272 273 274 275 276 277 278 279 280
            increment_models = list()
            for f in current_list:
                if f not in model_file_list:
                    increment_models.append(f)
            logger.info("increment_models is : {}".format(increment_models))
            for model in increment_models:
                model_dir = args.model_output_dir + "/" + model
                if os.path.exists(model_dir + "/_success"):
                    logger.info("using models from " + model_dir)
J
JiabinYang 已提交
281 282 283 284
                    with fluid.scope_guard(Scope):
                        fluid.io.load_persistables(
                            executor=exe, dirname=model_dir + "/")
                        inference_test(Scope, model_dir, args)
285 286 287 288
            model_file_list = current_list


def infer_once(args):
J
JiabinYang 已提交
289 290
    # check models file has already been finished
    if os.path.exists(args.model_output_dir + "/_success"):
291
        logger.info("using models from " + args.model_output_dir)
J
JiabinYang 已提交
292 293 294 295 296 297 298
        exe = fluid.Executor(fluid.CPUPlace())
        Scope = fluid.Scope()
        inference_prog()
        with fluid.scope_guard(Scope):
            fluid.io.load_persistables(
                executor=exe, dirname=args.model_output_dir + "/")
            inference_test(Scope, args.model_output_dir, args)
299 300 301 302 303 304 305


if __name__ == '__main__':
    args = parse_args()
    # while setting infer_once please specify the dir to models file with --model_output_dir
    if args.infer_once:
        infer_once(args)
J
JiabinYang 已提交
306
    elif args.infer_during_train:
307
        infer_during_train(args)
J
JiabinYang 已提交
308 309
    else:
        pass