infer.py

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import sys
import time
import math
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import utils
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf-8')


def parse_args():
    parser = argparse.ArgumentParser("PaddlePaddle Word2vec infer example")
    parser.add_argument(
        '--dict_path',
        type=str,
        default='./data/data_c/1-billion_dict_word_to_id_',
        help="The path of dic")
    parser.add_argument(
        '--test_dir', type=str, default='test_data', help='test file address')
    parser.add_argument(
        '--print_step', type=int, default='500000', help='print step')
    parser.add_argument(
        '--start_index', type=int, default='0', help='start index')
    parser.add_argument(
        '--last_index', type=int, default='100', help='last index')
    parser.add_argument(
        '--model_dir', type=str, default='model', help='model dir')
    parser.add_argument(
        '--use_cuda', type=int, default='0', help='whether use cuda')
    parser.add_argument(
        '--batch_size', type=int, default='5', help='batch_size')
    parser.add_argument(
        '--emb_size', type=int, default='64', help='batch_size')
    args = parser.parse_args()
    return args


def infer_network(vocab_size, emb_size):
    analogy_a = fluid.data(name="analogy_a", shape=[None], dtype='int64')
    analogy_b = fluid.data(name="analogy_b", shape=[None], dtype='int64')
    analogy_c = fluid.data(name="analogy_c", shape=[None], dtype='int64')
    all_label = fluid.data(name="all_label", shape=[vocab_size], dtype='int64')
    emb_all_label = fluid.embedding(
        input=all_label, size=[vocab_size, emb_size], param_attr="emb")

    emb_a = fluid.embedding(
        input=analogy_a, size=[vocab_size, emb_size], param_attr="emb")
    emb_b = fluid.embedding(
        input=analogy_b, size=[vocab_size, emb_size], param_attr="emb")
    emb_c = fluid.embedding(
        input=analogy_c, size=[vocab_size, emb_size], param_attr="emb")
    target = fluid.layers.elementwise_add(
        fluid.layers.elementwise_sub(emb_b, emb_a), emb_c)
    emb_all_label_l2 = fluid.layers.l2_normalize(x=emb_all_label, axis=1)
    dist = fluid.layers.matmul(x=target, y=emb_all_label_l2, transpose_y=True)
    values, pred_idx = fluid.layers.topk(input=dist, k=4)
    return values, pred_idx


def infer_epoch(args, vocab_size, test_reader, use_cuda, i2w):
    """ inference function """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    emb_size = args.emb_size
    batch_size = args.batch_size
    with fluid.scope_guard(fluid.Scope()):
        main_program = fluid.Program()
        with fluid.program_guard(main_program):
            values, pred = infer_network(vocab_size, emb_size)
            for epoch in range(start_index, last_index + 1):
                copy_program = main_program.clone()
                model_path = model_dir + "/" + str(epoch)
                fluid.io.load_persistables(
                    exe, model_path, main_program=copy_program)
                accum_num = 0
                accum_num_sum = 0.0
                t0 = time.time()
                step_id = 0
                for data in test_reader():
                    step_id += 1
                    b_size = len([dat[0] for dat in data])
                    wa = np.array([dat[0] for dat in data]).astype(
                        "int64").reshape(b_size)
                    wb = np.array([dat[1] for dat in data]).astype(
                        "int64").reshape(b_size)
                    wc = np.array([dat[2] for dat in data]).astype(
                        "int64").reshape(b_size)

                    label = [dat[3] for dat in data]
                    input_word = [dat[4] for dat in data]
                    para = exe.run(copy_program,
                                   feed={
                                       "analogy_a": wa,
                                       "analogy_b": wb,
                                       "analogy_c": wc,
                                       "all_label": np.arange(vocab_size)
                                       .reshape(vocab_size).astype("int64"),
                                   },
                                   fetch_list=[pred.name, values],
                                   return_numpy=False)
                    pre = np.array(para[0])
                    val = np.array(para[1])
                    for ii in range(len(label)):
                        top4 = pre[ii]
                        accum_num_sum += 1
                        for idx in top4:
                            if int(idx) in input_word[ii]:
                                continue
                            if int(idx) == int(label[ii][0]):
                                accum_num += 1
                            break
                    if step_id % 1 == 0:
                        print("step:%d %d " % (step_id, accum_num))

                print("epoch:%d \t acc:%.3f " %
                      (epoch, 1.0 * accum_num / accum_num_sum))


if __name__ == "__main__":
    args = parse_args()
    start_index = args.start_index
    last_index = args.last_index
    test_dir = args.test_dir
    model_dir = args.model_dir
    batch_size = args.batch_size
    dict_path = args.dict_path
    use_cuda = True if args.use_cuda else False
    print("start index: ", start_index, " last_index:", last_index)
    vocab_size, test_reader, id2word = utils.prepare_data(
        test_dir, dict_path, batch_size=batch_size)
    print("vocab_size:", vocab_size)
    infer_epoch(
        args,
        vocab_size,
        test_reader=test_reader,
        use_cuda=use_cuda,
        i2w=id2word)