predict.py

# -*- coding: UTF-8 -*-
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import ast
import argparse

import numpy as np
import paddle

from data import load_kv_dict, batch_padding_fn, LacDataset, parse_lac_result
from model import BiGruCrf, ViterbiDecoder, ChunkEvaluator

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--base_path", type=str, default=None, help="The folder where the dataset is located.")
parser.add_argument("--word_dict_path", type=str, default=None, help="The path of the word dictionary.")
parser.add_argument("--label_dict_path", type=str, default=None, help="The path of the label dictionary.")
parser.add_argument("--word_rep_dict_path", type=str, default=None, help="The path of the word replacement Dictionary")
parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest seqence.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.")
parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
args = parser.parse_args()
# yapf: enable


def infer(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # Load vocab to create dataset.
    word_vocab = load_kv_dict(
        args.word_dict_path, value_func=np.int64, reverse=True)
    label_vocab = load_kv_dict(
        args.label_dict_path, value_func=np.int64, reverse=True)
    word_rep_dict = load_kv_dict(args.word_rep_dict_path)
    infer_dataset = LacDataset(
        args.base_path, word_vocab, label_vocab, word_rep_dict, mode='infer')

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=True)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_dataset,
        batch_sampler=infer_sampler,
        places=place,
        return_list=True,
        collate_fn=batch_padding_fn(args.max_seq_len))

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size, infer_dataset.vocab_size,
                       infer_dataset.num_labels)
    model = paddle.Model(network)
    model.prepare()

    # Load the model and start predicting
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(
        test_data=infer_loader, batch_size=args.batch_size)

    # Post-processing the lexical analysis results
    lengths = np.array(lengths).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               word_vocab, label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(sent_tags[:10]))


if __name__ == '__main__':
    infer(args)