# train.py (4.9 KB), committed by Zeyu Chen
#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import ast
import math
import argparse

import numpy as np
import paddle

from data import load_kv_dict, batch_padding_fn, LacDataset
from model import BiGruCrf
from paddlenlp.layers.crf import LinearChainCrfLoss, ViterbiDecoder
from paddlenlp.metrics.chunk_evaluator import ChunkEvaluator

# yapf: disable
# Command-line options of the training script. They are parsed at import time
# and the resulting namespace is stored in the module-level ``args``.
#
# The module docstring is passed as ``description``: the first positional
# parameter of ArgumentParser is ``prog``, so passing ``__doc__`` positionally
# would replace the program name shown in usage/help with the docstring text.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--base_path", type=str, default=None, help="The folder where the dataset is located.")
parser.add_argument("--word_dict_path", type=str, default=None, help="The path of the word dictionary.")
parser.add_argument("--label_dict_path", type=str, default=None, help="The path of the label dictionary.")
parser.add_argument("--word_rep_dict_path", type=str, default=None, help="The path of the word replacement Dictionary")
parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
parser.add_argument("--model_save_dir", type=str, default=None, help="The model will be saved in this path.")
parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.")
parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest sequence.")
# ast.literal_eval turns the literal strings "True"/"False" into real booleans
# (a plain ``type=bool`` would treat any non-empty string as True).
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.")
parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.")
parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
args = parser.parse_args()
# yapf: enable


def train(args):
    """Train a BiGruCrf network and evaluate it on the test split each epoch.

    Builds the vocabularies, the train/test ``LacDataset`` objects and their
    data loaders, wraps the network in ``paddle.Model`` with an Adam
    optimizer, a linear-chain CRF loss and a chunk evaluator, optionally
    restores a checkpoint, and finally runs ``Model.fit``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``use_gpu``, ``word_dict_path``, ``label_dict_path``,
            ``word_rep_dict_path``, ``base_path``, ``batch_size``,
            ``max_seq_len``, ``emb_dim``, ``hidden_size``, ``base_lr``,
            ``init_checkpoint``, ``epochs`` and ``model_save_dir``.
    """
    if args.use_gpu:
        # Use the device id assigned to this process by the parallel env.
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # Load vocab to create dataset.
    word_vocab = load_kv_dict(
        args.word_dict_path, value_func=np.int64, reverse=True)
    label_vocab = load_kv_dict(
        args.label_dict_path, value_func=np.int64, reverse=True)
    word_rep_dict = load_kv_dict(args.word_rep_dict_path)
    train_dataset = LacDataset(
        args.base_path, word_vocab, label_vocab, word_rep_dict, mode='train')
    test_dataset = LacDataset(
        args.base_path, word_vocab, label_vocab, word_rep_dict, mode='test')

    # Create sampler for dataloader
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batch_padding_fn(args.max_seq_len))

    # Evaluation must see every test sample: keep the trailing partial batch.
    # With drop_last=True the final incomplete batch would be silently left
    # out of the metrics, and a test set smaller than batch_size would
    # produce no evaluation batches at all.
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batch_padding_fn(args.max_seq_len))

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size,
                       train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    # The loss shares the transition parameters owned by the network's CRF.
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((train_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    # NOTE(review): per the paddle.Model.fit documentation, batch_size,
    # drop_last and shuffle only apply when a Dataset is passed; with the
    # DataLoader instances used here they are presumably ignored and the
    # samplers above decide batching -- confirm against the installed version.
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=1,
              save_dir=args.model_save_dir,
              save_freq=1,
              verbose=2,
              drop_last=True,
              shuffle=True)


if __name__ == "__main__":
    # Script entry point: print the parsed command-line arguments, then
    # start training with them.
    print(args)
    train(args)