#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import ast
import math
import os

import numpy as np
import paddle
from paddlenlp.data import Pad, Tuple, Stack
from paddlenlp.layers.crf import LinearChainCrfLoss, ViterbiDecoder
from paddlenlp.metrics import ChunkEvaluator

from data import LacDataset
from model import BiGruCrf

# Command-line interface. Arguments are parsed at import time and exposed as
# the module-level ``args`` namespace, which the ``__main__`` guard hands to
# ``train``.
# yapf: disable
# ``__doc__`` is passed by keyword: the first positional parameter of
# ArgumentParser is ``prog``, so a positional docstring would replace the
# program name in usage output instead of becoming the description.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--data_dir", type=str, default=None, help="The folder where the dataset is located.")
parser.add_argument("--init_checkpoint", type=str, default=None, help="Path to init model.")
parser.add_argument("--model_save_dir", type=str, default=None, help="The model will be saved in this path.")
parser.add_argument("--epochs", type=int, default=10, help="Corpus iteration num.")
parser.add_argument("--batch_size", type=int, default=300, help="The number of sequences contained in a mini-batch.")
# NOTE(review): ``max_seq_len`` is not read by ``train`` in this file.
parser.add_argument("--max_seq_len", type=int, default=64, help="Number of words of the longest seqence.")
# The value is evaluated with ``ast.literal_eval``, so it must be spelled as a
# Python literal on the command line (e.g. ``--use_gpu False``).
parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="If set, use GPU for training.")
parser.add_argument("--base_lr", type=float, default=0.001, help="The basic learning rate that affects the entire network.")
parser.add_argument("--emb_dim", type=int, default=128, help="The dimension in which a word is embedded.")
parser.add_argument("--hidden_size", type=int, default=128, help="The number of hidden nodes in the GRU layer.")
args = parser.parse_args()
# yapf: enable


def train(args):
    """Train a BiGruCrf network and evaluate it on the test split.

    Builds the train/test ``LacDataset`` objects and their data loaders,
    wraps a ``BiGruCrf`` network in ``paddle.Model`` with an Adam optimizer,
    a linear-chain CRF loss and a chunk evaluator, optionally restores
    ``args.init_checkpoint``, and then calls ``model.fit``.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``use_gpu``, ``data_dir``, ``batch_size``, ``emb_dim``,
            ``hidden_size``, ``base_lr``, ``init_checkpoint``, ``epochs``
            and ``model_save_dir``.
    """
    if args.use_gpu:
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # Create the datasets.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    # Collate a list of samples into a batch; the ``Tuple`` collator is built
    # once and bound as a default argument so it is reused for every batch.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create the sampler and loader for training.
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Evaluation must see every test sample: keep the final partial batch,
    # otherwise up to ``batch_size - 1`` samples are silently excluded from
    # the reported metrics.
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss.
    network = BiGruCrf(args.emb_dim, args.hidden_size, train_dataset.vocab_size,
                       train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator.
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training.
    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    # NOTE(review): the paddle.Model.fit documentation describes batch_size,
    # drop_last and shuffle as ignored when DataLoader instances are passed;
    # they are kept unchanged here -- confirm against the installed version.
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              drop_last=True,
              shuffle=True,
              callbacks=callback)


def _main():
    """Script entry point: echo the parsed arguments, then run training."""
    print(args)
    train(args)


if __name__ == "__main__":
    _main()