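# train_dev.py
# Fine-tunes ERNIE 1.0 for sentence-level sentiment classification on the
# ChnSentiCorp dataset using PaddleNLP and the high-level paddle.Model API.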
from functools import partial

from paddle.io import DistributedBatchSampler, DataLoader
from paddle.static import InputSpec
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import ErnieTokenizer
import numpy as np
import paddle
import paddlenlp


def convert_example(example, tokenizer, max_seq_length=128):
    """Tokenize a (text, label) pair into model inputs.

    Returns the token ids, the segment (token type) ids, and the label as an
    int64 numpy array.
    """
    text, label = example
    encoded_inputs = tokenizer.encode(text, max_seq_len=max_seq_length)
    input_ids = encoded_inputs["input_ids"]
    segment_ids = encoded_inputs["segment_ids"]
    label = np.array([label], dtype="int64")
    return input_ids, segment_ids, label


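# Select the GPU device (use 'cpu' if no GPU is available).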
paddle.set_device('gpu')
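
# Load the train and dev splits of ChnSentiCorp, a Chinese sentiment
# classification dataset.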
train_ds, dev_ds = paddlenlp.datasets.ChnSentiCorp.get_datasets(
    ['train', 'dev'])
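
# The class labels and the tokenizer matching the pretrained ERNIE 1.0 model.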
label_list = train_ds.get_labels()
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
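
# Convert every (text, label) example in both splits into model inputs.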
trans_func = partial(convert_example, tokenizer=tokenizer)
train_ds = train_ds.apply(trans_func)
dev_ds = dev_ds.apply(trans_func)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pad input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # pad segment_ids
    Stack(dtype="int64")  # stack labels
): [data for data in fn(samples)]
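
# Batch the training set with a DistributedBatchSampler so the data is sharded
# across trainers when launched with multiple processes; the dev set is read
# sequentially without shuffling.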
batch_sampler = DistributedBatchSampler(train_ds, batch_size=32, shuffle=True)
train_loader = DataLoader(
    dataset=train_ds,
    batch_sampler=batch_sampler,
    collate_fn=batchify_fn,
    return_list=True)
dev_loader = DataLoader(
    dataset=dev_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=batchify_fn,
    return_list=True)

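# ERNIE 1.0 with a sequence-classification head sized to the number of labels.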
model = paddlenlp.models.Ernie(
    'ernie-1.0', task='seq-cls', num_classes=len(label_list))
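
# Standard fine-tuning setup: cross-entropy loss, accuracy metric, and AdamW.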
criterion = paddle.nn.CrossEntropyLoss()
metric = paddle.metric.Accuracy()
optimizer = paddle.optimizer.AdamW(
    learning_rate=5e-5, parameters=model.parameters())

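# Declare the shapes and dtypes of the two model inputs (token ids and token
# type ids) for the high-level paddle.Model API.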
inputs = [
    InputSpec([None, 128], dtype='int64', name='input_ids'),
    InputSpec([None, 128], dtype='int64', name='token_type_ids'),
]
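
# Wrap the network and fine-tune for 3 epochs, evaluating on the dev set
# after each epoch.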
trainer = paddle.Model(model, inputs)
trainer.prepare(optimizer, criterion, metric)
trainer.fit(train_loader, dev_loader, batch_size=32, epochs=3)