remove run_ernie_crf.py

d3029c01 · Zeyu Chen · 52ba55d0 · d3029c01 · 52ba55d0 · d3029c01
5 changed file
--- a/PaddleNLP/examples/hapi/train.py
+++ b/PaddleNLP/examples/hapi/train.py
@@ -3,7 +3,7 @@ from functools import partial
 from paddle.io import DistributedBatchSampler, DataLoader
 from paddle.static import InputSpec
 from paddlenlp.data import Stack, Tuple, Pad
-from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer, ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import ErnieTokenizer
 import numpy as np
 import paddle
 import paddlenlp
@@ -19,9 +19,12 @@ def convert_example(example, tokenizer, max_seq_length=128):


 paddle.set_device('gpu')
+# Dataset prepare
 train_ds = paddlenlp.datasets.ChnSentiCorp.get_datasets(['train'])
-label_list = train_ds.get_labels()
+
 tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
+model = paddlenlp.models.Ernie('ernie-1.0', task='seq-cls', num_classes=2)
+
 trans_func = partial(convert_example, tokenizer=tokenizer)
 train_ds = train_ds.apply(trans_func)
 batchify_fn = lambda samples, fn=Tuple(
@@ -35,8 +38,6 @@ train_loader = DataLoader(
    collate_fn=batchify_fn,
    return_list=True)

-model = paddlenlp.models.Ernie(
-    'ernie-1.0', task='seq-cls', num_classes=len(label_list))
 criterion = paddle.nn.loss.CrossEntropyLoss()
 metric = paddle.metric.Accuracy()
 optimizer = paddle.optimizer.AdamW(

--- a/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie_crf.py
+++ b/PaddleNLP/examples/named_entity_recognition/express_ner/run_ernie_crf.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-import paddle.nn as nn
-from paddlenlp.data import Stack, Tuple, Pad
-from paddlenlp.transformers import ErnieTokenizer, ErniePretrainedModel
-from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder
-from paddlenlp.metrics import ChunkEvaluator
-from paddle.static import InputSpec 
-from functools import partial
-
-
-def parse_decodes(ds, decodes, lens):
-    decodes = [x for batch in decodes for x in batch]
-    lens = [x for batch in lens for x in batch]
-    id_label = dict(zip(ds.label_vocab.values(), ds.label_vocab.keys()))
-
-    outputs = []
-    for idx, end in enumerate(lens):
-        sent = ds.word_ids[idx][:end]
-        tags = [id_label[x] for x in decodes[idx][1:end]]
-        sent_out = []
-        tags_out = []
-        words = ""
-        for s, t in zip(sent, tags):
-            if t.endswith('-B') or t == 'O':
-                if len(words):
-                    sent_out.append(words)
-                tags_out.append(t.split('-')[0])
-                words = s
-            else:
-                words += s
-        if len(sent_out) < len(tags_out):
-            sent_out.append(words)
-        outputs.append(''.join([str((s, t)) for s, t in zip(sent_out, tags_out)]))
-    return outputs
-
- 
-def convert_example(example, tokenizer, label_vocab):
-    tokens, labels = example
-    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-    segment_ids = [0] * len(tokens)
-    lens = len(input_ids)
-    labels = ['O'] + labels + ['O']
-    labels = [label_vocab[x] for x in labels]
-    return input_ids, segment_ids, lens, labels
- 
- 
-def load_dict(dict_path):
-    vocab = {}
-    for line in open(dict_path, 'r', encoding='utf-8'):
-        value, key = line.strip('\n').split('\t')
-        vocab[key] = int(value)
-    return vocab
- 
- 
-class ExpressDataset(paddle.io.Dataset):
-    def __init__(self, data_path):
-        self.word_vocab = load_dict('./conf/word.dic')
-        self.label_vocab = load_dict('./conf/tag.dic')
-        self.word_ids = []
-        self.label_ids = []
-        with open(data_path, 'r', encoding='utf-8') as fp:
-            next(fp)
-            for line in fp.readlines():
-                words, labels = line.strip('\n').split('\t')
-                words = words.split('\002')
-                labels = labels.split('\002')
-                self.word_ids.append(words)
-                self.label_ids.append(labels)
-        self.word_num = max(self.word_vocab.values()) + 1
-        self.label_num = max(self.label_vocab.values()) + 1
-    def __len__(self):
-        return len(self.word_ids)
-    def __getitem__(self, index):
-        return self.word_ids[index], self.label_ids[index]
- 
- 
-class ErnieForTokenClassification(ErniePretrainedModel):
-    def __init__(self, ernie, num_classes=2, dropout=None):
-        super(ErnieForTokenClassification, self).__init__()
-        self.num_classes = num_classes
-        self.ernie = ernie
-        self.dropout = nn.Dropout(self.ernie.config["hidden_dropout_prob"])
-        self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes)
-        self.apply(self.init_weights)
- 
-    def forward(self,
-                input_ids,
-                token_type_ids=None,
-                lens=None,
-                position_ids=None,
-                attention_mask=None):
-        sequence_output, _ = self.ernie(
-            input_ids,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask)
- 
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-        return logits, lens, paddle.argmax(logits, axis=-1) 
-
-
-class ErnieCRF(ErnieForTokenClassification):
-    def __init__(self, ernie, num_classes=2, crf_lr=1.0, dropout=None):
-        super(ErnieCRF, self).__init__(ernie, num_classes, dropout)
-        self.crf = LinearChainCrf(num_classes, crf_lr, False)
-
-
-if __name__ == '__main__':
-    paddle.set_device('gpu')
- 
-    train_ds = ExpressDataset('./data/train.txt')
-    dev_ds = ExpressDataset('./data/dev.txt')
-    test_ds = ExpressDataset('./data/test.txt')
- 
-    tokenizer = ErnieTokenizer.from_pretrained('ernie')
-    trans_func = partial(
-        convert_example,
-        tokenizer=tokenizer,
-        label_vocab=train_ds.label_vocab)
- 
-    ignore_label = -100
-    batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
-        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
-        Stack(),
-        Pad(axis=0, pad_val=ignore_label)
-    ): fn(list(map(trans_func, samples)))
- 
-    train_loader = paddle.io.DataLoader(
-        dataset=train_ds,
-        batch_size=200,
-        shuffle=True,
-        return_list=True,
-        collate_fn=batchify_fn)
-    dev_loader = paddle.io.DataLoader(
-        dataset=dev_ds,
-        batch_size=200,
-        return_list=True,
-        collate_fn=batchify_fn)
-    test_loader = paddle.io.DataLoader(
-        dataset=test_ds,
-        batch_size=200,
-        return_list=True,
-        collate_fn=batchify_fn)
-        
-    model = ErnieCRF.from_pretrained(
-        'ernie', num_classes=train_ds.label_num)
-    loss = LinearChainCrfLoss(transitions=model.crf.transitions)
-    decoder = ViterbiDecoder(transitions=model.crf.transitions) 
-    metric = ChunkEvaluator((train_ds.label_num + 2) // 2, "IOB")
-    inputs = [InputSpec([None, None], dtype='int64', name='input_ids'), 
-              InputSpec([None, None], dtype='int64', name='token_type_ids'), 
-              InputSpec([None, None], dtype='int64', name='lens')]
- 
-    model = paddle.Model(model, inputs)
- 
-    optimizer = paddle.optimizer.AdamW(learning_rate=2e-5,parameters=model.parameters())
- 
-    model.prepare(optimizer, loss, metric) 
-    model.fit(train_data=train_loader,
-              eval_data=dev_loader,
-              epochs=10,
-              save_dir='./results',
-              log_freq=1,
-              save_freq=10000)
-
-    model.evaluate(eval_data=test_loader)
-    outputs, lens, decodes = model.predict(test_data=test_loader)
-    pred = parse_decodes(test_ds, decodes, lens)
-    print('\n'.join(pred[:10]))
--- a/PaddleNLP/paddlenlp/__init__.py
+++ b/PaddleNLP/paddlenlp/__init__.py
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = '2.0.0a0'
+__version__ = '2.0.0a2'

 from . import data
 from . import datasets
 from . import embeddings
 from . import layers
-#from . import losses
 from . import metrics
 from . import models
 from . import seq2vec

--- a/PaddleNLP/paddlenlp/layers/biaffine.py
+++ b/PaddleNLP/paddlenlp/layers/biaffine.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class BiAffine(nn.Layer):
-    # TODO(qiujinxuan)
-    pass
--- a/PaddleNLP/paddlenlp/models/ernie.py
+++ b/PaddleNLP/paddlenlp/models/ernie.py
@@ -17,7 +17,6 @@ import paddle.nn as nn
 import paddle.nn.functional as F

 from paddlenlp.transformers import *
-import paddlenlp as nlp


 class Ernie(nn.Layer):