提交 82e1494a 编写于 作者: Z Zeyu Chen

update senta demo

上级 5bd9a50a
export CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES=2
# User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task
DATASET="chnsenticorp"
......@@ -8,4 +8,4 @@ python -u text_classifier.py \
--batch_size=24 \
--use_gpu=True \
--checkpoint_dir=${CKPT_DIR} \
--num_epoch=3
--num_epoch=10
......@@ -21,29 +21,28 @@ if __name__ == '__main__':
# Step2: Download dataset and use TextClassificationReader to read dataset
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.TextClassificationReader(
reader = hub.reader.LACTokenizeReader(
dataset=dataset, vocab_path=module.get_vocab_path())
# Step3: construct transfer learning network
# Use "sequence_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
sent_feature = outputs["sequence_output"]
# Define a classification fine-tuning task with PaddleHub's API
cls_task = hub.create_text_cls_task(
feature=sequence_output, num_classes=dataset.num_labels)
feature=sent_feature, num_classes=dataset.num_labels)
# Set up the feed list for the data feeder
# Every tensor that the senta module needs must be fed
feed_list = [inputs["words"].name, cls_task.variable('label').name]
# Set up the running config for the PaddleHub Finetune API
strategy = hub.finetune.strategy.AdamWeightDecayStrategy(
learning_rate=1e-3, weight_decay=0.01, warmup_proportion=0.01)
config = hub.RunConfig(
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir,
strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
strategy=strategy)
# Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
......
......@@ -2,9 +2,9 @@ import paddle.fluid as fluid
import paddlehub as hub
module = hub.Module(name="ernie")
inputs, outputs, program = module.context(trainable=True)
reader = hub.reader.ClassifyReader(hub.dataset.ChnSentiCorp(),
module.get_vocab_path())
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
reader = hub.reader.ClassifyReader(
hub.dataset.ChnSentiCorp(), module.get_vocab_path(), max_seq_len=128)
task = hub.create_text_cls_task(feature=outputs["pooled_output"], num_classes=2)
strategy = hub.AdamWeightDecayStrategy(learning_rate=5e-5)
config = hub.RunConfig(
......
......@@ -14,5 +14,5 @@
from .nlp_reader import ClassifyReader
from .nlp_reader import SequenceLabelReader
from .nlp_reader import TextClassificationReader
from .nlp_reader import LACTokenizeReader
from .cv_reader import ImageClassificationReader
......@@ -58,7 +58,6 @@ class BaseReader(object):
self.current_example = 0
self.current_epoch = 0
self.num_examples = 0
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
......@@ -383,18 +382,47 @@ class ExtractEmbeddingReader(BaseReader):
return return_list
class TextClassificationReader(object):
def __init__(self, dataset, vocab_path, do_lower_case=False):
class LACTokenizeReader(object):
def __init__(self, dataset, vocab_path):
self.dataset = dataset
self.lac = hub.Module(name="lac")
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
vocab_file=vocab_path, do_lower_case=False)
self.vocab = self.tokenizer.vocab
self.lac = hub.Module(name="lac")
self.feed_key = list(
self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0]
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def get_num_examples(self, phase):
    """Return the number of examples recorded for a phase.

    Args:
        phase: One of 'train', 'dev', 'val' or 'test'. 'val' is treated
            as an alias of 'dev'.

    Returns:
        The value stored in ``self.num_examples`` for the phase.

    Raises:
        ValueError: If ``phase`` is not one of the accepted names.
    """
    if phase not in ['train', 'val', 'dev', 'test']:
        raise ValueError(
            "Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
        )
    # self.num_examples is keyed by 'train', 'dev' and 'test' only, so the
    # accepted alias 'val' must be mapped to 'dev' to avoid a KeyError.
    if phase == 'val':
        phase = 'dev'
    return self.num_examples[phase]
def get_train_examples(self):
"""Return the training examples by delegating to self.dataset.get_train_examples()."""
return self.dataset.get_train_examples()
def get_dev_examples(self):
"""Return the dev examples by delegating to self.dataset.get_dev_examples()."""
return self.dataset.get_dev_examples()
def get_val_examples(self):
"""Return the val examples by delegating to self.dataset.get_val_examples().

NOTE(review): this relies on the dataset object exposing get_val_examples();
confirm that every dataset passed to this reader provides it.
"""
return self.dataset.get_val_examples()
def get_test_examples(self):
"""Return the test examples by delegating to self.dataset.get_test_examples()."""
return self.dataset.get_test_examples()
def get_train_progress(self):
"""Return the training progress as the tuple (self.current_example, self.current_epoch).

NOTE(review): presumably both attributes are assigned elsewhere before this
is called; confirm, since this method only reads them.
"""
return self.current_example, self.current_epoch
def data_generator(self,
batch_size=1,
phase="train",
......@@ -402,14 +430,20 @@ class TextClassificationReader(object):
data=None):
# Pick the examples for the requested phase and record their count under the
# matching key of self.num_examples, so that each phase reports its own size.
if phase == "train":
    data = self.dataset.get_train_examples()
    self.num_examples['train'] = len(data)
elif phase == "test":
    shuffle = False
    data = self.dataset.get_test_examples()
    # Store under 'test' (not 'train') so the training count is not clobbered.
    self.num_examples['test'] = len(data)
elif phase == "val" or phase == "dev":
    shuffle = False
    data = self.dataset.get_dev_examples()
    # Store under 'dev' (not 'test'); 'val' is handled as an alias of 'dev'.
    self.num_examples['dev'] = len(data)
elif phase == "predict":
    # Prediction uses the caller-supplied `data` argument unchanged.
    pass
else:
    raise ValueError(
        "Unknown phase, which should be in ['train', 'dev', 'test'].")
def preprocess(text):
data_dict = {self.feed_key: [text]}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册