提交 82e1494a 编写于 作者: Z Zeyu Chen

update senta demo

上级 5bd9a50a
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=2
# User can select chnsenticorp, nlpcc_dbqa, lcqmc for different tasks # User can select chnsenticorp, nlpcc_dbqa, lcqmc for different tasks
DATASET="chnsenticorp" DATASET="chnsenticorp"
...@@ -8,4 +8,4 @@ python -u text_classifier.py \ ...@@ -8,4 +8,4 @@ python -u text_classifier.py \
--batch_size=24 \ --batch_size=24 \
--use_gpu=True \ --use_gpu=True \
--checkpoint_dir=${CKPT_DIR} \ --checkpoint_dir=${CKPT_DIR} \
--num_epoch=3 --num_epoch=10
...@@ -21,29 +21,28 @@ if __name__ == '__main__': ...@@ -21,29 +21,28 @@ if __name__ == '__main__':
# Step2: Download dataset and use TextClassificationReader to read dataset # Step2: Download dataset and use TextClassificationReader to read dataset
dataset = hub.dataset.ChnSentiCorp() dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.TextClassificationReader( reader = hub.reader.LACTokenizeReader(
dataset=dataset, vocab_path=module.get_vocab_path()) dataset=dataset, vocab_path=module.get_vocab_path())
# Step3: construct transfer learning network sent_feature = outputs["sequence_output"]
# Use "sequence_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Define a classification finetune task by PaddleHub's API # Define a classification finetune task by PaddleHub's API
cls_task = hub.create_text_cls_task( cls_task = hub.create_text_cls_task(
feature=sequence_output, num_classes=dataset.num_labels) feature=sent_feature, num_classes=dataset.num_labels)
# Setup feed list for data feeder # Setup feed list for data feeder
# Must feed all the tensors that senta's module needs # Must feed all the tensors that senta's module needs
feed_list = [inputs["words"].name, cls_task.variable('label').name] feed_list = [inputs["words"].name, cls_task.variable('label').name]
# Setup running config for PaddleHub Finetune API strategy = hub.finetune.strategy.AdamWeightDecayStrategy(
learning_rate=1e-3, weight_decay=0.01, warmup_proportion=0.01)
config = hub.RunConfig( config = hub.RunConfig(
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
batch_size=args.batch_size, batch_size=args.batch_size,
checkpoint_dir=args.checkpoint_dir, checkpoint_dir=args.checkpoint_dir,
strategy=hub.finetune.strategy.DefaultFinetuneStrategy()) strategy=strategy)
# Finetune and evaluate by PaddleHub's API # Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically # will finish training, evaluation, testing, save model automatically
......
...@@ -2,9 +2,9 @@ import paddle.fluid as fluid ...@@ -2,9 +2,9 @@ import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
module = hub.Module(name="ernie") module = hub.Module(name="ernie")
inputs, outputs, program = module.context(trainable=True) inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
reader = hub.reader.ClassifyReader(hub.dataset.ChnSentiCorp(), reader = hub.reader.ClassifyReader(
module.get_vocab_path()) hub.dataset.ChnSentiCorp(), module.get_vocab_path(), max_seq_len=128)
task = hub.create_text_cls_task(feature=outputs["pooled_output"], num_classes=2) task = hub.create_text_cls_task(feature=outputs["pooled_output"], num_classes=2)
strategy = hub.AdamWeightDecayStrategy(learning_rate=5e-5) strategy = hub.AdamWeightDecayStrategy(learning_rate=5e-5)
config = hub.RunConfig( config = hub.RunConfig(
......
...@@ -14,5 +14,5 @@ ...@@ -14,5 +14,5 @@
from .nlp_reader import ClassifyReader from .nlp_reader import ClassifyReader
from .nlp_reader import SequenceLabelReader from .nlp_reader import SequenceLabelReader
from .nlp_reader import TextClassificationReader from .nlp_reader import LACTokenizeReader
from .cv_reader import ImageClassificationReader from .cv_reader import ImageClassificationReader
...@@ -58,7 +58,6 @@ class BaseReader(object): ...@@ -58,7 +58,6 @@ class BaseReader(object):
self.current_example = 0 self.current_example = 0
self.current_epoch = 0 self.current_epoch = 0
self.num_examples = 0
self.num_examples = {'train': -1, 'dev': -1, 'test': -1} self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
...@@ -383,18 +382,47 @@ class ExtractEmbeddingReader(BaseReader): ...@@ -383,18 +382,47 @@ class ExtractEmbeddingReader(BaseReader):
return return_list return return_list
class TextClassificationReader(object): class LACTokenizeReader(object):
def __init__(self, dataset, vocab_path, do_lower_case=False): def __init__(self, dataset, vocab_path):
self.dataset = dataset self.dataset = dataset
self.lac = hub.Module(name="lac")
self.tokenizer = tokenization.FullTokenizer( self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case) vocab_file=vocab_path, do_lower_case=False)
self.vocab = self.tokenizer.vocab self.vocab = self.tokenizer.vocab
self.lac = hub.Module(name="lac")
self.feed_key = list( self.feed_key = list(
self.lac.processor.data_format( self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0] sign_name="lexical_analysis").keys())[0]
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'val', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return self.num_examples[phase]
def get_train_examples(self):
"""Gets a collection of `InputExample`s for the train set."""
return self.dataset.get_train_examples()
def get_dev_examples(self):
"""Gets a collection of `InputExample`s for the dev set."""
return self.dataset.get_dev_examples()
def get_val_examples(self):
"""Gets a collection of `InputExample`s for the val set."""
return self.dataset.get_val_examples()
def get_test_examples(self):
"""Gets a collection of `InputExample`s for prediction."""
return self.dataset.get_test_examples()
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
def data_generator(self, def data_generator(self,
batch_size=1, batch_size=1,
phase="train", phase="train",
...@@ -402,14 +430,20 @@ class TextClassificationReader(object): ...@@ -402,14 +430,20 @@ class TextClassificationReader(object):
data=None): data=None):
if phase == "train": if phase == "train":
data = self.dataset.get_train_examples() data = self.dataset.get_train_examples()
self.num_examples['train'] = len(data)
elif phase == "test": elif phase == "test":
shuffle = False shuffle = False
data = self.dataset.get_test_examples() data = self.dataset.get_test_examples()
self.num_examples['train'] = len(data)
elif phase == "val" or phase == "dev": elif phase == "val" or phase == "dev":
shuffle = False shuffle = False
data = self.dataset.get_dev_examples() data = self.dataset.get_dev_examples()
self.num_examples['test'] = len(data)
elif phase == "predict": elif phase == "predict":
data = data data = data
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def preprocess(text): def preprocess(text):
data_dict = {self.feed_key: [text]} data_dict = {self.feed_key: [text]}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册