From 6985001a1ee984e54a4a69168086f2a505ade547 Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Thu, 11 Apr 2019 17:39:04 +0800
Subject: [PATCH] reorganize code structure

---
 .../question_answering.py                     | 54 +++++++++----------
 .../ernie-classification/question_matching.py | 54 +++++++++----------
 demo/ernie-classification/sentiment_cls.py    | 54 +++++++++----------
 ...e_with_hub.sh => run_sequence_labeling.sh} |  4 +-
 ...etune_with_hub.py => sequence_labeling.py} | 50 ++++++++---------
 5 files changed, 105 insertions(+), 111 deletions(-)
 rename demo/ernie-seq-labeling/{run_fintune_with_hub.sh => run_sequence_labeling.sh} (79%)
 rename demo/ernie-seq-labeling/{finetune_with_hub.py => sequence_labeling.py} (74%)

diff --git a/demo/ernie-classification/question_answering.py b/demo/ernie-classification/question_answering.py
index b53b7412..a794a173 100644
--- a/demo/ernie-classification/question_answering.py
+++ b/demo/ernie-classification/question_answering.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load Paddlehub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.NLPCC_DBQA()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.NLPCC_DBQA(),  # download NLPCC_DBQA dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())
-    input_dict, output_dict, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]

         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+        # Step4: Select finetune strategy, setup config and finetune
+        strategy = hub.BERTFinetuneStrategy(
+            weight_decay=args.weight_decay,
+            learning_rate=args.learning_rate,
+            warmup_strategy="linear_warmup_decay",
+        )
+
+        # Setup running config for PaddleHub Finetune API
+        config = hub.RunConfig(
+            use_cuda=True,
+            num_epoch=args.num_epoch,
+            batch_size=args.batch_size,
+            checkpoint_dir=args.checkpoint_dir,
+            strategy=strategy)
+
         # Finetune and evaluate by PaddleHub's API
         # will finish training, evaluation, testing, save model automatically
         hub.finetune_and_eval(
diff --git a/demo/ernie-classification/question_matching.py b/demo/ernie-classification/question_matching.py
index c5cc2d71..b2885724 100644
--- a/demo/ernie-classification/question_matching.py
+++ b/demo/ernie-classification/question_matching.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load Paddlehub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.LCQMC()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.LCQMC(),  # download LCQMC dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())
-    input_dict, output_dict, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]

         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+        # Step4: Select finetune strategy, setup config and finetune
+        strategy = hub.BERTFinetuneStrategy(
+            weight_decay=args.weight_decay,
+            learning_rate=args.learning_rate,
+            warmup_strategy="linear_warmup_decay",
+        )
+
+        # Setup running config for PaddleHub Finetune API
+        config = hub.RunConfig(
+            use_cuda=True,
+            num_epoch=args.num_epoch,
+            batch_size=args.batch_size,
+            checkpoint_dir=args.checkpoint_dir,
+            strategy=strategy)
+
         # Finetune and evaluate by PaddleHub's API
         # will finish training, evaluation, testing, save model automatically
         hub.finetune_and_eval(
diff --git a/demo/ernie-classification/sentiment_cls.py b/demo/ernie-classification/sentiment_cls.py
index 1da03f56..3d2c5b82 100644
--- a/demo/ernie-classification/sentiment_cls.py
+++ b/demo/ernie-classification/sentiment_cls.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load Paddlehub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.ChnSentiCorp()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.ChnSentiCorp(),  # download chnsenticorp dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())
-    input_dict, output_dict, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]

         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+        # Step4: Select finetune strategy, setup config and finetune
+        strategy = hub.BERTFinetuneStrategy(
+            weight_decay=args.weight_decay,
+            learning_rate=args.learning_rate,
+            warmup_strategy="linear_warmup_decay",
+        )
+
+        # Setup running config for PaddleHub Finetune API
+        config = hub.RunConfig(
+            use_cuda=True,
+            num_epoch=args.num_epoch,
+            batch_size=args.batch_size,
+            checkpoint_dir=args.checkpoint_dir,
+            strategy=strategy)
+
         # Finetune and evaluate by PaddleHub's API
         # will finish training, evaluation, testing, save model automatically
         hub.finetune_and_eval(
diff --git a/demo/ernie-seq-labeling/run_fintune_with_hub.sh b/demo/ernie-seq-labeling/run_sequence_labeling.sh
similarity index 79%
rename from demo/ernie-seq-labeling/run_fintune_with_hub.sh
rename to demo/ernie-seq-labeling/run_sequence_labeling.sh
index 92bafcb7..89aa22f6 100644
--- a/demo/ernie-seq-labeling/run_fintune_with_hub.sh
+++ b/demo/ernie-seq-labeling/run_sequence_labeling.sh
@@ -1,8 +1,8 @@
 export CUDA_VISIBLE_DEVICES=0

-CKPT_DIR="./ckpt"
+CKPT_DIR="./ckpt_sequence_labeling"

-python -u finetune_with_hub.py \
+python -u sequence_labeling.py \
     --batch_size 16 \
     --weight_decay 0.01 \
     --checkpoint_dir $CKPT_DIR \
diff --git a/demo/ernie-seq-labeling/finetune_with_hub.py b/demo/ernie-seq-labeling/sequence_labeling.py
similarity index 74%
rename from demo/ernie-seq-labeling/finetune_with_hub.py
rename to demo/ernie-seq-labeling/sequence_labeling.py
index 1424fdbc..381198a0 100644
--- a/demo/ernie-seq-labeling/finetune_with_hub.py
+++ b/demo/ernie-seq-labeling/sequence_labeling.py
@@ -40,35 +40,21 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load Paddlehub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sequence Label dataset reader
+    # Step2: Download dataset and use SequenceLabelReader to read dataset
+    dataset = hub.dataset.MSRA_NER()
     reader = hub.reader.SequenceLabelReader(
-        dataset=hub.dataset.MSRA_NER(),
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)

     num_labels = len(reader.get_labels())
-    input_dict, output_dict, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(
             name="label", shape=[args.max_seq_len, 1], dtype='int64')

@@ -76,14 +62,15 @@ if __name__ == '__main__':

         # Use "pooled_output" for classification tasks on an entire sentence.
         # Use "sequence_output" for token-level output.
-        sequence_output = output_dict["sequence_output"]
+        sequence_output = outputs["sequence_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
+        # Compared with the classification task, we need to add the seq_len tensor to the feed list
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name, seq_len
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name,
+            seq_len
         ]
         # Define a sequence labeling finetune task by PaddleHub's API
         seq_label_task = hub.create_seq_labeling_task(
@@ -92,6 +79,19 @@ if __name__ == '__main__':
             seq_len=seq_len,
             num_classes=num_labels)

+        # Select a finetune strategy
+        strategy = hub.BERTFinetuneStrategy(
+            weight_decay=args.weight_decay,
+            learning_rate=args.learning_rate,
+            warmup_strategy="linear_warmup_decay",
+        )
+
+        # Setup running config for PaddleHub Finetune API
+        config = hub.RunConfig(
+            use_cuda=True,
+            num_epoch=args.num_epoch,
+            batch_size=args.batch_size,
+            strategy=strategy)
         # Finetune and evaluate model by PaddleHub's API
         # will finish training, evaluation, testing, save model automatically
         hub.finetune_and_eval(
-- 
GitLab
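
Note: after this patch, the four Python demos share the same four-step skeleton (Step1 load the pretrained module, Step2 build a reader over the dataset, Step3 construct the transfer learning network, Step4 pick a strategy and finetune). The sketch below assembles those steps into one minimal script, following the sentiment_cls.py variant. It is only a sketch: the hardcoded hyperparameter values stand in for the demos' argparse flags, and the keyword arguments to hub.finetune_and_eval are an assumption, since every hunk above truncates that call at the opening parenthesis.

import paddle.fluid as fluid
import paddlehub as hub

if __name__ == '__main__':
    # Step1: load PaddleHub's pretrained ERNIE module and get its program
    module = hub.Module(name="ernie")
    inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

    # Step2: download the dataset and wrap it with ClassifyReader
    dataset = hub.dataset.ChnSentiCorp()
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128)
    num_labels = len(reader.get_labels())

    # Step3: construct the transfer learning network on top of ERNIE
    with fluid.program_guard(program):
        label = fluid.layers.data(name="label", shape=[1], dtype='int64')
        # "pooled_output" is the sentence-level feature; a token-level task
        # such as sequence labeling would take outputs["sequence_output"]
        pooled_output = outputs["pooled_output"]
        feed_list = [
            inputs["input_ids"].name, inputs["position_ids"].name,
            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
        ]
        cls_task = hub.create_text_classification_task(
            pooled_output, label, num_classes=num_labels)

        # Step4: select a finetune strategy, set up the run config, finetune
        strategy = hub.BERTFinetuneStrategy(
            weight_decay=0.01,  # illustrative values; the demos read argparse flags
            learning_rate=5e-5,
            warmup_strategy="linear_warmup_decay")
        config = hub.RunConfig(
            use_cuda=True,
            num_epoch=3,
            batch_size=32,
            checkpoint_dir="./ckpt_sentiment_cls",
            strategy=strategy)
        # keyword names below are assumed; the hunks above cut off after "("
        hub.finetune_and_eval(
            task=cls_task,
            data_reader=reader,
            feed_list=feed_list,
            config=config)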