diff --git a/demo/ernie-classification/question_answering.py b/demo/ernie-classification/question_answering.py
index b09ee78256050d122fbdb5057de399e2aa05349b..a794a1739b8211fdc93c48404b2abfbb132117ed 100644
--- a/demo/ernie-classification/question_answering.py
+++ b/demo/ernie-classification/question_answering.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load PaddleHub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.NLPCC_DBQA()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.NLPCC_DBQA(),  # download NLPCC_DBQA dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())

-    input_dict, output_dict, program = module.context(
-        sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]
         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+    # Step4: Select finetune strategy, setup config and finetune
+    strategy = hub.BERTFinetuneStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        warmup_strategy="linear_warmup_decay",
+    )
+
+    # Setup running config for PaddleHub Finetune API
+    config = hub.RunConfig(
+        use_cuda=True,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
+        strategy=strategy)
+
     # Finetune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
     hub.finetune_and_eval(
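The question_answering.py hunk above ends mid-call. As a reading aid, here is a minimal sketch of how these demos typically complete the Finetune API invocation; the keyword names are assumptions inferred from the variables built in Steps 1-4, not lines from this patch:

    import paddlehub as hub  # assumed import, already present in the demo script

    # Hedged sketch: hand the task, reader, feed list and run config from
    # Steps 1-4 to PaddleHub's Finetune API; keyword names are assumptions.
    hub.finetune_and_eval(
        task=cls_task,        # head built by create_text_classification_task
        data_reader=reader,   # ClassifyReader wrapping the NLPCC_DBQA dataset
        feed_list=feed_list,  # names of the tensors the ERNIE module is fed
        config=config)        # RunConfig carrying strategy, epochs, batch size

The same completion applies to the question_matching.py and sentiment_cls.py diffs below, which differ only in their dataset.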
diff --git a/demo/ernie-classification/question_matching.py b/demo/ernie-classification/question_matching.py
index 64ee1f79b3a03edcd013a82bf6254a3580c742c4..b288572409d221e3b7d787ad0b5ed62c9a8df72d 100644
--- a/demo/ernie-classification/question_matching.py
+++ b/demo/ernie-classification/question_matching.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load PaddleHub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.LCQMC()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.LCQMC(),  # download LCQMC dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())

-    input_dict, output_dict, program = module.context(
-        sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]
         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+    # Step4: Select finetune strategy, setup config and finetune
+    strategy = hub.BERTFinetuneStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        warmup_strategy="linear_warmup_decay",
+    )
+
+    # Setup running config for PaddleHub Finetune API
+    config = hub.RunConfig(
+        use_cuda=True,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
+        strategy=strategy)
+
     # Finetune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
     hub.finetune_and_eval(
diff --git a/demo/ernie-classification/sentiment_cls.py b/demo/ernie-classification/sentiment_cls.py
index c5fd6d496b8a23dcde8b285612116ed7b4149508..3d2c5b82ed17501f7017f5862c67dc73e772d40f 100644
--- a/demo/ernie-classification/sentiment_cls.py
+++ b/demo/ernie-classification/sentiment_cls.py
@@ -40,54 +40,52 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        checkpoint_dir=args.checkpoint_dir,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load PaddleHub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sentence classification dataset reader
+    # Step2: Download dataset and use ClassifyReader to read dataset
+    dataset = hub.dataset.ChnSentiCorp()
     reader = hub.reader.ClassifyReader(
-        dataset=hub.dataset.ChnSentiCorp(),  # download chnsenticorp dataset
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-
     num_labels = len(reader.get_labels())

-    input_dict, output_dict, program = module.context(
-        sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(name="label", shape=[1], dtype='int64')

         # Use "pooled_output" for classification tasks on an entire sentence.
-        # Use "sequence_outputs" for token-level output.
-        pooled_output = output_dict["pooled_output"]
+        # Use "sequence_output" for token-level output.
+        pooled_output = outputs["pooled_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name
         ]
         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
             pooled_output, label, num_classes=num_labels)

+    # Step4: Select finetune strategy, setup config and finetune
+    strategy = hub.BERTFinetuneStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        warmup_strategy="linear_warmup_decay",
+    )
+
+    # Setup running config for PaddleHub Finetune API
+    config = hub.RunConfig(
+        use_cuda=True,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
+        strategy=strategy)
+
     # Finetune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
     hub.finetune_and_eval(
diff --git a/demo/ernie-seq-labeling/run_fintune_with_hub.sh b/demo/ernie-seq-labeling/run_sequence_labeling.sh
similarity index 79%
rename from demo/ernie-seq-labeling/run_fintune_with_hub.sh
rename to demo/ernie-seq-labeling/run_sequence_labeling.sh
index 92bafcb75693600e2ccf4ea3deba458362e911f5..89aa22f683f97aedb14d37d46470d12fe0176651 100644
--- a/demo/ernie-seq-labeling/run_fintune_with_hub.sh
+++ b/demo/ernie-seq-labeling/run_sequence_labeling.sh
@@ -1,8 +1,8 @@
 export CUDA_VISIBLE_DEVICES=0

-CKPT_DIR="./ckpt"
+CKPT_DIR="./ckpt_sequence_labeling"

-python -u finetune_with_hub.py \
+python -u sequence_labeling.py \
     --batch_size 16 \
     --weight_decay 0.01 \
     --checkpoint_dir $CKPT_DIR \
diff --git a/demo/ernie-seq-labeling/finetune_with_hub.py b/demo/ernie-seq-labeling/sequence_labeling.py
similarity index 73%
rename from demo/ernie-seq-labeling/finetune_with_hub.py
rename to demo/ernie-seq-labeling/sequence_labeling.py
index 608d0e0fc1bb49972acc8fe46d483c659e9518a8..381198a0cb0c225156d354ec1099cbea7f085311 100644
--- a/demo/ernie-seq-labeling/finetune_with_hub.py
+++ b/demo/ernie-seq-labeling/sequence_labeling.py
@@ -40,35 +40,21 @@ args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Select a finetune strategy
-    strategy = hub.BERTFinetuneStrategy(
-        weight_decay=args.weight_decay,
-        learning_rate=args.learning_rate,
-        warmup_strategy="linear_warmup_decay",
-    )
-
-    # Setup runing config for PaddleHub Finetune API
-    config = hub.RunConfig(
-        eval_interval=100,
-        use_cuda=True,
-        num_epoch=args.num_epoch,
-        batch_size=args.batch_size,
-        strategy=strategy)
-
-    # loading Paddlehub ERNIE pretrained model
+    # Step1: load PaddleHub ERNIE pretrained model
     module = hub.Module(name="ernie")
+    inputs, outputs, program = module.context(
+        trainable=True, max_seq_len=args.max_seq_len)

-    # Sequence Label dataset reader
+    # Step2: Download dataset and use SequenceLabelReader to read dataset
+    dataset = hub.dataset.MSRA_NER()
     reader = hub.reader.SequenceLabelReader(
-        dataset=hub.dataset.MSRA_NER(),
+        dataset=dataset,
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)

     num_labels = len(reader.get_labels())

-    input_dict, output_dict, program = module.context(
-        sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
-
+    # Step3: construct transfer learning network
     with fluid.program_guard(program):
         label = fluid.layers.data(
             name="label", shape=[args.max_seq_len, 1], dtype='int64')
@@ -76,14 +62,15 @@ if __name__ == '__main__':

         # Use "pooled_output" for classification tasks on an entire sentence.
         # Use "sequence_output" for token-level output.
-        sequence_output = output_dict["sequence_output"]
+        sequence_output = outputs["sequence_output"]

         # Setup feed list for data feeder
         # Must feed all the tensor of ERNIE's module need
+        # Compared to the classification task, we need to add the seq_len tensor to the feed list
         feed_list = [
-            input_dict["input_ids"].name, input_dict["position_ids"].name,
-            input_dict["segment_ids"].name, input_dict["input_mask"].name,
-            label.name, seq_len
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name, label.name,
+            seq_len
         ]
         # Define a sequence labeling finetune task by PaddleHub's API
         seq_label_task = hub.create_seq_labeling_task(
@@ -92,6 +79,19 @@ if __name__ == '__main__':
             seq_len=seq_len,
             num_classes=num_labels)

+    # Select a finetune strategy
+    strategy = hub.BERTFinetuneStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        warmup_strategy="linear_warmup_decay",
+    )
+
+    # Setup running config for PaddleHub Finetune API
+    config = hub.RunConfig(
+        use_cuda=True,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        strategy=strategy)
     # Finetune and evaluate model by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
     hub.finetune_and_eval(
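Note that seq_len is used in the feed list and in create_seq_labeling_task above, but its definition lives in unchanged context outside the hunks shown. A plausible reconstruction, mirroring the fluid.layers.data pattern used for the "label" placeholder; the shape and dtype are assumptions for illustration, not lines from this patch:

    import paddle.fluid as fluid  # assumed import, already present in the demo script

    # One sequence-length value per example, letting the task mask out padding
    # tokens; shape and dtype are assumed to mirror the "label" placeholder.
    seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64')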
diff --git a/paddlehub/module/base_processor.py b/paddlehub/module/base_processor.py
index 72da4a8ca621cabd6f14d512773fd880b914608e..1d6b7fb3c2e9a293c3cb17644ef991f9d6d5d740 100644
--- a/paddlehub/module/base_processor.py
+++ b/paddlehub/module/base_processor.py
@@ -23,12 +23,12 @@ class BaseProcessor(object):

     def preprocess(self, sign_name, data_dict):
         raise NotImplementedError(
-            "BaseProcessor' preprocess should not be call!")
+            "BaseProcessor's preprocess should not be called!")

     def postprocess(self, sign_name, data_out, data_info, **kwargs):
         raise NotImplementedError(
-            "BaseProcessor' postprocess should not be call!")
+            "BaseProcessor's postprocess should not be called!")

     def data_format(self, sign_name):
         raise NotImplementedError(
-            "BaseProcessor' data_format should not be call!")
+            "BaseProcessor's data_format should not be called!")
diff --git a/paddlehub/module/module.py b/paddlehub/module/module.py
index 4290c43fc4ec55b83e4374748fb539512fa66068..c5512d5165f1d2d9ad96355a4a995e610cc0aa2e 100644
--- a/paddlehub/module/module.py
+++ b/paddlehub/module/module.py
@@ -119,7 +119,7 @@ class Module(object):
         if processor:
             if not issubclass(processor, BaseProcessor):
                 raise TypeError(
-                    "processor shoule be an instance of paddlehub.BaseProcessor"
+                    "Processor should be a subclass of paddlehub.BaseProcessor"
                 )
         if assets:
             self.assets = utils.to_list(assets)
@@ -129,10 +129,10 @@ class Module(object):
             self._generate_module_info(module_info)
             self._init_with_signature(signatures=signatures)
         else:
-            raise ValueError("Error! Module initialized parameter is empty")
+            raise ValueError("Module initialization parameters are empty")

     def _init_with_name(self, name):
-        logger.info("Try installing module %s" % name)
+        logger.info("Installing %s module" % name)
         result, tips, module_dir = default_module_manager.install_module(
             module_name=name)
         if not result:
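Because all three BaseProcessor methods raise NotImplementedError, the class acts as an abstract base: a module's processor must override preprocess, postprocess and data_format. A minimal sketch of a conforming subclass; the class name, method bodies and format schema are hypothetical, and only the three overridden method signatures come from the code above:

    import paddlehub

    class LowercaseProcessor(paddlehub.BaseProcessor):
        """Hypothetical processor that lowercases text inputs."""

        def preprocess(self, sign_name, data_dict):
            # Normalize every input string for the given signature.
            return {key: [text.lower() for text in texts]
                    for key, texts in data_dict.items()}

        def postprocess(self, sign_name, data_out, data_info, **kwargs):
            # Pass the model's raw outputs through unchanged.
            return data_out

        def data_format(self, sign_name):
            # Describe the expected input layout; this schema is illustrative.
            return {"text": {"type": "str"}}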