diff --git a/demo/sequence-labeling/sequence_label.py b/demo/sequence-labeling/sequence_label.py
index 290ecb6ef440d893e41df7c17c5b3c66370aa7be..03bfb96702f6456fef564c576334e0630e0200ce 100644
--- a/demo/sequence-labeling/sequence_label.py
+++ b/demo/sequence-labeling/sequence_label.py
@@ -21,12 +21,13 @@ import paddlehub as hub
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
+parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
-parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
+parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
 parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
-
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 args = parser.parse_args()
 # yapf: enable.
@@ -76,7 +77,7 @@ if __name__ == '__main__':
 
     # Setup runing config for PaddleHub Finetune API
    config = hub.RunConfig(
-        use_cuda=True,
+        use_cuda=args.use_gpu,
         num_epoch=args.num_epoch,
         batch_size=args.batch_size,
         checkpoint_dir=args.checkpoint_dir,
diff --git a/demo/text-classification/README.md b/demo/text-classification/README.md
index 2d073e388921cf0eb40885160c5a99f088123b56..53f649e378eb925bb857b47482adf2870a55b30b 100644
--- a/demo/text-classification/README.md
+++ b/demo/text-classification/README.md
@@ -27,7 +27,7 @@
 
 # Task-related
 --checkpoint_dir: model checkpoint directory; PaddleHub automatically saves the model that performs best on the validation set
---dataset: three options are available, corresponding to 3 different classification tasks: chnsenticorp, lcqmc, nlpcc_dbqa 
+--dataset: three options are available, corresponding to 3 different classification tasks: chnsenticorp, lcqmc, nlpcc_dbqa
 ```
 
 ## Code steps
@@ -78,18 +78,18 @@ reader = hub.reader.ClassifyReader(
 `max_seq_len` must match the sequence length passed to the context interface in Step1
 
-The `data_generator` of ClassifyReader automatically tokenizes the data with the model's vocabulary and returns, as an iterator, the Tensor format required by ERNIE/BERT, including `input_ids`, `position_ids`, `segment_id` and the sequence mask `input_mask`. 
+The `data_generator` of ClassifyReader automatically tokenizes the data with the model's vocabulary and returns, as an iterator, the Tensor format required by ERNIE/BERT, including `input_ids`, `position_ids`, `segment_id` and the sequence mask `input_mask`.
 
 **NOTE**: The order of the tensors returned by the Reader is fixed; by default they are returned in the order input_ids, position_ids, segment_id, input_mask.
 
 ### Step3: Build the network and create the classification transfer task
 
 ```python
 # NOTE: the pretrained model program returned by the Module must be passed in via the fluid.program_guard interface
-with fluid.program_guard(program): 
+with fluid.program_guard(program):
     label = fluid.layers.data(name="label", shape=[1], dtype='int64')
 
     pooled_output = outputs["pooled_output"]
-
+
     # The order of Tensors in feed_list must not be changed
     feed_list = [
         inputs["input_ids"].name, inputs["position_ids"].name,
@@ -149,8 +149,10 @@ python cls_predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128
 ```
 Here CKPT_DIR is the path where the Finetune API saved the best model, and max_seq_len is the maximum sequence length of the ERNIE model; *keep this consistent with the value configured during training*
 
-Once the parameters are configured correctly, run the script `sh run_predict.sh` to see the text classification predictions shown below. See `cls_predict.py` for more details on the prediction steps.
+Once the parameters are configured correctly, run the script `sh run_predict.sh` to see the text classification predictions shown below, as well as the final accuracy.
+See `cls_predict.py` for more details on the prediction steps.
 
 ```
 text=键盘缝隙大进灰,装系统自己不会装,屏幕有点窄玩游戏人物有点变形 label=0 predict=0
+accuracy = 0.954267
 ```
diff --git a/demo/text-classification/run_classifier.sh b/demo/text-classification/run_classifier.sh
index 9aaa61a5de0146926c27779c0cece85ab3eab0d3..d8cfe11e81a6b4aed96798447bc42868e24fe84e 100644
--- a/demo/text-classification/run_classifier.sh
+++ b/demo/text-classification/run_classifier.sh
@@ -15,5 +15,5 @@ python -u text_classifier.py \
     --checkpoint_dir=${CKPT_DIR} \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
-    --max_seq_len=128
-    --num_epoch=3 \
+    --max_seq_len=128 \
+    --num_epoch=3
diff --git a/demo/text-classification/text_classifier.py b/demo/text-classification/text_classifier.py
index 2dd7e8958c46f0e564900b513f9c26c0c7bacf48..d21900cae76a2dabcd9218f7f25a2832c1342d49 100644
--- a/demo/text-classification/text_classifier.py
+++ b/demo/text-classification/text_classifier.py
@@ -22,8 +22,7 @@ import paddlehub as hub
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
-parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False")
-parser.add_argument("--dataset", type=str, default="senticorp", help="Directory to model checkpoint")
+parser.add_argument("--dataset", type=str, default="chnsenticorp", help="Directory to model checkpoint", choices=["chnsenticorp", "nlpcc_dbqa", "lcqmc"])
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
 parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup proportion params for warmup strategy")
diff --git a/paddlehub/dataset/chnsenticorp.py b/paddlehub/dataset/chnsenticorp.py
index af3beb863f900bb3aa8f229957a362b91998db0e..58b0283e8e8c0cccad3dfb4f0516b94605aea2f4 100644
--- a/paddlehub/dataset/chnsenticorp.py
+++ b/paddlehub/dataset/chnsenticorp.py
@@ -25,7 +25,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
 
-DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/chnsenticorp.tar.gz"
+_DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/chnsenticorp.tar.gz"
 
 
 class ChnSentiCorp(HubDataset):
@@ -38,7 +38,7 @@
         self.dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
         if not os.path.exists(self.dataset_dir):
             ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
         else:
             logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
diff --git a/paddlehub/dataset/lcqmc.py b/paddlehub/dataset/lcqmc.py
index 06654fa14828a6d30ec9defb5d2faca7c8ce0ebf..fb16bbff2351f761e559c9e452cb384487fbb510 100644
--- a/paddlehub/dataset/lcqmc.py
+++ b/paddlehub/dataset/lcqmc.py
@@ -25,7 +25,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
 
-DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/lcqmc.tar.gz"
+_DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/lcqmc.tar.gz"
 
 
 class LCQMC(HubDataset):
@@ -33,7 +33,7 @@
         self.dataset_dir = os.path.join(DATA_HOME, "lcqmc")
         if not os.path.exists(self.dataset_dir):
             ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
         else:
             logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
diff --git a/paddlehub/dataset/msra_ner.py b/paddlehub/dataset/msra_ner.py
index 460a2fb21f7034d77d359605c414a4b57506582a..afdbebe91e09f99c4da3edf884ed889f69b22d53 100644
--- a/paddlehub/dataset/msra_ner.py
+++ b/paddlehub/dataset/msra_ner.py
@@ -26,7 +26,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
 
-DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/msra_ner.tar.gz"
+_DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/msra_ner.tar.gz"
 
 
 class MSRA_NER(HubDataset):
@@ -41,20 +41,14 @@
         self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
         if not os.path.exists(self.dataset_dir):
             ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
         else:
             logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
-        self._load_label_map()
         self._load_train_examples()
         self._load_test_examples()
         self._load_dev_examples()
 
-    def _load_label_map(self):
-        self.label_map_file = os.path.join(self.dataset_dir, "label_map.json")
-        with open(self.label_map_file) as fi:
-            self.label_map = json.load(fi)
-
     def _load_train_examples(self):
         train_file = os.path.join(self.dataset_dir, "train.tsv")
         self.train_examples = self._read_tsv(train_file)
diff --git a/paddlehub/dataset/nlpcc_dbqa.py b/paddlehub/dataset/nlpcc_dbqa.py
index beedba95a6e9d20027baa6efc44edc4dba8b07e7..b6200c2e5245b56c9615f014cb883f9709947ee8 100644
--- a/paddlehub/dataset/nlpcc_dbqa.py
+++ b/paddlehub/dataset/nlpcc_dbqa.py
@@ -25,7 +25,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
 
-DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/nlpcc-dbqa.tar.gz"
+_DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/nlpcc-dbqa.tar.gz"
 
 
 class NLPCC_DBQA(HubDataset):
@@ -39,7 +39,7 @@
         self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
         if not os.path.exists(self.dataset_dir):
             ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
         else:
             logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index eaca1339932706c2f97bb5abf60c6f5767307860..e9c5ac2939e93c1342f25fb11e21e605af8fbc3b 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -76,10 +76,6 @@ class BaseReader(object):
         """Gets a collection of `InputExample`s for prediction."""
         return self.dataset.get_test_examples()
 
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        return self.dataset.get_labels()
-
    def get_train_progress(self):
         """Gets progress for training phase."""
         return self.current_example, self.current_epoch
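
For context on the flags this patch touches: the demo scripts parse `--use_gpu` with `type=ast.literal_eval` so the command-line strings "True"/"False" become real booleans, and `--dataset` is now constrained with `choices`. Below is a minimal, self-contained sketch of that pattern; it is not part of the patch, and the final `print` is illustrative only.

```python
# Minimal sketch (not part of this patch) of the argparse pattern used by the demo scripts.
import argparse
import ast

parser = argparse.ArgumentParser(__doc__)
# ast.literal_eval turns the command-line strings "True"/"False" into real booleans.
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False,
                    help="Whether use GPU for finetuning, input should be True or False")
# choices rejects anything other than the three supported classification datasets.
parser.add_argument("--dataset", type=str, default="chnsenticorp",
                    choices=["chnsenticorp", "nlpcc_dbqa", "lcqmc"],
                    help="Dataset to fine-tune on")
args = parser.parse_args()

# The parsed boolean can then be handed to the run config, as the demos do:
# config = hub.RunConfig(use_cuda=args.use_gpu, ...)
print(args.use_gpu, args.dataset)
```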