PaddleHub custom dataset training error: ZeroDivisionError: float division by zero
Created by: wcxiaowang
The code is as follows:

```python
import argparse
import ast

import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs for fine-tuning.")
parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether to use the GPU for fine-tuning; input should be True or False")
parser.add_argument("--checkpoint_dir", type=str, default="./aimoli", help="Directory for model checkpoints")
parser.add_argument("--max_seq_len", type=int, default=96, help="Number of words in the longest sequence.")
parser.add_argument("--batch_size", type=int, default=32, help="Total number of examples in a training batch.")
args = parser.parse_args()  # collect everything registered via add_argument into the args namespace

jieba_paddle = hub.Module(name='jieba_paddle')


class DemoDataset(BaseNLPDataset):
    """DemoDataset"""

    def __init__(self):
        # Location of the dataset files
        self.dataset_dir = r'D:\xampp\htdocs\python\log\data'  # "/data/semantic/"
        super(DemoDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train_test.tsv",
            dev_file="dev_test.tsv",
            test_file="test_test.tsv",
            # Prediction data (text without a label column) can go in predict.tsv
            predict_file="test_test.tsv",
            train_file_with_header=True,
            dev_file_with_header=True,
            test_file_with_header=True,
            # predict_file_with_header=True,
            # Set of dataset labels
            label_list=["0", "1"])
```
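For reference, with `*_file_with_header=True` the `BaseNLPDataset` files are tab-separated with a header row. A sketch of what `train_test.tsv` would look like (the rows are placeholders, not my real data, and the `label`/`text_a` column order is an assumption borrowed from PaddleHub's bundled datasets, so verify it against your version's custom-dataset docs):

```
label	text_a
1	这家店的服务很好
0	菜品很一般
```

The script continues below.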
```python
def cut(text):
    res = jieba_paddle.cut(text, use_paddle=False)
    return res


if __name__ == '__main__':
    # Load the PaddleHub senta pretrained model
    module = hub.Module(name="senta_bilstm")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # The tokenizer tokenizes the text data and encodes it the way the model needs.
    # If you use transformer modules (ernie, bert, roberta and so on), the tokenizer should be hub.BertTokenizer.
    # Otherwise, the tokenizer should be hub.CustomTokenizer.
    # If you choose CustomTokenizer, you can also change the Chinese word segmentation tool, for example jieba.
    tokenizer = hub.CustomTokenizer(
        vocab_file=module.get_vocab_path(),  # returns the vocab file of the pretrained model
        tokenize_chinese_chars=True,  # whether to split Chinese text
        cut_function=cut,  # jieba.cut as the cut function
    )
```
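As a sanity check, one could encode a single sentence right after the tokenizer is built (inside the `__main__` block) and confirm the result is non-empty. This is a sketch assuming the PaddleHub 1.8 tokenizer API; `encode` and its exact arguments may differ in your version:

```python
# Hypothetical sanity check (assumed PaddleHub 1.8 tokenizer API):
# a working tokenizer should return a non-empty record for a plain sentence.
record = tokenizer.encode(text="这家店的服务很好", max_seq_len=args.max_seq_len)
print(record)
```

The script continues below.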
```python
    # Prepare the custom fine-tuning dataset
    dataset = DemoDataset()
    print(dataset)

    reader = hub.reader.LACClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path())

    # Construct the transfer learning network.
    # Use the sentence-level output: the senta model's sentence feature,
    # which serves as a sentence-level representation.
    sent_feature = outputs["sentence_feature"]

    # Choose the optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        learning_rate=1e-5,
        weight_decay=0.01,
        warmup_proportion=0.1,
        lr_scheduler="linear_decay",
    )

    # Set up the RunConfig for the PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a classification fine-tune task with PaddleHub's API:
    # given the input feature, the labels and the number of classes,
    # TextClassifierTask builds a transfer task suited to text classification.
    cls_task = hub.TextClassifierTask(
        dataset=dataset,
        feature=sent_feature,
        num_classes=dataset.num_labels,
        config=config)

    print('start')
    cls_task.finetune_and_eval()
```
The dataset does have values: `Dataset: DemoDataset with 17 train examples, 5 dev examples and 2 test examples`, and the file format also looks normal.
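One more check I can think of (a sketch, not something from my actual run): since the dataset itself reports examples, the division by zero may come from a phase whose reader yields nothing. Assuming the PaddleHub 1.x reader API (`data_generator(batch_size=..., phase=...)` returning a generator function), this could be placed after the reader is constructed inside the `__main__` block:

```python
# Hypothetical diagnostic, assuming the PaddleHub 1.x reader API:
# count what the reader yields per phase; zero for any phase used by
# finetune_and_eval() would be consistent with a float division by zero.
for phase in ("train", "dev", "test"):
    gen = reader.data_generator(batch_size=args.batch_size, phase=phase)
    print(phase, "yields:", sum(1 for _ in gen()))
```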