Custom binary-classification dataset: finetune_and_eval with ernie_tiny runs, but predict returns 0 for every input
Created by: weiwei1005
```python
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

class MyDataset(BaseNLPDataset):
    """DemoDataset"""
    def __init__(self):
        # Directory holding the dataset files
        self.dataset_dir = "./data_all"
        super(MyDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train.txt",
            dev_file="test.txt",
            test_file="valid.txt",
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # Label set of the dataset
            label_list=["0", "1"])
```
```python
dataset = MyDataset()
for e in dataset.get_train_examples()[:1]:
    print("{}\t{}\t{}".format(e.guid, e.text_a, e.label))
```
Printed output (guid, text_a, label):

```
0	家里老催 我也不知道为什么就是没遇到真心爱我的男人 不知道在这里能不能遇到 只限重庆的35岁以下	1
```
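One thing that is easy to verify with the same API is how the labels are distributed after parsing. Below is a minimal sketch using only the example objects shown above (it assumes get_dev_examples / get_test_examples are available alongside get_train_examples in this PaddleHub version); if label "1" turns out to be very rare or absent in a split, an f1 of 0 and all-zero predictions would follow naturally.

```python
# Sketch: count label frequencies in each split after parsing.
# Assumes the PaddleHub 1.x dataset API (get_*_examples) used above.
from collections import Counter

for split, examples in [("train", dataset.get_train_examples()),
                        ("dev", dataset.get_dev_examples()),
                        ("test", dataset.get_test_examples())]:
    print(split, Counter(e.label for e in examples))
```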
```python
# Load the model
import paddlehub as hub
module = hub.Module(name="ernie_tiny")

# Build the reader
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    sp_model_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path(),
    max_seq_len=128)

# Finetune strategy
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=5e-5)

# Run configuration
config = hub.RunConfig(
    use_cuda=False,
    num_epoch=1,
    checkpoint_dir="model_all2",
    batch_size=32,
    eval_interval=100,
    strategy=strategy)

# Jupyter line magic: number of CPU threads for Paddle
%env CPU_NUM=15

# Finetune task
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
pooled_output = outputs["pooled_output"]

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config,
    metrics_choices=["f1"])
```
Finetune:

```python
run_states = cls_task.finetune_and_eval()
```
Finetune log:

```
[2020-05-19 10:06:04,717] [ INFO] - Strategy with warmup, linear decay, slanted triangle learning rate, weight decay regularization,
/usr/local/Python-3.6.8/lib/python3.6/site-packages/paddle/fluid/executor.py:811: UserWarning: There are no operators in the program to be executed. If you pass Program manually, please use fluid.program_guard to ensure the current Program is being used.
  warnings.warn(error_info)
[2020-05-19 10:06:04,748] [ INFO] - Try loading checkpoint from model_all2/ckpt.meta
[2020-05-19 10:06:04,750] [ INFO] - PaddleHub model checkpoint not found, start from scratch...
[2020-05-19 10:06:04,936] [ INFO] - PaddleHub finetune start
[2020-05-19 10:16:39,083] [ TRAIN] - step 10 / 43: loss=0.40500 f1=0.08569 [step/sec: 0.02]
[2020-05-19 10:33:39,614] [ TRAIN] - step 20 / 43: loss=0.21602 f1=0.00000 [step/sec: 0.01]
[2020-05-19 10:48:13,086] [ TRAIN] - step 30 / 43: loss=0.17749 f1=0.00000 [step/sec: 0.01]
[2020-05-19 11:05:39,188] [ TRAIN] - step 40 / 43: loss=0.19343 f1=0.00000 [step/sec: 0.01]
[2020-05-19 11:10:57,969] [ INFO] - Evaluation on dev dataset start
share_vars_from is set, scope is ignored.
[2020-05-19 11:11:53,923] [ EVAL] - [dev dataset evaluation result] loss=0.30112 f1=0.00000 [step/sec: 0.14]
[2020-05-19 11:11:53,924] [ EVAL] - best model saved to model_all2/best_model [best f1=0.00000]
[2020-05-19 11:11:54,419] [ INFO] - Load the best model from model_all2/best_model
[2020-05-19 11:11:55,060] [ INFO] - Evaluation on test dataset start
[2020-05-19 11:13:04,649] [ EVAL] - [test dataset evaluation result] loss=0.11348 f1=0.00000 [step/sec: 0.14]
[2020-05-19 11:13:04,651] [ INFO] - Saving model checkpoint to model_all2/step_43
[2020-05-19 11:13:05,930] [ INFO] - PaddleHub finetune finished.
```
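The log alone cannot show whether the model is learning anything beyond the majority class, because only f1 is reported. A small variation of the task definition (an assumption: this PaddleHub version accepts several entries in metrics_choices) tracks accuracy next to f1 to make that visible:

```python
# Sketch: re-create the task with accuracy tracked alongside f1.
# If acc sits near the share of "0" labels while f1 stays at 0.0,
# the model is predicting the majority class for every input.
cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config,
    metrics_choices=["acc", "f1"])
```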
```python
data = [['失恋!心情不好'],
        ['从新接受新的工作,挑战下自己'],
        ['今天是周末']]
run_states = cls_task.predict(data=data, return_result=True, accelerate_mode=True)
```
Result: run_states is [0, 0, 0], i.e. every input is predicted as class 0.
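To see whether the model is genuinely confident in class 0, rather than the labels being collapsed somewhere in post-processing, the raw per-class scores can be inspected instead of the argmax labels. A minimal sketch, assuming the same predict API used above and that each returned RunState exposes run_results (an assumption worth verifying against the installed PaddleHub 1.x version):

```python
import numpy as np

# Sketch: predict without the convenience post-processing and look at the
# per-class scores. The run_results layout is an assumption based on
# PaddleHub 1.x RunState objects.
run_states = cls_task.predict(data=data, return_result=False, accelerate_mode=True)
for state in run_states:
    probs = np.array(state.run_results[0])   # assumed shape: [batch_size, num_classes]
    print(probs)                             # scores for class "0" and class "1"
    print(np.argmax(probs, axis=-1))         # labels recovered from the scores
```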