From 3a33a0bb4d8a146a946afa37d93f5e0d6608c414 Mon Sep 17 00:00:00 2001 From: 0YuanZhang0 <953963890@qq.com> Date: Thu, 22 Aug 2019 19:22:34 +0800 Subject: [PATCH] fix_qa_test_question (#3169) --- .../auto_dialogue_evaluation/README.md | 68 ++++++++++++------- .../auto_dialogue_evaluation/ade/reader.py | 11 ++- .../auto_dialogue_evaluation/predict.py | 2 + .../auto_dialogue_evaluation/run.sh | 12 ++++ .../auto_dialogue_evaluation/train.py | 10 ++- .../dialogue_general_understanding/README.md | 55 ++++++++++++--- .../data/config/dgu.yaml | 1 - .../dgu/reader.py | 42 +++++++++--- .../dgu/utils/configure.py | 2 - .../dialogue_general_understanding/dgu_net.py | 2 +- .../dialogue_general_understanding/predict.py | 3 +- .../dialogue_general_understanding/run.sh | 7 +- .../dialogue_general_understanding/train.py | 2 +- 13 files changed, 164 insertions(+), 53 deletions(-) diff --git a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/README.md b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/README.md index 78682e2e..3a096f40 100644 --- a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/README.md +++ b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/README.md @@ -1,10 +1,14 @@ # 对话自动评估模块ADE - * [1、模型简介](#1、模型简介) - * [2、快速开始](#2、快速开始) - * [3、进阶使用](#3、进阶使用) - * [4、参考论文](#4、参考论文) - * [5、版本更新](#5、版本更新) +- [**1、模型简介**](#1、模型简介) + +- [**2、快速开始**](#2、快速开始) + +- [**3、进阶使用**](#3、进阶使用) + +- [**4、参考论文**](#4、参考论文) + +- [**5、版本更新**](#5、版本更新) ## 1、模型简介 @@ -40,9 +44,11 @@ cd models/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation     本模块内模型训练主要包括两个阶段: -    1)第一阶段:训练一个匹配模型作为评估工具,可用于待评估对话系统内的回复内容进行排序;(matching任务) +    1)第一阶段:训练一个匹配模型作为评估工具,可用于待评估对话系统内的回复内容进行排序;(matching任务) +          模型结构: 输入为context和response, 对两个输入学习embedding表示, 学习到的表示经过lstm学习高阶表示, context和response的高阶表示计算双线性张量积logits, logits和label计算sigmoid_cross_entropy_with_logits loss;     2)第二阶段:利用少量的对话系统的标记数据,对第一阶段训练的匹配模型进行finetuning, 可以提高评估效果(包含human,keywords,seq2seq_att,seq2seq_naive,4个finetuning任务); +          模型结构: finetuning阶段学习表示到计算logits部分和第一阶段模型结构相同,区别在于finetuning阶段计算square_error_cost loss;     用于第二阶段fine-tuning的对话系统包括下面四部分: @@ -74,6 +80,8 @@ label_data(第二阶段finetuning数据集) cd ade && bash prepare_data_and_model.sh ``` +    数据路径:data/input/data/ +    模型路径:data/saved_models/trained_models/     下载经过预处理的数据,运行该脚本之后,data目录下会存在unlabel_data(train.ids/val.ids/test.ids),lable_data: human、keywords、seq2seq_att、seq2seq_naive(四个任务数据train.ids/val.ids/test.ids),以及word2ids. ### 单机训练 @@ -84,14 +92,14 @@ cd ade && bash prepare_data_and_model.sh bash run.sh matching train ``` -    方式一如果为CPU训练: +    如果为CPU训练: ``` 请将run.sh内参数设置为: 1、export CUDA_VISIBLE_DEVICES= ``` -    方式一如果为GPU训练: +    如果为GPU训练: ``` 请将run.sh内参数设置为: @@ -121,6 +129,12 @@ else fi pretrain_model_path="data/saved_models/matching_pretrained" + +if [ -f ${pretrain_model_path} ] +then + rm ${pretrain_model_path} +fi + if [ ! -d ${pretrain_model_path} ] then mkdir ${pretrain_model_path} @@ -181,6 +195,12 @@ else fi save_model_path="data/saved_models/human_finetuned" + +if [ -f ${save_model_path} ] +then + rm ${save_model_path} +fi + if [ ! -d ${save_model_path} ] then mkdir ${save_model_path} @@ -215,14 +235,14 @@ python -u main.py \ bash run.sh matching predict ``` -    方式一如果为CPU预测: +    如果为CPU预测: ``` 请将run.sh内参数设置为: export CUDA_VISIBLE_DEVICES= ``` -    方式一如果为GPU预测: +    如果为GPU预测: ``` 请将run.sh内参数设置为: @@ -329,23 +349,23 @@ seq2seq_naive:使用spearman相关系数来衡量评估模型对系统的打     1. 无标注数据情况下,直接使用预训练好的评估工具进行评估; - 在四个对话系统上,自动评估打分和人工评估打分spearman相关系数,如下: +        在四个对话系统上,自动评估打分和人工评估打分spearman相关系数,如下: - /|seq2seq\_naive|seq2seq\_att|keywords|human - --|:--:|--:|:--:|--: - cor|0.361|0.343|0.324|0.288 + ||seq2seq\_naive|seq2seq\_att|keywords|human| + |--|:--:|--:|:--:|--:| + |cor|0.361|0.343|0.324|0.288| - 对四个系统平均得分排序: +        对四个系统平均得分排序: - 人工评估|k(0.591) sample_pro: continue tokens = example.strip().split('\t') - assert len(tokens) == 3 + + if len(tokens) != 3: + print("data format error: %s" % example.strip()) + print("please input data: context \t response \t label") + continue + context = [int(x) for x in tokens[0].split()[: self.max_seq_len]] response = [int(x) for x in tokens[1].split()[: self.max_seq_len]] label = [int(tokens[2])] diff --git a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/predict.py b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/predict.py index 5c23f1a0..1f75b903 100644 --- a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/predict.py +++ b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/predict.py @@ -108,9 +108,11 @@ def do_predict(args): break scores = scores[: num_test_examples] + print("Write the predicted results into the output_prediction_file") with open(args.output_prediction_file, 'w') as fw: for index, score in enumerate(scores): fw.write("%s\t%s\n" % (index, score)) + print("finish........................................") if __name__ == "__main__": diff --git a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/run.sh b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/run.sh index 40e358eb..78f8da40 100755 --- a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/run.sh +++ b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/run.sh @@ -51,6 +51,11 @@ function pretrain_train() { pretrain_model_path="${SAVED_MODELS}/matching_pretrained" + if [ -f ${pretrain_model_path} ] + then + rm ${pretrain_model_path} + fi + if [ ! -d ${pretrain_model_path} ] then mkdir ${pretrain_model_path} @@ -78,10 +83,17 @@ function pretrain_train() function finetuning_train() { save_model_path="${SAVED_MODELS}/${2}_finetuned" + + if [ -f ${save_model_path} ] + then + rm ${save_model_path} + fi + if [ ! -d ${save_model_path} ] then mkdir ${save_model_path} fi + ${PYTHON_PATH} -u main.py \ --do_train=true \ --use_cuda=${1} \ diff --git a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/train.py b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/train.py index b90a64db..0ee6ade5 100755 --- a/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/train.py +++ b/PaddleNLP/dialogue_model_toolkit/auto_dialogue_evaluation/train.py @@ -129,6 +129,7 @@ def do_train(args): steps = 0 begin_time = time.time() + time_begin = time.time() for epoch_step in range(args.epoch): data_reader.start() @@ -136,7 +137,6 @@ def do_train(args): ce_loss = 0.0 while True: try: - steps += 1 fetch_list = [loss.name] outputs = exe.run(compiled_train_prog, fetch_list=fetch_list) np_loss = outputs @@ -144,14 +144,20 @@ def do_train(args): ce_loss = np.array(np_loss).mean() if steps % args.print_steps == 0: - print('epoch: %d, step: %s, avg loss %s' % (epoch_step, steps, sum_loss / args.print_steps)) + time_end = time.time() + used_time = time_end - time_begin + current_time = time.strftime('%Y-%m-%d %H:%M:%S', + time.localtime(time.time())) + print('%s epoch: %d, step: %s, avg loss %s, speed: %f steps/s' % (current_time, epoch_step, steps, sum_loss / args.print_steps, args.print_steps / used_time)) sum_loss = 0.0 + time_begin = time.time() if steps % args.save_steps == 0: if args.save_checkpoint: save_load_io.save_checkpoint(args, exe, train_prog, "step_" + str(steps)) if args.save_param: save_load_io.save_param(args, exe, train_prog, "step_" + str(steps)) + steps += 1 except fluid.core.EOFException: data_reader.reset() break diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/README.md b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/README.md index 444d201a..21db7f54 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/README.md +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/README.md @@ -1,10 +1,14 @@ # 对话通用理解模块DGU - * [1、模型简介](#1、模型简介) - * [2、快速开始](#2、快速开始) - * [3、进阶使用](#3、进阶使用) - * [4、参考论文](#4、参考论文) - * [5、版本更新](#5、版本更新) +- [**1、模型简介**](#1、模型简介) + +- [**2、快速开始**](#2、快速开始) + +- [**3、进阶使用**](#3、进阶使用) + +- [**4、参考论文**](#4、参考论文) + +- [**5、版本更新**](#5、版本更新) ## 1、模型简介 @@ -60,6 +64,10 @@ SWDA:Switchboard Dialogue Act Corpus; ``` cd dgu && bash prepare_data_and_model.sh ``` +    数据路径:data/input/data +    预训练模型路径:data/pretrain_model +    已训练模型路径:data/saved_models/trained_models +     下载的数据集中已提供了训练集,测试集和验证集,用户如果需要重新生成某任务数据集的训练数据,可执行: @@ -67,6 +75,34 @@ cd dgu && bash prepare_data_and_model.sh cd dgu/scripts && bash run_build_data.sh task_name 参数说明: task_name: udc, swda, mrda, atis, dstc2, 选择5个数据集选项中用户需要生成的数据名 + +各任务数据生成脚本描述: +dgu/scripts/build_atis_dataset.py:将ATIS开源数据集转换成训练所需的意图识别(atis_intent)和槽位解析(atis_slot)训练数据 +dgu/scripts/build_dstc2_dataset.py:将DSTC2开源数据集转换成训练所需数据格式; +dgu/scripts/build_mrda_dataset.py:将MRDA开源数据集转换成训练所需数据格式; +dgu/scripts/build_swda_dataset.py:将SWDA开源数据集转换成训练所需数据格式; +``` + +    根据脚本构造的训练数据格式说明: + +``` +udc:数据组成,label和多轮对话(分为多轮上文和当前回复),整体分割符为"\t" +format: label \t conv1 \t conv2 \t conv3 \t ......\t response + +swda:数据组成,多轮对话id, 标签label, 发言人caller, 说话内容conversation_content,整体分割符为"\t" +format: conversation_id \t label \t caller \t conversation_content + +mrda: 数据组成,多轮对话id, 标签label, 发言人caller, 说话内容conversation_content,整体分割符为"\t" +format: conversation_id \t label \t caller \t conversation_content + +atis/atis_intent: 数据组成,标签label, 说话内容conversation_content,整体分割符为"\t" +format: label \t conversation_content + +atis/atis_slot: 数据组成,说话内容conversation_content,标签序列 label_list(空格分割), 其中标签和说话内容中token为一一对应关系,整体分割符为"\t" +format: conversation_content \t label1 label2 label3 + +dstc2/dstc2: 数据组成,多轮对话id, 当前轮QA对(使用\1拼接),标签(识别到的对话状态,从对话初始状态到当前轮累计的标签集合, 空格分割),整体分割符为"\t" +format:conversation_content \t question \1 answer \t state1 state2 state3...... ``` ### 单机训练 @@ -119,6 +155,10 @@ fi TASK_NAME="atis_intent" #指定训练的任务名称 BERT_BASE_PATH="data/pretrain_model/uncased_L-12_H-768_A-12" +if [ -f "./data/saved_models/${TASK_NAME}" ]; then + rm "./data/saved_models/${TASK_NAME}" +fi + if [ ! -d "./data/saved_models/${TASK_NAME}" ]; then mkdir "./data/saved_models/${TASK_NAME}" fi @@ -141,8 +181,7 @@ python -u main.py \ --learning_rate=2e-5 \ --weight_decay=0.01 \ --max_seq_len=128 \ - --print_steps=10 \ - --use_fp16 false + --print_steps=10 ``` 注: @@ -350,7 +389,7 @@ python -u main.py \ [CLS] token11 token12 token13 [INNER_SEP] token11 token12 token13 [SEP] token21 token22 token23 [SEP] token31 token32 token33 [SEP] ``` -    输入数据以[CLS]开始,[SEP]分割内容为对话内容相关三部分,如上文,当前句,下文等,如[SEP]分割的每部分内部由多轮组成的话,使用[INNER_SEP]进行分割;第二部分和第三部分部分皆可缺省; +    输入数据以[CLS]开始,[SEP]分割对话内容(上文、当前句、下文等),如果[SEP]分割的每部分内部由多轮组成的话,使用[INNER_SEP]进行分割;第二部分和第三部分部分皆可缺省;     目前dialogue_general_understanding模块内已将数据准备部分集成到代码内,用户可根据上面输入数据格式,组装自己的数据; diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/data/config/dgu.yaml b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/data/config/dgu.yaml index 960b2ba2..3857dab2 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/data/config/dgu.yaml +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/data/config/dgu.yaml @@ -24,7 +24,6 @@ verbose: False do_lower_case: False random_seed: 0 use_cuda: True -task_name: "" in_tokens: False do_save_inference_model: False enable_ce: "" diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/reader.py b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/reader.py index 68b5ea70..56c58f0b 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/reader.py +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/reader.py @@ -14,9 +14,11 @@ """data reader""" import os import csv +import sys import types import numpy as np +sys.path.append("./dgu") import tokenization from batching import prepare_batch_data @@ -109,7 +111,7 @@ class DataProcessor(object): with open(input_file, "r") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] - for line in reader: + for line in reader: lines.append(line) return lines @@ -202,15 +204,15 @@ class InputExample(object): def __init__(self, guid, text_a, text_b=None, text_c=None, label=None): """Constructs a InputExample. - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. + text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be + label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. - """ + """ self.guid = guid self.text_a = text_a self.text_b = text_b @@ -251,7 +253,11 @@ class UDCProcessor(DataProcessor): """Creates examples for the training and dev sets.""" examples = [] print("UDC dataset is too big, loading data spent a long time, please wait patiently..................") - for (i, line) in enumerate(lines): + for (i, line) in enumerate(lines): + if len(line) < 3: + print("data format error: %s" % "\t".join(line)) + print("data row contains at least three parts: label\tconv1\t.....\tresponse") + continue guid = "%s-%d" % (set_type, i) text_a = "\t".join(line[1: -1]) text_a = tokenization.convert_to_unicode(text_a) @@ -368,7 +374,11 @@ class ATISSlotProcessor(DataProcessor): def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] - for (i, line) in enumerate(lines): + for (i, line) in enumerate(lines): + if len(line) != 2: + print("data format error: %s" % "\t".join(line)) + print("data row contains two parts: conversation_content \t label1 label2 label3") + continue guid = "%s-%d" % (set_type, i) text_a = line[0] label = line[1] @@ -413,7 +423,11 @@ class ATISIntentProcessor(DataProcessor): def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] - for (i, line) in enumerate(lines): + for (i, line) in enumerate(lines): + if len(line) != 2: + print("data format error: %s" % "\t".join(line)) + print("data row contains two parts: label \t conversation_content") + continue guid = "%s-%d" % (set_type, i) text_a = line[1] text_a = tokenization.convert_to_unicode(text_a) @@ -471,6 +485,10 @@ class DSTC2Processor(DataProcessor): index = 0 conv_example = [] for (i, line) in enumerate(lines): + if len(line) != 3: + print("data format error: %s" % "\t".join(line)) + print("data row contains three parts: conversation_content \t question \1 answer \t state1 state2 state3......") + continue conv_no = line[0] text_a = line[1] label_list = line[2].split() @@ -622,6 +640,10 @@ def create_multi_turn_examples(lines, set_type): conv_example = [] index = 0 for (i, line) in enumerate(lines): + if len(line) != 4: + print("data format error: %s" % "\t".join(line)) + print("data row contains four parts: conversation_id \t label \t caller \t conversation_content") + continue tokens = line conv_no = tokens[0] if conv_no != conv_id and i != 0: diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/utils/configure.py b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/utils/configure.py index 201883bb..4251752a 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/utils/configure.py +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu/utils/configure.py @@ -95,8 +95,6 @@ class ArgConfig(object): ) train_g.add_arg("save_steps", int, 1000, "The steps interval to save checkpoints.") - train_g.add_arg("use_fp16", bool, False, - "Whether to use fp16 mixed precision training.") train_g.add_arg( "loss_scaling", float, 1.0, "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu_net.py b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu_net.py index 84489c7e..0634e0e3 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu_net.py +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/dgu_net.py @@ -46,7 +46,7 @@ def create_net( sentence_ids=sent_ids, input_mask=input_mask, config=bert_conf, - use_fp16=args.use_fp16) + use_fp16=False) params = {'num_labels': num_labels, 'src_ids': src_ids, diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/predict.py b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/predict.py index 7f1579dc..c1c32fcb 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/predict.py +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/predict.py @@ -115,7 +115,7 @@ def do_predict(args): save_load_io.init_from_pretrain_model(args, exe, test_prog) compiled_test_prog = fluid.CompiledProgram(test_prog) - + processor = processors[task_name](data_dir=args.data_dir, vocab_path=args.vocab_path, max_seq_len=args.max_seq_len, @@ -141,6 +141,7 @@ def do_predict(args): break np.set_printoptions(precision=4, suppress=True) + print("Write the predicted results into the output_prediction_file") with open(args.output_prediction_file, 'w') as fw: if task_name not in ['atis_slot']: for index, result in enumerate(all_results): diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/run.sh b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/run.sh index e229e84b..1cf3aa3f 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/run.sh +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/run.sh @@ -23,6 +23,10 @@ OUTPUT_PATH="./data/output" INFERENCE_MODEL="data/inference_models" PYTHON_PATH="python" +if [ -f ${SAVE_MODEL_PATH} ]; then + rm ${SAVE_MODEL_PATH} +fi + if [ ! -d ${SAVE_MODEL_PATH} ]; then mkdir ${SAVE_MODEL_PATH} fi @@ -116,8 +120,7 @@ function train() --learning_rate=${learning_rate} \ --weight_decay=0.01 \ --max_seq_len=${max_seq_len} \ - --print_steps=${print_steps} \ - --use_fp16 false; + --print_steps=${print_steps}; } #predicting diff --git a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/train.py b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/train.py index 66258952..2c4540a7 100644 --- a/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/train.py +++ b/PaddleNLP/dialogue_model_toolkit/dialogue_general_understanding/train.py @@ -140,7 +140,7 @@ def do_train(args): startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, - use_fp16=args.use_fp16, + use_fp16=False, loss_scaling=args.loss_scaling) data_reader.decorate_batch_generator(batch_generator) -- GitLab