diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/README.md b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/README.md index 5e9d78bf861777235e850349eacd5ae6b09eb557..3e977d297e0392201d60afbb8c0e1f53db290abf 100644 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/README.md +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/README.md @@ -82,7 +82,7 @@ label_data (the dataset for second-stage finetuning)     Download the dataset and related models: ``` -cd ade && bash prepare_data_and_model.sh +python ade/prepare_data_and_model.py ```     Data path: data/input/data/ diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.py new file mode 100644 index 0000000000000000000000000000000000000000..eb27ed1c136292cf074041cee994a713503bc698 --- /dev/null +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tarfile +import shutil +import urllib +import sys +import io +import os + +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request + +DATA_MODEL_PATH = {"DATA_PATH": "https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_dataset-1.0.0.tar.gz", + "TRAINED_MODEL": "https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_models.2.0.0.tar.gz"} + +PATH_MAP = {'DATA_PATH': "./data/input", + 'TRAINED_MODEL': './data/saved_models'} + + +def un_tar(tar_name, dir_name): + try: + t = tarfile.open(tar_name) + t.extractall(path = dir_name) + return True + except Exception as e: + print(e) + return False + + +def download_model_and_data(): + print("Downloading ade data and trained models......") + print("This process is quite long, please wait patiently............") + for path in ['./data/input/data', './data/saved_models/trained_models']: + if not os.path.exists(path): + continue + shutil.rmtree(path) + for path_key in DATA_MODEL_PATH: + filename = os.path.basename(DATA_MODEL_PATH[path_key]) + URLLIB.urlretrieve(DATA_MODEL_PATH[path_key], os.path.join("./", filename)) + state = un_tar(filename, PATH_MAP[path_key]) + if not state: + print("Tar %s error....." % path_key) + return False + os.remove(filename) + return True + + +if __name__ == "__main__": + state = download_model_and_data() + if not state: + sys.exit(1) + print("Data and models downloaded successfully......") diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.sh b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.sh deleted file mode 100755 index dc65730e458033fa889eb796775f9cc1902ab887..0000000000000000000000000000000000000000 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/prepare_data_and_model.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -#check data directory -cd .. -echo "Start download data and models.............." -if [ ! 
-d "data" ]; then - echo "Directory data does not exist, make new data directory" - mkdir data -fi -cd data - -#check configure file -if [ ! -d "config" ]; then - echo "config directory not exist........" - exit 255 -else - if [ ! -f "config/ade.yaml" ]; then - echo "config file dgu.yaml has been lost........" - exit 255 - fi -fi - -#check and download input data -if [ ! -d "input" ]; then - echo "Directory input does not exist, make new input directory" - mkdir input -fi -cd input -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_dataset-1.0.0.tar.gz -tar -zxvf auto_dialogue_evaluation_dataset-1.0.0.tar.gz -rm auto_dialogue_evaluation_dataset-1.0.0.tar.gz -cd .. - -#check and download pretrain model -if [ ! -d "pretrain_model" ]; then - echo "Directory pretrain_model does not exist, make new pretrain_model directory" - mkdir pretrain_model -fi - -#check and download inferenece model -if [ ! -d "inference_models" ]; then - echo "Directory inferenece_model does not exist, make new inferenece_model directory" - mkdir inference_models -fi - -#check output -if [ ! -d "output" ]; then - echo "Directory output does not exist, make new output directory" - mkdir output -fi - -#check saved model -if [ ! -d "saved_models" ]; then - echo "Directory saved_models does not exist, make new saved_models directory" - mkdir saved_models -fi - -cd saved_models -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/auto_dialogue_evaluation_models.2.0.0.tar.gz -tar -xvf auto_dialogue_evaluation_models.2.0.0.tar.gz -rm auto_dialogue_evaluation_models.2.0.0.tar.gz -echo "Finish.............." diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/reader.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/reader.py index fdabdb2d2e58213920be140a3389173d0e439871..d3e2f952e3f28edb0375e29e750f7d25dcceda84 100755 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/reader.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/reader.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +14,7 @@ # limitations under the License. 
"""Reader for auto dialogue evaluation""" +import io import sys import time import random @@ -34,12 +36,12 @@ class DataProcessor(object): """load examples""" examples = [] index = 0 - with open(self.data_file, 'r') as fr: - for line in fr: - if index !=0 and index % 100 == 0: - print("processing data: %d" % index) - index += 1 - examples.append(line.strip()) + fr = io.open(self.data_file, 'r', encoding="utf8") + for line in fr: + if index !=0 and index % 100 == 0: + print("processing data: %d" % index) + index += 1 + examples.append(line.strip()) return examples def get_num_examples(self, phase): @@ -47,7 +49,7 @@ class DataProcessor(object): if phase not in ['train', 'dev', 'test']: raise ValueError( "Unknown phase, which should be in ['train', 'dev', 'test'].") - count = len(open(self.data_file,'rU').readlines()) + count = len(io.open(self.data_file, 'r', encoding="utf8").readlines()) self.num_examples[phase] = count return self.num_examples[phase] diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/utils/configure.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/utils/configure.py index 201883bbc34195092bdc212fe1e7c876f5ee66d6..91af6f2aa0255b666b2e22d6e3381593086f3059 100644 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/utils/configure.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/ade/utils/configure.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,6 +18,7 @@ from __future__ import division from __future__ import print_function import os +import io import sys import argparse import json @@ -38,8 +40,8 @@ class JsonConfig(object): def _parse(self, config_path): try: - with open(config_path) as json_file: - config_dict = json.load(json_file) + json_file = io.open(config_path, 'r', encoding="utf8") + config_dict = json.load(json_file) except: raise IOError("Error in parsing bert model config file '%s'" % config_path) @@ -214,9 +216,9 @@ class PDConfig(object): raise Warning("the json file %s does not exist." % file_path) return - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() + fin = io.open(file_path, "r", encoding="utf8") + self.json_config = json.loads(fin.read()) + fin.close() if fuse_args: for name in self.json_config: @@ -238,9 +240,9 @@ class PDConfig(object): raise Warning("the yaml file %s does not exist." % file_path) return - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() + fin = io.open(file_path, "r", encoding="utf8") + self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) + fin.close() if fuse_args: for name in self.yaml_config: diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/eval.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/eval.py index 6023eb8973ff31f3f8a6a29473a70d5b89d3869c..edac4669ffccd1d8e250c631c949ab620af14b7d 100644 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/eval.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/eval.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +14,7 @@ # limitations under the License. 
"""evaluation metrics""" +import io import os import sys import numpy as np @@ -24,22 +26,22 @@ from ade.utils.configure import PDConfig def do_eval(args): """evaluate metrics""" labels = [] - with open(args.evaluation_file, 'r') as fr: - for line in fr: - tokens = line.strip().split('\t') - assert len(tokens) == 3 - label = int(tokens[2]) - labels.append(label) + fr = io.open(args.evaluation_file, 'r', encoding="utf8") + for line in fr: + tokens = line.strip().split('\t') + assert len(tokens) == 3 + label = int(tokens[2]) + labels.append(label) scores = [] - with open(args.output_prediction_file, 'r') as fr: - for line in fr: - tokens = line.strip().split('\t') - assert len(tokens) == 2 - score = tokens[1].strip("[]").split() - score = np.array(score) - score = score.astype(np.float64) - scores.append(score) + fr = io.open(args.output_prediction_file, 'r', encoding="utf8") + for line in fr: + tokens = line.strip().split('\t') + assert len(tokens) == 2 + score = tokens[1].strip("[]").split() + score = np.array(score) + score = score.astype(np.float64) + scores.append(score) if args.loss_type == 'CLS': recall_dict = evaluate.evaluate_Recall(list(zip(scores, labels))) diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/inference_model.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/inference_model.py index ae4968a247ea93a5f2585b0d5c179fae962e6265..ca0872d00f1a9e35efe9fdcef8ee25a15bf14889 100644 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/inference_model.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/inference_model.py @@ -18,7 +18,6 @@ import sys import six import numpy as np import time -import multiprocessing import paddle import paddle.fluid as fluid diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/predict.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/predict.py index 9618e78caaa3c2562212bd34cc5ab7d2f7977ab6..279dff8844a262fbec13df81513642f999d8592b 100644 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/predict.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/predict.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """predict auto dialogue evaluation task""" - +import io import os import sys import six import time import numpy as np -import multiprocessing import paddle import paddle.fluid as fluid @@ -109,9 +109,9 @@ def do_predict(args): scores = scores[: num_test_examples] print("Write the predicted results into the output_prediction_file") - with open(args.output_prediction_file, 'w') as fw: - for index, score in enumerate(scores): - fw.write("%s\t%s\n" % (index, score)) + fw = io.open(args.output_prediction_file, 'w', encoding="utf8") + for index, score in enumerate(scores): + fw.write("%s\t%s\n" % (index, score)) print("finish........................................") diff --git a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/train.py b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/train.py index 828f03a9bea0296254c8ef2cbefef779ae26f0af..f9a8b28153899bf7e5aaa34c19276fa0158043ce 100755 --- a/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/train.py +++ b/PaddleNLP/PaddleDialogue/auto_dialogue_evaluation/train.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """train auto dialogue evaluation task""" - +import io import os import sys import six import time import numpy as np -import multiprocessing import paddle import paddle.fluid as fluid @@ -76,8 +76,7 @@ def do_train(args): dev_count = fluid.core.get_cuda_device_count() place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) else: - dev_count = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + dev_count = int(os.environ.get('CPU_NUM', 1)) place = fluid.CPUPlace() processor = reader.DataProcessor( @@ -115,9 +114,9 @@ def do_train(args): if args.word_emb_init: print("start loading word embedding init ...") if six.PY2: - word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'))).astype('float32') + word_emb = np.array(pickle.load(io.open(args.word_emb_init, 'rb'))).astype('float32') else: - word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'), encoding="bytes")).astype('float32') + word_emb = np.array(pickle.load(io.open(args.word_emb_init, 'rb'), encoding="bytes")).astype('float32') set_word_embedding(word_emb, place) print("finish init word embedding ...") diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/README.md b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/README.md index 3197e9a0b1961d6f65da4a993069e33cbdb954a2..6d49fda65e3df9b44e193bdc0296c2844af7b839 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/README.md +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/README.md @@ -63,7 +63,7 @@ SWDA: Switchboard Dialogue Act Corpus;     Download the dataset and related models: ``` -cd dgu && bash prepare_data_and_model.sh +python dgu/prepare_data_and_model.py ```     Data path: data/input/data diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/bert.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/bert.py index 43a2a62df59b3eeb1e1764fae0636df687981b67..6bab7466f970e51b6d5224b693617b2f98d84aa5 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/bert.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/bert.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,6 +19,7 @@ from __future__ import division from __future__ import print_function import os +import io import sys import six import json @@ -33,8 +35,8 @@ class BertConfig(object): def _parse(self, config_path): try: - with open(config_path) as json_file: - config_dict = json.load(json_file) + json_file = io.open(config_path, 'r', encoding="utf8") + config_dict = json.load(json_file) except Exception: raise IOError("Error in parsing bert model config file '%s'" % config_path) diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/evaluation.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/evaluation.py index 79b7d47de60f1d01edbf1e23b7d24129ac3bd321..43d3fe636eadea3b1b8da6c7c2e082ea7e1e246b 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/evaluation.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/evaluation.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,6 +15,7 @@ """evaluate task metrics""" import sys +import io class EvalDA(object): @@ -33,18 +35,18 @@ class EvalDA(object): """ pred_label = [] refer_label = [] - with open(self.refer_file, 'r') as fr: - for line in fr: - label = line.rstrip('\n').split('\t')[1] - refer_label.append(int(label)) + fr = io.open(self.refer_file, 'r', encoding="utf8") + for line in fr: + label = line.rstrip('\n').split('\t')[1] + refer_label.append(int(label)) idx = 0 - with open(self.pred_file, 'r') as fr: - for line in fr: - elems = line.rstrip('\n').split('\t') - if len(elems) != 2 or not elems[0].isdigit(): - continue - tag_id = int(elems[1]) - pred_label.append(tag_id) + fr = io.open(self.pred_file, 'r', encoding="utf8") + for line in fr: + elems = line.rstrip('\n').split('\t') + if len(elems) != 2 or not elems[0].isdigit(): + continue + tag_id = int(elems[1]) + pred_label.append(tag_id) return pred_label, refer_label def evaluate(self): @@ -78,18 +80,18 @@ class EvalATISIntent(object): """ pred_label = [] refer_label = [] - with open(self.refer_file, 'r') as fr: - for line in fr: - label = line.rstrip('\n').split('\t')[0] - refer_label.append(int(label)) + fr = io.open(self.refer_file, 'r', encoding="utf8") + for line in fr: + label = line.rstrip('\n').split('\t')[0] + refer_label.append(int(label)) idx = 0 - with open(self.pred_file, 'r') as fr: - for line in fr: - elems = line.rstrip('\n').split('\t') - if len(elems) != 2 or not elems[0].isdigit(): - continue - tag_id = int(elems[1]) - pred_label.append(tag_id) + fr = io.open(self.pred_file, 'r', encoding="utf8") + for line in fr: + elems = line.rstrip('\n').split('\t') + if len(elems) != 2 or not elems[0].isdigit(): + continue + tag_id = int(elems[1]) + pred_label.append(tag_id) return pred_label, refer_label def evaluate(self): @@ -123,18 +125,18 @@ class EvalATISSlot(object): """ pred_label = [] refer_label = [] - with open(self.refer_file, 'r') as fr: - for line in fr: - labels = line.rstrip('\n').split('\t')[1].split() - labels = [int(l) for l in labels] - refer_label.append(labels) - with open(self.pred_file, 'r') as fr: - for line in fr: - if len(line.split('\t')) != 2 or not line[0].isdigit(): - continue - labels = line.rstrip('\n').split('\t')[1].split()[1:] - labels = [int(l) for l in labels] - pred_label.append(labels) + fr = io.open(self.refer_file, 'r', encoding="utf8") + for line in fr: + labels = line.rstrip('\n').split('\t')[1].split() + labels = [int(l) for l in labels] + refer_label.append(labels) + fr = io.open(self.pred_file, 'r', encoding="utf8") + for line in fr: + if len(line.split('\t')) != 2 or not line[0].isdigit(): + continue + labels = line.rstrip('\n').split('\t')[1].split()[1:] + labels = [int(l) for l in labels] + pred_label.append(labels) pred_label_equal = [] refer_label_equal = [] assert len(refer_label) == len(pred_label) @@ -208,19 +210,19 @@ class EvalUDC(object): """ data = [] refer_label = [] - with open(self.refer_file, 'r') as fr: - for line in fr: - label = line.rstrip('\n').split('\t')[0] - refer_label.append(label) + fr = io.open(self.refer_file, 'r', encoding="utf8") + for line in fr: + label = line.rstrip('\n').split('\t')[0] + refer_label.append(label) idx = 0 - with open(self.pred_file, 'r') as fr: - for line in fr: - elems = line.rstrip('\n').split('\t') - if len(elems) != 2 or not elems[0].isdigit(): - continue - match_prob = elems[1] - data.append((float(match_prob), int(refer_label[idx]))) - idx += 1 + fr = 
io.open(self.pred_file, 'r', encoding="utf8") + for line in fr: + elems = line.rstrip('\n').split('\t') + if len(elems) != 2 or not elems[0].isdigit(): + continue + match_prob = elems[1] + data.append((float(match_prob), int(refer_label[idx]))) + idx += 1 return data def get_p_at_n_in_m(self, data, n, m, ind): @@ -281,17 +283,17 @@ class EvalDSTC2(object): """ pred_label = [] refer_label = [] - with open(self.refer_file, 'r') as fr: - for line in fr: - line = line.strip('\n') - labels = [int(l) for l in line.split('\t')[-1].split()] - labels = sorted(list(set(labels))) - refer_label.append(" ".join([str(l) for l in labels])) + fr = io.open(self.refer_file, 'r', encoding="utf8") + for line in fr: + line = line.strip('\n') + labels = [int(l) for l in line.split('\t')[-1].split()] + labels = sorted(list(set(labels))) + refer_label.append(" ".join([str(l) for l in labels])) all_pred = [] - with open(self.pred_file, 'r') as fr: - for line in fr: - line = line.strip('\n') - all_pred.append(line) + fr = io.open(self.pred_file, 'r', encoding="utf8") + for line in fr: + line = line.strip('\n') + all_pred.append(line) all_pred = all_pred[len(all_pred) - len(refer_label):] for line in all_pred: labels = [int(l) for l in line.split('\t')[-1].split()] diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.py new file mode 100644 index 0000000000000000000000000000000000000000..641e56fa106bba7a07aa07eea92a2d5c381e7a3a --- /dev/null +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
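
The `un_tar` helper in these prepare scripts passes the archive straight to `tarfile.extractall`, which trusts member names as-is. For archives from an untrusted source, a hardened variant can validate each member path first; this is a sketch of that idea, not part of this patch:

```python
import os
import tarfile

def un_tar_checked(tar_name, dir_name):
    """Like un_tar, but refuses archive members whose resolved path
    would escape dir_name (tar directory-traversal protection)."""
    base = os.path.realpath(dir_name)
    with tarfile.open(tar_name) as t:
        for member in t.getmembers():
            target = os.path.realpath(os.path.join(dir_name, member.name))
            if target != base and not target.startswith(base + os.sep):
                raise ValueError("unsafe path in archive: %s" % member.name)
        t.extractall(path=dir_name)
    return True
```
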
+ +import tarfile +import shutil +import urllib +import sys +import io +import os + + +URLLIB=urllib +if sys.version_info >= (3, 0): + import urllib.request + URLLIB=urllib.request + +DATA_MODEL_PATH = {"DATA_PATH": "https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz", + "PRETRAIN_MODEL": "https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz", + "TRAINED_MODEL": "https://baidu-nlp.bj.bcebos.com/dgu_models_2.0.0.tar.gz"} + +PATH_MAP = {'DATA_PATH': "./data/input", + 'PRETRAIN_MODEL': './data/pretrain_model', + 'TRAINED_MODEL': './data/saved_models'} + + +def un_tar(tar_name, dir_name): + try: + t = tarfile.open(tar_name) + t.extractall(path = dir_name) + return True + except Exception as e: + print(e) + return False + + +def download_model_and_data(): + print("Downloading dgu data, pretrain model and trained models......") + print("This process is quite long, please wait patiently............") + for path in ['./data/input/data', './data/pretrain_model/uncased_L-12_H-768_A-12', './data/saved_models/trained_models']: + if not os.path.exists(path): + continue + shutil.rmtree(path) + for path_key in DATA_MODEL_PATH: + filename = os.path.basename(DATA_MODEL_PATH[path_key]) + URLLIB.urlretrieve(DATA_MODEL_PATH[path_key], os.path.join("./", filename)) + state = un_tar(filename, PATH_MAP[path_key]) + if not state: + print("Tar %s error....." % path_key) + return False + os.remove(filename) + return True + + +if __name__ == "__main__": + state = download_model_and_data() + if not state: + sys.exit(1) + print("Data and models downloaded successfully......") diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.sh b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.sh deleted file mode 100644 index b0a14052258b6a4ce1cb87cf5d5f6d08a78281c4..0000000000000000000000000000000000000000 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/prepare_data_and_model.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -#check data directory -cd .. -echo "Start download data and models.............." -if [ ! -d "data" ]; then - echo "Directory data does not exist, make new data directory" - mkdir data -fi -cd data - -#check configure file -if [ ! -d "config" ]; then - echo "config directory not exist........" - exit 255 -else - if [ ! -f "config/dgu.yaml" ]; then - echo "config file dgu.yaml has been lost........" - exit 255 - fi -fi - -#check and download input data -if [ ! -d "input" ]; then - echo "Directory input does not exist, make new input directory" - mkdir input -fi -cd input -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/dmtk_data_1.0.0.tar.gz -tar -xvf dmtk_data_1.0.0.tar.gz -rm dmtk_data_1.0.0.tar.gz -cd .. - -#check and download pretrain model -if [ ! -d "pretrain_model" ]; then - echo "Directory pretrain_model does not exist, make new pretrain_model directory" - mkdir pretrain_model -fi -cd pretrain_model -wget --no-check-certificate https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz -tar -xvf uncased_L-12_H-768_A-12.tar.gz -rm uncased_L-12_H-768_A-12.tar.gz -cd .. - -#check and download inferenece model -if [ ! -d "inference_models" ]; then - echo "Directory inferenece_model does not exist, make new inferenece_model directory" - mkdir inference_models -fi - -#check output -if [ ! -d "output" ]; then - echo "Directory output does not exist, make new output directory" - mkdir output -fi - -#check saved model -if [ ! 
-d "saved_models" ]; then - echo "Directory saved_models does not exist, make new saved_models directory" - mkdir saved_models -fi -cd saved_models -wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/dgu_models_2.0.0.tar.gz -tar -xvf dgu_models_2.0.0.tar.gz -rm dgu_models_2.0.0.tar.gz -cd .. - -echo "Finish.............." diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/reader.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/reader.py index ba5bcc215cd14ab2b8f4814078215dd004368957..b825a889cd7cce4f00a16957bc1e6acc44e4a804 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/reader.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/reader.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +14,7 @@ # limitations under the License. """data reader""" import os +import io import csv import sys import types @@ -107,12 +109,12 @@ class DataProcessor(object): @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines + f = io.open(input_file, "r", encoding="utf8") + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines def get_num_examples(self, phase): """Get number of examples for train, dev or test.""" diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/README.md b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/README.md index 2e7edbc4958b552cb15a46d71e93bc8e0c852968..0f6f4f410fac28a469050b64fce77adaa1824671 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/README.md +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/README.md @@ -1,22 +1,22 @@ scripts:运行数据处理脚本目录, 将官方公开数据集转换成模型所需训练数据格式 运行命令: - sh run_build_data.sh [udc|swda|mrda|atis|dstc2] + python run_build_data.py [udc|swda|mrda|atis|dstc2] 1)、生成MATCHING任务所需要的训练集、开发集、测试集时: -sh run_build_data.sh udc +python run_build_data.py udc 生成数据在dialogue_general_understanding/data/input/data/udc 2)、生成DA任务所需要的训练集、开发集、测试集时: - sh run_build_data.sh swda - sh run_build_data.sh mrda + python run_build_data.py swda + python run_build_data.py mrda 生成数据分别在dialogue_general_understanding/data/input/data/swda和dialogue_general_understanding/data/input/data/mrda 3)、生成DST任务所需的训练集、开发集、测试集时: - sh run_build_data.sh dstc2 + python run_build_data.py dstc2 生成数据分别在dialogue_general_understanding/data/input/data/dstc2 4)、生成意图解析, 槽位识别任务所需训练集、开发集、测试集时: - sh run_build_data.sh atis + python run_build_data.py atis 生成槽位识别数据在dialogue_general_understanding/data/input/data/atis/atis_slot 生成意图识别数据在dialogue_general_understanding/data/input/data/atis/atis_intent diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_atis_dataset.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_atis_dataset.py index 53f7d6809c7d35a069dd7e70eb956ecfbc4bc0b0..2ea18357ca847f834f3892d09db07e2b63c19c84 100755 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_atis_dataset.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_atis_dataset.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 
(c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,6 +19,7 @@ import json import sys import csv import os +import io import re @@ -51,8 +53,8 @@ class ATIS(object): os.makedirs(self.out_intent_dir) src_examples = [] json_file = os.path.join(self.src_dir, "%s.json" % data_type) - with open(json_file, 'r') as load_f: - json_dict = json.load(load_f) + load_f = io.open(json_file, 'r', encoding="utf8") + json_dict = json.load(load_f) examples = json_dict['rasa_nlu_data']['common_examples'] for example in examples: text = example.get('text') @@ -66,62 +68,62 @@ class ATIS(object): parser intent dataset """ out_filename = "%s/%s.txt" % (self.out_intent_dir, data_type) - with open(out_filename, 'w') as fw: - for example in examples: - if example[1] not in self.intent_dict: - self.intent_dict[example[1]] = self.intent_id - self.intent_id += 1 - fw.write("%s\t%s\n" % (self.intent_dict[example[1]], example[0].lower())) + fw = io.open(out_filename, 'w', encoding="utf8") + for example in examples: + if example[1] not in self.intent_dict: + self.intent_dict[example[1]] = self.intent_id + self.intent_id += 1 + fw.write("%s\t%s\n" % (self.intent_dict[example[1]], example[0].lower())) - with open(self.map_tag_intent, 'w') as fw: - for tag in self.intent_dict: - fw.write("%s\t%s\n" % (tag, self.intent_dict[tag])) + fw = io.open(self.map_tag_intent, 'w', encoding="utf8") + for tag in self.intent_dict: + fw.write("%s\t%s\n" % (tag, self.intent_dict[tag])) def _parser_slot_data(self, examples, data_type): """ parser slot dataset """ out_filename = "%s/%s.txt" % (self.out_slot_dir, data_type) - with open(out_filename, 'w') as fw: - for example in examples: - tags = [] - text = example[0] - entities = example[2] - if not entities: - tags = [str(self.slot_dict['O'])] * len(text.strip().split()) - continue - for i in range(len(entities)): - enty = entities[i] - start = enty['start'] - value_num = len(enty['value'].split()) - tags_slot = [] - for j in range(value_num): - if j == 0: - bround_tag = "B" - else: - bround_tag = "I" - tag = "%s-%s" % (bround_tag, enty['entity']) - if tag not in self.slot_dict: - self.slot_dict[tag] = self.slot_id - self.slot_id += 1 - tags_slot.append(str(self.slot_dict[tag])) - if i == 0: - if start not in [0, 1]: - prefix_num = len(text[: start].strip().split()) - tags.extend([str(self.slot_dict['O'])] * prefix_num) - tags.extend(tags_slot) + fw = io.open(out_filename, 'w', encoding="utf8") + for example in examples: + tags = [] + text = example[0] + entities = example[2] + if not entities: + tags = [str(self.slot_dict['O'])] * len(text.strip().split()) + continue + for i in range(len(entities)): + enty = entities[i] + start = enty['start'] + value_num = len(enty['value'].split()) + tags_slot = [] + for j in range(value_num): + if j == 0: + bround_tag = "B" else: - prefix_num = len(text[entities[i - 1]['end']: start].strip().split()) + bround_tag = "I" + tag = "%s-%s" % (bround_tag, enty['entity']) + if tag not in self.slot_dict: + self.slot_dict[tag] = self.slot_id + self.slot_id += 1 + tags_slot.append(str(self.slot_dict[tag])) + if i == 0: + if start not in [0, 1]: + prefix_num = len(text[: start].strip().split()) tags.extend([str(self.slot_dict['O'])] * prefix_num) - tags.extend(tags_slot) - if entities[-1]['end'] < len(text): - suffix_num = len(text[entities[-1]['end']:].strip().split()) - tags.extend([str(self.slot_dict['O'])] * suffix_num) - fw.write("%s\t%s\n" % (text.encode('utf8'), " 
".join(tags).encode('utf8'))) + tags.extend(tags_slot) + else: + prefix_num = len(text[entities[i - 1]['end']: start].strip().split()) + tags.extend([str(self.slot_dict['O'])] * prefix_num) + tags.extend(tags_slot) + if entities[-1]['end'] < len(text): + suffix_num = len(text[entities[-1]['end']:].strip().split()) + tags.extend([str(self.slot_dict['O'])] * suffix_num) + fw.write("%s\t%s\n" % (text, " ".join(tags))) - with open(self.map_tag_slot, 'w') as fw: - for slot in self.slot_dict: - fw.write("%s\t%s\n" % (slot, self.slot_dict[slot])) + fw = io.open(self.map_tag_slot, 'w', encoding="utf8") + for slot in self.slot_dict: + fw.write("%s\t%s\n" % (slot, self.slot_dict[slot])) def get_train_dataset(self): """ diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_dstc2_dataset.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_dstc2_dataset.py index 86153739325a59e6f35f071dbc6e1b1df2ffcf2d..f2c83e0b7b417622bc959d975c6e2a6f1fd1109b 100755 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_dstc2_dataset.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_dstc2_dataset.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,6 +18,7 @@ import json import sys import csv import os +import io import re import commonlib @@ -55,17 +57,17 @@ class DSTC2(object): """ tag_id = 1 self.map_tag_dict['none'] = 0 - with open(self.onto_json, 'r') as fr: - ontology = json.load(fr) - slots_values = ontology['informable'] - for slot in slots_values: - for value in slots_values[slot]: - key = "%s_%s" % (slot, value) - self.map_tag_dict[key] = tag_id - tag_id += 1 - key = "%s_none" % (slot) + fr = io.open(self.onto_json, 'r', encoding="utf8") + ontology = json.load(fr) + slots_values = ontology['informable'] + for slot in slots_values: + for value in slots_values[slot]: + key = "%s_%s" % (slot, value) self.map_tag_dict[key] = tag_id tag_id += 1 + key = "%s_none" % (slot) + self.map_tag_dict[key] = tag_id + tag_id += 1 def _parser_dataset(self, data_type): """ @@ -79,31 +81,33 @@ os.makedirs(self.out_asr_dir) out_file = os.path.join(self.out_dir, "%s.txt" % data_type) out_asr_file = os.path.join(self.out_asr_dir, "%s.txt" % data_type) - with open(out_file, 'w') as fw, open(out_asr_file, 'w') as fw_asr: - data_list = self.data_dict.get(data_type) - for fn in data_list: - log_file = os.path.join(fn, "log.json") - label_file = os.path.join(fn, "label.json") - with open(log_file, 'r') as f_log, open(label_file, 'r') as f_label: - log_json = json.load(f_log) - label_json = json.load(f_label) - session_id = log_json['session-id'] - assert len(label_json["turns"]) == len(log_json["turns"]) - for i in range(len(label_json["turns"])): - log_turn = log_json["turns"][i] - label_turn = label_json["turns"][i] - assert log_turn["turn-index"] == label_turn["turn-index"] - labels = ["%s_%s" % (slot, label_turn["goal-labels"][slot]) for slot in label_turn["goal-labels"]] - labels_ids = " ".join([str(self.map_tag_dict.get(label, self.map_tag_dict["%s_none" % label.split('_')[0]])) for label in labels]) - mach = log_turn['output']['transcript'] - user = label_turn['transcription'] - if not labels_ids.strip(): - labels_ids = self.map_tag_dict['none'] - out = "%s\t%s\1%s\t%s" % (session_id, mach, user, labels_ids) - user_asr = 
log_turn['input']['live']['asr-hyps'][0]['asr-hyp'].strip() - out_asr = "%s\t%s\1%s\t%s" % (session_id, mach, user_asr, labels_ids) - fw.write("%s\n" % out.encode('utf8')) - fw_asr.write("%s\n" % out_asr.encode('utf8')) + fw = io.open(out_file, 'w', encoding="utf8") + fw_asr = io.open(out_asr_file, 'w', encoding="utf8") + data_list = self.data_dict.get(data_type) + for fn in data_list: + log_file = os.path.join(fn, "log.json") + label_file = os.path.join(fn, "label.json") + f_log = io.open(log_file, 'r', encoding="utf8") + f_label = io.open(label_file, 'r', encoding="utf8") + log_json = json.load(f_log) + label_json = json.load(f_label) + session_id = log_json['session-id'] + assert len(label_json["turns"]) == len(log_json["turns"]) + for i in range(len(label_json["turns"])): + log_turn = log_json["turns"][i] + label_turn = label_json["turns"][i] + assert log_turn["turn-index"] == label_turn["turn-index"] + labels = ["%s_%s" % (slot, label_turn["goal-labels"][slot]) for slot in label_turn["goal-labels"]] + labels_ids = " ".join([str(self.map_tag_dict.get(label, self.map_tag_dict["%s_none" % label.split('_')[0]])) for label in labels]) + mach = log_turn['output']['transcript'] + user = label_turn['transcription'] + if not labels_ids.strip(): + labels_ids = self.map_tag_dict['none'] + out = "%s\t%s\1%s\t%s" % (session_id, mach, user, labels_ids) + user_asr = log_turn['input']['live']['asr-hyps'][0]['asr-hyp'].strip() + out_asr = "%s\t%s\1%s\t%s" % (session_id, mach, user_asr, labels_ids) + fw.write("%s\n" % out) + fw_asr.write("%s\n" % out_asr) def get_train_dataset(self): """ @@ -127,9 +131,9 @@ """ get tag and map ids file """ - with open(self.map_tag, 'w') as fw: - for elem in self.map_tag_dict: - fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) + fw = io.open(self.map_tag, 'w', encoding="utf8") + for elem in self.map_tag_dict: + fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) def main(self): """ diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_mrda_dataset.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_mrda_dataset.py index 42301ba210199d6719f8eb2d34172692d867299e..7de02adc2b4552526777752ee67a0ca506801f42 100755 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_mrda_dataset.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_mrda_dataset.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,6 +17,7 @@ import sys import csv import os +import io import re import commonlib @@ -64,18 +66,18 @@ class MRDA(object): dadb_list = self.data_dict[data_type] for dadb_key in dadb_list: dadb_file = self.dadb_dict[dadb_key] - with open(dadb_file, 'r') as fr: - row = csv.reader(fr, delimiter = ',') - for line in row: - elems = line - conv_id = elems[2] - conv_id_list.append(conv_id) - if len(elems) != 14: - continue - error_code = elems[3] - da_tag = elems[-9] - da_ori_tag = elems[-6] - dadb_dict[conv_id] = (error_code, da_ori_tag, da_tag) + fr = io.open(dadb_file, 'r', encoding="utf8") + row = csv.reader(fr, delimiter = ',') + for line in row: + elems = line + conv_id = elems[2] + conv_id_list.append(conv_id) + if len(elems) != 14: + continue + error_code = elems[3] + da_tag = elems[-9] + da_ori_tag = elems[-6] + dadb_dict[conv_id] = (error_code, da_ori_tag, da_tag) return dadb_dict, conv_id_list def load_trans(self, data_type): @@ -84,16 +86,16 @@ class MRDA(object): trans_list = self.data_dict[data_type] for trans_key in trans_list: trans_file = self.trans_dict[trans_key] - with open(trans_file, 'r') as fr: - row = csv.reader(fr, delimiter = ',') - for line in row: - elems = line - if len(elems) != 3: - continue - conv_id = elems[0] - text = elems[1] - text_process = elems[2] - trans_dict[conv_id] = (text, text_process) + fr = io.open(trans_file, 'r', encoding="utf8") + row = csv.reader(fr, delimiter = ',') + for line in row: + elems = line + if len(elems) != 3: + continue + conv_id = elems[0] + text = elems[1] + text_process = elems[2] + trans_dict[conv_id] = (text, text_process) return trans_dict def _parser_dataset(self, data_type): @@ -103,23 +105,23 @@ class MRDA(object): out_filename = "%s/%s.txt" % (self.out_dir, data_type) dadb_dict, conv_id_list = self.load_dadb(data_type) trans_dict = self.load_trans(data_type) - with open(out_filename, 'w') as fw: - for elem in conv_id_list: - v_dadb = dadb_dict[elem] - v_trans = trans_dict[elem] - da_tag = v_dadb[2] - if da_tag not in self.tag_dict: - continue - tag = self.tag_dict[da_tag] - if tag == "Z": - continue - if tag not in self.map_tag_dict: - self.map_tag_dict[tag] = self.tag_id - self.tag_id += 1 - caller = elem.split('_')[0].split('-')[-1] - conv_no = elem.split('_')[0].split('-')[0] - out = "%s\t%s\t%s\t%s" % (conv_no, self.map_tag_dict[tag], caller, v_trans[0]) - fw.write("%s\n" % out) + fw = io.open(out_filename, 'w', encoding="utf8") + for elem in conv_id_list: + v_dadb = dadb_dict[elem] + v_trans = trans_dict[elem] + da_tag = v_dadb[2] + if da_tag not in self.tag_dict: + continue + tag = self.tag_dict[da_tag] + if tag == "Z": + continue + if tag not in self.map_tag_dict: + self.map_tag_dict[tag] = self.tag_id + self.tag_id += 1 + caller = elem.split('_')[0].split('-')[-1] + conv_no = elem.split('_')[0].split('-')[0] + out = "%s\t%s\t%s\t%s" % (conv_no, self.map_tag_dict[tag], caller, v_trans[0]) + fw.write("%s\n" % out) def get_train_dataset(self): """ @@ -143,9 +145,9 @@ class MRDA(object): """ get tag and map ids file """ - with open(self.map_tag, 'w') as fw: - for elem in self.map_tag_dict: - fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) + fw = io.open(self.map_tag, 'w', encoding="utf8") + for elem in self.map_tag_dict: + fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) def main(self): """ diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_swda_dataset.py 
b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_swda_dataset.py index 945eed6758086f08c3a80074b4bae6a669c4fa69..c821e7fe52a620d6931456c5216276354fa56257 100755 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_swda_dataset.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/build_swda_dataset.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,6 +17,7 @@ import sys import csv import os +import io import re import commonlib @@ -56,18 +58,18 @@ class SWDA(object): parser train dev test dataset """ out_filename = "%s/%s.txt" % (self.out_dir, data_type) - with open(out_filename, 'w') as fw: - for name in self.data_dict[data_type]: - file_path = self.file_dict[name] - with open(file_path, 'r') as fr: - idx = 0 - row = csv.reader(fr, delimiter = ',') - for r in row: - if idx == 0: - idx += 1 - continue - out = self._parser_utterence(r) - fw.write("%s\n" % out) + fw = io.open(out_filename, 'w', encoding='utf8') + for name in self.data_dict[data_type]: + file_path = self.file_dict[name] + fr = io.open(file_path, 'r', encoding="utf8") + idx = 0 + row = csv.reader(fr, delimiter = ',') + for r in row: + if idx == 0: + idx += 1 + continue + out = self._parser_utterence(r) + fw.write("%s\n" % out) def _clean_text(self, text): """ @@ -209,9 +211,9 @@ class SWDA(object): """ get tag and map ids file """ - with open(self.map_tag, 'w') as fw: - for elem in self.map_tag_dict: - fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) + fw = io.open(self.map_tag, 'w', encoding='utf8') + for elem in self.map_tag_dict: + fw.write("%s\t%s\n" % (elem, self.map_tag_dict[elem])) def main(self): """ diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/commonlib.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/commonlib.py index b048efc7d96928b47bb865b4db290e1319384731..b223a9f2b4eb0b8c9e650759a7b39cc5282cdceb 100755 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/commonlib.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/commonlib.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +14,7 @@ # limitations under the License. 
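
One portability detail around the `csv.reader(io.open(...))` pattern used by the dataset builders above: Python 3's csv module consumes text directly, while Python 2's csv module documents that it does not support unicode input, so rows read from an `io.open` text stream need an encode/decode round trip there. A version-aware sketch (the helper name is illustrative):

```python
import csv
import io
import sys

def iter_csv_rows(path, delimiter=","):
    if sys.version_info >= (3, 0):
        # Python 3: csv parses text; newline="" is the documented open mode.
        with io.open(path, "r", encoding="utf8", newline="") as f:
            for row in csv.reader(f, delimiter=delimiter):
                yield row
    else:
        # Python 2: csv parses byte strings, so re-encode each line
        # before parsing and decode the fields afterwards.
        with io.open(path, "r", encoding="utf8") as f:
            rows = csv.reader((line.encode("utf8") for line in f),
                              delimiter=delimiter)
            for row in rows:
                yield [field.decode("utf8") for field in row]
```
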
"""common function""" import sys +import io import os @@ -48,13 +50,13 @@ def load_dict(conf): load swda dataset config """ conf_dict = dict() - with open(conf, 'r') as fr: - for line in fr: - line = line.strip() - elems = line.split('\t') - if elems[0] not in conf_dict: - conf_dict[elems[0]] = [] - conf_dict[elems[0]].append(elems[1]) + fr = io.open(conf, 'r', encoding="utf8") + for line in fr: + line = line.strip() + elems = line.split('\t') + if elems[0] not in conf_dict: + conf_dict[elems[0]] = [] + conf_dict[elems[0]].append(elems[1]) return conf_dict @@ -63,11 +65,11 @@ def load_voc(conf): load map dict """ map_dict = {} - with open(conf, 'r') as fr: - for line in fr: - line = line.strip() - elems = line.split('\t') - map_dict[elems[0]] = elems[1] + fr = io.open(conf, 'r', encoding="utf8") + for line in fr: + line = line.strip() + elems = line.split('\t') + map_dict[elems[0]] = elems[1] return map_dict diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.py new file mode 100755 index 0000000000000000000000000000000000000000..b1a61a0f9938bd7fb647194f3902ae62d5c6b509 --- /dev/null +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.py @@ -0,0 +1,48 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import shutil +import sys +import os + +from build_atis_dataset import ATIS +from build_dstc2_dataset import DSTC2 +from build_mrda_dataset import MRDA +from build_swda_dataset import SWDA + + +if __name__ == "__main__": + task_name = sys.argv[1] + task_name = task_name.lower() + + if task_name not in ['swda', 'mrda', 'atis', 'dstc2', 'udc']: + print("task name error: we support [swda|mrda|atis|dstc2|udc]") + exit(1) + + if task_name == 'swda': + swda_inst = SWDA() + swda_inst.main() + elif task_name == 'mrda': + mrda_inst = MRDA() + mrda_inst.main() + elif task_name == 'atis': + atis_inst = ATIS() + atis_inst.main() + shutil.copyfile("../../data/input/data/atis/atis_slot/test.txt", "../../data/input/data/atis/atis_slot/dev.txt") + shutil.copyfile("../../data/input/data/atis/atis_intent/test.txt", "../../data/input/data/atis/atis_intent/dev.txt") + elif task_name == 'dstc2': + dstc_inst = DSTC2() + dstc_inst.main() + else: + exit(0) + diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.sh b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.sh deleted file mode 100755 index 167861c327ab0e346ef454cee42e625b6a24ebb4..0000000000000000000000000000000000000000 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/scripts/run_build_data.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -TASK_DATA=$1 -typeset -l TASK_DATA - -if [ "${TASK_DATA}" = "udc" ] -then - exit 0 -elif [ "${TASK_DATA}" = "swda" ] -then - python build_swda_dataset.py -elif [ "${TASK_DATA}" = "mrda" ] -then - python build_mrda_dataset.py -elif [[ "${TASK_DATA}" =~ "atis" ]] -then - python build_atis_dataset.py - cat ../../data/input/data/atis/atis_slot/test.txt > ../../data/input/data/atis/atis_slot/dev.txt - cat ../../data/input/data/atis/atis_intent/test.txt > ../../data/input/data/atis/atis_intent/dev.txt -elif [ "${TASK_DATA}" = "dstc2" ] -then - python build_dstc2_dataset.py -else - echo "can not support $TASK_DATA , please choose [swda|mrda|atis|dstc2|multi-woz]" -fi diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/tokenization.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/tokenization.py index 466db405d6a098acc8947158f9d2c75f1ecdcd76..8268f8e8ec6f86513344c1523dcd703abeb61c44 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/tokenization.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/tokenization.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,6 +22,7 @@ from __future__ import print_function import collections import unicodedata import six +import io def convert_to_unicode(text): @@ -69,7 +71,7 @@ def printable_text(text): def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() - fin = open(vocab_file) + fin = io.open(vocab_file, 'r', encoding="utf8") for num, line in enumerate(fin): items = convert_to_unicode(line.strip()).split("\t") if len(items) > 2: diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/utils/configure.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/utils/configure.py index 4251752aaaa4fce8323b50fd9047b755060f8221..25dd0d551161e0373e36c3b5f7baef3526555c01 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/utils/configure.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/dgu/utils/configure.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import io import os import sys import argparse @@ -38,8 +40,8 @@ class JsonConfig(object): def _parse(self, config_path): try: - with open(config_path) as json_file: - config_dict = json.load(json_file) + json_file = io.open(config_path, 'r', encoding="utf8") + config_dict = json.load(json_file) except: raise IOError("Error in parsing bert model config file '%s'" % config_path) @@ -212,9 +214,9 @@ class PDConfig(object): raise Warning("the json file %s does not exist." % file_path) return - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() + fin = io.open(file_path, "r", encoding="utf8") + self.json_config = json.loads(fin.read()) + fin.close() if fuse_args: for name in self.json_config: @@ -236,9 +238,9 @@ class PDConfig(object): raise Warning("the yaml file %s does not exist." % file_path) return - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() + fin = io.open(file_path, "r", encoding="utf8") + self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) + fin.close() if fuse_args: for name in self.yaml_config: diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/predict.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/predict.py index d159b302b304abfb345fa614b5ec92f505a3796e..bab34ed2f80e16004f9076e5e18b670142e37f79 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/predict.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/predict.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
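
The PDConfig hunks above keep `yaml.load` but pin `Loader=yaml.SafeLoader`, which restricts parsing to plain data types instead of letting the YAML stream construct arbitrary Python objects; with PyYAML this is equivalent to `yaml.safe_load`. The same loading step in isolation (the path matches the `data/config/dgu.yaml` file the deleted shell script checked for, but is otherwise illustrative):

```python
import io
import yaml

def load_yaml_config(file_path):
    # Equivalent to yaml.safe_load(fin).
    with io.open(file_path, "r", encoding="utf8") as fin:
        return yaml.load(fin, Loader=yaml.SafeLoader)

# e.g. yaml_config = load_yaml_config("data/config/dgu.yaml")
```
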
+import io import os import sys import numpy as np @@ -142,15 +144,16 @@ def do_predict(args): np.set_printoptions(precision=4, suppress=True) print("Write the predicted results into the output_prediction_file") - with open(args.output_prediction_file, 'w') as fw: - if task_name not in ['atis_slot']: - for index, result in enumerate(all_results): - tags = pred_func(result) - fw.write("%s\t%s\n" % (index, tags)) - else: - tags = pred_func(all_results, args.max_seq_len) - for index, tag in enumerate(tags): - fw.write("%s\t%s\n" % (index, tag)) + + fw = io.open(args.output_prediction_file, 'w', encoding="utf8") + if task_name not in ['atis_slot']: + for index, result in enumerate(all_results): + tags = pred_func(result) + fw.write("%s\t%s\n" % (index, tags)) + else: + tags = pred_func(all_results, args.max_seq_len) + for index, tag in enumerate(tags): + fw.write("%s\t%s\n" % (index, tag)) if __name__ == "__main__": diff --git a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/train.py b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/train.py index 51bee33974a98fbe1518acf986a57fa745d1b73c..7401cee810cdf25408780d7cc6bfa059a0df5e4a 100644 --- a/PaddleNLP/PaddleDialogue/dialogue_general_understanding/train.py +++ b/PaddleNLP/PaddleDialogue/dialogue_general_understanding/train.py @@ -21,7 +21,6 @@ import os import sys import time import numpy as np -import multiprocessing import paddle import paddle.fluid as fluid @@ -111,8 +110,7 @@ def do_train(args): if args.use_cuda: dev_count = fluid.core.get_cuda_device_count() else: - dev_count = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + dev_count = int(os.environ.get('CPU_NUM', 1)) batch_generator = processor.data_generator( batch_size=args.batch_size, diff --git a/PaddleNLP/Research/MRQA2019-D-NET/multi_task_learning/README.md b/PaddleNLP/Research/MRQA2019-D-NET/multi_task_learning/README.md index 948820a6480b1f81429a29cb7a1ccd85885f4814..aea55d07d468bf883bddeb423f55afc335135317 100644 --- a/PaddleNLP/Research/MRQA2019-D-NET/multi_task_learning/README.md +++ b/PaddleNLP/Research/MRQA2019-D-NET/multi_task_learning/README.md @@ -44,10 +44,10 @@ In our MTL experiments, we use BERT as our shared encoder. The parameters are in ``` 1、cd scripts -2、download cased_model_01.tar.gz from link +2、# download cased_model_01.tar.gz from link 3、mkdir cased_model_01 && mv cased_model_01.tar.gz cased_model_01 && cd cased_model_01 && tar -xvf cased_model_01.tar.gz && cd .. 
4、python convert_model_params.py --init_tf_checkpoint cased_model_01/model.ckpt --fluid_params_dir params -5、mkdir fluid_models && mv cased_model_01/vocab.txt cased_model_01/bert_config.json params fluid_models +5、mkdir squad2_model && mv cased_model_01/vocab.txt cased_model_01/bert_config.json params squad2_model ``` Alternatively, user can directly **download the parameters that we have converted**: diff --git a/PaddleNLP/Research/MRQA2019-D-NET/server/ernie_server/task_reader/mrqa_infer.py b/PaddleNLP/Research/MRQA2019-D-NET/server/ernie_server/task_reader/mrqa_infer.py index d76c559d290523a214c195d0383d671698d6347f..2f52a3acf72b4f46f2a875a5c3a29843973412d2 100644 --- a/PaddleNLP/Research/MRQA2019-D-NET/server/ernie_server/task_reader/mrqa_infer.py +++ b/PaddleNLP/Research/MRQA2019-D-NET/server/ernie_server/task_reader/mrqa_infer.py @@ -21,8 +21,8 @@ import json import random import collections import numpy as np -import tokenization -from batching import prepare_batch_data +from task_reader import tokenization +from task_reader.batching import prepare_batch_data class MRQAExample(object):
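
The final hunk replaces implicit relative imports in mrqa_infer.py with package-qualified ones: Python 3 removed implicit relative imports (PEP 328), so sibling modules inside the task_reader package must be named through the package or with an explicit relative form. A sketch of the equivalent spellings, assuming the package layout implied by the paths above:

```python
# Package-qualified imports, as this patch uses:
from task_reader import tokenization
from task_reader.batching import prepare_batch_data

# From inside the task_reader package, explicit relative imports
# are the equivalent Python 2/3-safe spelling:
# from . import tokenization
# from .batching import prepare_batch_data
```
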