Commit 07e21b51 authored by kinghuin, committed by wuzewu

Reconstitute reader and dataset (#279)

* Reconstitute reader and dataset
Parent 537d3c58
......@@ -20,20 +20,7 @@ from __future__ import print_function
import argparse
import ast
import collections
import json
import io
import math
import numpy as np
import os
import six
import sys
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
from paddlehub.finetune.task.reading_comprehension_task import write_predictions
hub.common.logger.logger.setLevel("INFO")
......
......@@ -14,7 +14,7 @@
# limitations under the License.
# NLP Dataset
from .dataset import InputExample, HubDataset
from .dataset import InputExample, BaseDataset
from .chnsenticorp import ChnSentiCorp
from .msra_ner import MSRA_NER
from .nlpcc_dbqa import NLPCC_DBQA
......
......@@ -18,15 +18,61 @@ from __future__ import division
from __future__ import print_function
import os
import numpy as np
from paddlehub.dataset import BaseDataset
import paddlehub as hub
from paddlehub.common.downloader import default_downloader
from paddlehub.common.logger import logger
class BaseCVDatast(BaseDataset):
def __init__(self,
base_path,
train_list_file=None,
validate_list_file=None,
test_list_file=None,
predict_list_file=None,
label_list_file=None,
label_list=None):
super(BaseCVDatast, self).__init__(
base_path=base_path,
train_file=train_list_file,
dev_file=validate_list_file,
test_file=test_list_file,
predict_file=predict_list_file,
label_file=label_list_file,
label_list=label_list)
def _read_file(self, data_path, phase=None):
data = []
with open(data_path, "r") as file:
while True:
line = file.readline()
if not line:
break
line = line.strip()
items = line.split(" ")
if len(items) > 2:
image_path = " ".join(items[0:-1])
else:
image_path = items[0]
if not os.path.isabs(image_path):
if self.base_path is not None:
image_path = os.path.join(self.base_path, image_path)
label = items[-1]
data.append((image_path, label))
return data
# Deprecated: please use BaseCVDatast instead.
class ImageClassificationDataset(object):
def __init__(self):
logger.warning(
"ImageClassificationDataset is no longer recommended from PaddleHub v1.5.0, "
"please use BaseCVDataset instead of ImageClassificationDataset. "
"It's more easy-to-use with more functions and support evaluating test set "
"in the end of finetune automatically.")
self.base_path = None
self.train_list_file = None
self.test_list_file = None
......@@ -99,12 +145,12 @@ class ImageClassificationDataset(object):
def test_data(self, shuffle=False):
test_data_path = os.path.join(self.base_path, self.test_list_file)
return self._parse_data(test_data_path, shuffle, phase='dev')
return self._parse_data(test_data_path, shuffle, phase='test')
def validate_data(self, shuffle=False):
validate_data_path = os.path.join(self.base_path,
self.validate_list_file)
return self._parse_data(validate_data_path, shuffle, phase='test')
return self._parse_data(validate_data_path, shuffle, phase='dev')
def get_train_examples(self):
return self.train_examples
......
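The refactor above replaces the field-by-field configuration of the old ImageClassificationDataset with plain subclassing. A minimal sketch of how a custom image-classification dataset might plug into the new base class is given below; the MyCVDataset name, the placeholder directory, and the list-file names are illustrative assumptions, not part of this commit.

from paddlehub.dataset.base_cv_dataset import BaseCVDatast  # class name as introduced by this commit


class MyCVDataset(BaseCVDatast):
    """Hypothetical dataset whose list files contain lines of '<image_path> <label>'."""

    def __init__(self):
        base_path = "/path/to/my_image_data"  # placeholder directory holding the list files
        super(MyCVDataset, self).__init__(
            base_path=base_path,
            train_list_file="train_list.txt",
            validate_list_file="validate_list.txt",
            test_list_file="test_list.txt",
            label_list_file="label_list.txt")


if __name__ == "__main__":
    ds = MyCVDataset()
    print(ds)  # BaseDataset.__str__ reports the train/dev/test example counts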
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import io
import csv
from paddlehub.dataset import InputExample, BaseDataset
class BaseNLPDatast(BaseDataset):
def __init__(self,
base_path,
train_file=None,
dev_file=None,
test_file=None,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
super(BaseNLPDatast, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_head=train_file_with_head,
dev_file_with_head=dev_file_with_head,
test_file_with_head=test_file_with_head,
predict_file_with_head=predict_file_with_head)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
reader = csv.reader(file, delimiter="\t", quotechar=None)
examples = []
for (i, line) in enumerate(reader):
if i == 0:
ncol = len(line)
if self.if_file_with_head[phase]:
continue
if ncol == 1:
if phase != "predict":
example = InputExample(guid=i, text_a=line[0])
else:
raise Exception(
"the %s file: %s only has one column but it is not a predict file"
% (phase, input_file))
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i, text_a=line[0], text_b=line[1], label=line[2])
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)" %
(phase, input_file))
examples.append(example)
return examples
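If the default tab-separated reader above does not fit, a subclass only needs to override _read_file; otherwise pointing the constructor at the files is enough. A rough sketch under assumed file names and a placeholder path (none of these names come from the commit):

from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast  # class name as introduced by this commit


class MySentimentDataset(BaseNLPDatast):
    """Hypothetical dataset with 'text<TAB>label' files, each with a header row."""

    def __init__(self):
        base_path = "/path/to/my_sentiment_data"  # placeholder directory
        super(MySentimentDataset, self).__init__(
            base_path=base_path,
            train_file="train.tsv",
            dev_file="dev.tsv",
            test_file="test.tsv",
            label_list=["0", "1"],
            train_file_with_head=True,
            dev_file_with_head=True,
            test_file_with_head=True)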
......@@ -17,76 +17,37 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/bq.tar.gz"
class BQ(HubDataset):
class BQ(BaseNLPDatast):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "bq")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["0", "1"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
data = line.strip().split("\t")
example = InputExample(
guid=i, label=data[2], text_a=data[0], text_b=data[1])
examples.append(example)
return examples
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
dataset_dir,
url="https://bj.bcebos.com/paddlehub-dataset/bq.tar.gz")
super(BQ, self).__init__(
base_path=base_path,
train_file="train.txt",
dev_file="dev.txt",
test_file="test.txt",
label_file=None,
label_list=["0", "1"],
)
if __name__ == "__main__":
ds = BQ()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -17,72 +17,39 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import codecs
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz"
class ChnSentiCorp(HubDataset):
class ChnSentiCorp(BaseNLPDatast):
"""
ChnSentiCorp (compiled by Tan Songbo at ICT, Chinese Academy of Sciences, for
opinion mining)
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.tsv")
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["0", "1"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_tsv(self, input_file, quotechar=None):
dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
base_path = self._download_dataset(
dataset_dir,
url="https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz")
super(ChnSentiCorp, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file="dev.tsv",
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with codecs.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
header = next(reader) # skip header
......@@ -97,5 +64,5 @@ class ChnSentiCorp(HubDataset):
if __name__ == "__main__":
ds = ChnSentiCorp()
for e in ds.get_train_examples():
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -16,12 +16,11 @@
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -63,42 +62,22 @@ class CMRC2018Example(object):
return s
class CMRC2018(object):
class CMRC2018(BaseNLPDatast):
"""A single set of features of data."""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_dev_examples()
self._load_test_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "cmrc2018_train.json")
self.train_examples = self._read_json(self.train_file, is_training=True)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "cmrc2018_dev.json")
self.dev_examples = self._read_json(self.dev_file, is_training=False)
def _load_test_examples(self):
pass
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return []
def _read_json(self, input_file, is_training=False):
dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(CMRC2018, self).__init__(
base_path=base_path,
train_file="cmrc2018_train.json",
dev_file="cmrc2018_dev.json",
test_file=None,
label_file=None,
label_list=None,
)
def _read_file(self, input_file, phase=None):
"""Read a CMRC2018 json file into a list of CMRC2018Example."""
def _is_chinese_char(cp):
......@@ -197,7 +176,7 @@ class CMRC2018(object):
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
if is_training:
if phase == "train":
actual_text = "".join(
doc_tokens[start_position:(end_position + 1)])
cleaned_answer_text = "".join(
......
......@@ -17,6 +17,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.common.downloader import default_downloader
from paddlehub.common.logger import logger
class InputExample(object):
"""
......@@ -49,21 +55,124 @@ class InputExample(object):
self.text_a, self.text_b, self.label)
class HubDataset(object):
class BaseDataset(object):
def __init__(self,
base_path,
train_file=None,
dev_file=None,
test_file=None,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
if not (train_file or dev_file or test_file):
raise ValueError("At least one file should be assigned")
self.base_path = base_path
self.train_file = train_file
self.dev_file = dev_file
self.test_file = test_file
self.predict_file = predict_file
self.label_file = label_file
self.label_list = label_list
self.train_examples = []
self.dev_examples = []
self.test_examples = []
self.predict_examples = []
self.if_file_with_head = {
"train": train_file_with_head,
"dev": dev_file_with_head,
"test": test_file_with_head,
"predict": predict_file_with_head
}
if train_file:
self._load_train_examples()
if dev_file:
self._load_dev_examples()
if test_file:
self._load_test_examples()
if predict_file:
self._load_predict_examples()
if self.label_file:
if not self.label_list:
self.label_list = self._load_label_data()
else:
logger.warning(
"As label_list has been assigned, label_file is noneffective"
)
def get_train_examples(self):
raise NotImplementedError()
return self.train_examples
def get_dev_examples(self):
raise NotImplementedError()
return self.dev_examples
def get_test_examples(self):
raise NotImplementedError()
return self.test_examples
def get_val_examples(self):
return self.get_dev_examples()
def get_predict_examples(self):
return self.predict_examples
def get_labels(self):
raise NotImplementedError()
return self.label_list
@property
def num_labels(self):
raise NotImplementedError()
return len(self.label_list)
def label_dict(self):
return {index: key for index, key in enumerate(self.label_list)}
def _download_dataset(self, dataset_path, url):
if not os.path.exists(dataset_path):
result, tips, dataset_path = default_downloader.download_file_and_uncompress(
url=url,
save_path=hub.common.dir.DATA_HOME,
print_progress=True,
replace=True)
if not result:
raise Exception(tips)
else:
logger.info("Dataset {} already cached.".format(dataset_path))
return dataset_path
def _load_train_examples(self):
self.train_path = os.path.join(self.base_path, self.train_file)
self.train_examples = self._read_file(self.train_path, phase="train")
def _load_dev_examples(self):
self.dev_path = os.path.join(self.base_path, self.dev_file)
self.dev_examples = self._read_file(self.dev_path, phase="dev")
def _load_test_examples(self):
self.test_path = os.path.join(self.base_path, self.test_file)
self.test_examples = self._read_file(self.test_path, phase="test")
def _load_predict_examples(self):
self.predict_path = os.path.join(self.base_path, self.predict_file)
self.predict_examples = self._read_file(
self.predict_path, phase="predict")
def _read_file(self, path, phase=None):
raise NotImplementedError
def _load_label_data(self):
with open(os.path.join(self.base_path, self.label_file), "r") as file:
return file.read().split("\n")
def __str__(self):
return "Dataset: %s with %i train examples, %i dev examples and %i test examples" % (
self.__class__.__name__, len(self.train_examples),
len(self.dev_examples), len(self.test_examples))
# Alias kept for backward compatibility with older versions
HubDataset = BaseDataset
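The accessors defined above (get_train_examples, get_labels, num_labels, label_dict, __str__) are what downstream code now consumes, whatever the concrete dataset. A minimal sketch using ChnSentiCorp from earlier in this commit; note that constructing it downloads and caches the archive on first use:

from paddlehub.dataset import ChnSentiCorp

ds = ChnSentiCorp()                    # downloads/caches the data and loads all splits
print(ds)                              # BaseDataset.__str__: train/dev/test example counts
print(ds.num_labels, ds.get_labels())  # 2 ["0", "1"]
print(ds.label_dict())                 # {0: "0", 1: "1"}
for example in ds.get_train_examples()[:3]:
    print(example.guid, example.text_a, example.label)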
......@@ -20,18 +20,33 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
class DogCatDataset(ImageClassificationDataset):
class DogCatDataset(BaseCVDatast):
def __init__(self):
super(DogCatDataset, self).__init__()
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
self.base_path = self._download_dataset(
base_path = self._download_dataset(
dataset_path=dataset_path,
url="https://bj.bcebos.com/paddlehub-dataset/dog-cat.tar.gz")
self.train_list_file = "train_list.txt"
self.test_list_file = "test_list.txt"
self.validate_list_file = "validate_list.txt"
self.label_list_file = "label_list.txt"
self.num_labels = 2
super(DogCatDataset, self).__init__(
base_path=base_path,
train_list_file="train_list.txt",
validate_list_file="validate_list.txt",
test_list_file="test_list.txt",
label_list_file="label_list.txt",
label_list=None)
if __name__ == "__main__":
ds = DogCatDataset()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print(e)
print(ds)
......@@ -16,12 +16,11 @@
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -39,8 +38,7 @@ class DRCDExample(object):
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
end_position=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
......@@ -64,43 +62,22 @@ class DRCDExample(object):
return s
class DRCD(object):
class DRCD(BaseNLPDatast):
"""A single set of features of data."""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "drcd")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_dev_examples()
self._load_test_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "DRCD_training.json")
self.train_examples = self._read_json(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "DRCD_dev.json")
self.dev_examples = self._read_json(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "DRCD_test.json")
self.test_examples = self._read_json(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def _read_json(self, input_file):
dataset_dir = os.path.join(DATA_HOME, "drcd")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(DRCD, self).__init__(
base_path=base_path,
train_file="DRCD_training.json",
dev_file="DRCD_dev.json",
test_file="DRCD_test.json",
label_file=None,
label_list=None,
)
def _read_file(self, input_file, phase=None):
"""Read a DRCD json file into a list of CRCDExample."""
def _is_chinese_char(cp):
......
......@@ -20,18 +20,33 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
class FlowersDataset(ImageClassificationDataset):
class FlowersDataset(BaseCVDatast):
def __init__(self):
super(FlowersDataset, self).__init__()
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
self.base_path = self._download_dataset(
base_path = self._download_dataset(
dataset_path=dataset_path,
url="https://bj.bcebos.com/paddlehub-dataset/flower_photos.tar.gz")
self.train_list_file = "train_list.txt"
self.test_list_file = "test_list.txt"
self.validate_list_file = "validate_list.txt"
self.label_list_file = "label_list.txt"
self.num_labels = 5
super(FlowersDataset, self).__init__(
base_path=base_path,
train_list_file="train_list.txt",
validate_list_file="validate_list.txt",
test_list_file="test_list.txt",
label_list_file="label_list.txt",
label_list=None)
if __name__ == "__main__":
ds = FlowersDataset()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print(e)
print(ds)
......@@ -20,19 +20,33 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
class Food101Dataset(ImageClassificationDataset):
class Food101Dataset(BaseCVDatast):
def __init__(self):
super(Food101Dataset, self).__init__()
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
"images")
self.base_path = self._download_dataset(
base_path = self._download_dataset(
dataset_path=dataset_path,
url="https://bj.bcebos.com/paddlehub-dataset/Food101.tar.gz")
self.train_list_file = "train_list.txt"
self.test_list_file = "test_list.txt"
self.validate_list_file = "validate_list.txt"
self.label_list_file = "label_list.txt"
self.num_labels = 101
super(Food101Dataset, self).__init__(
base_path=base_path,
train_list_file="train_list.txt",
test_list_file="test_list.txt",
validate_list_file="validate_list.txt",
label_list_file="label_list.txt")
if __name__ == "__main__":
ds = Food101Dataset()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print(e)
print(ds)
......@@ -21,15 +21,15 @@ import os
import csv
import io
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset import InputExample
from paddlehub.common.logger import logger
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"
class GLUE(HubDataset):
class GLUE(BaseNLPDatast):
"""
Please refer to
https://gluebenchmark.com
......@@ -43,147 +43,107 @@ class GLUE(HubDataset):
'RTE', 'SST-2', 'STS-B'
]:
raise Exception(
sub_dataset +
" is not in GLUE benchmark. Please confirm the data set")
self.mismatch = False
"%s is not in GLUE benchmark. Please confirm the data set" %
sub_dataset)
mismatch = False
if sub_dataset == 'MNLI_mm':
sub_dataset = 'MNLI'
self.mismatch = True
mismatch = True
elif sub_dataset == 'MNLI_m':
sub_dataset = 'MNLI'
self.sub_dataset = sub_dataset
self.dataset_dir = os.path.join(DATA_HOME, "glue_data")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_dev_examples()
self._load_test_examples()
self._load_predict_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, self.sub_dataset,
"train.tsv")
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
if self.sub_dataset == 'MNLI' and not self.mismatch:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev_matched.tsv")
elif self.sub_dataset == 'MNLI' and self.mismatch:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev_mismatched.tsv")
else:
self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
"dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_examples = []
def _load_predict_examples(self):
if self.sub_dataset == 'MNLI' and not self.mismatch:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test_matched.tsv")
elif self.sub_dataset == 'MNLI' and self.mismatch:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test_mismatched.tsv")
else:
self.predict_file = os.path.join(self.dataset_dir, self.sub_dataset,
"test.tsv")
self.predict_examples = self._read_tsv(self.predict_file, wo_label=True)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_predict_examples(self):
return self.predict_examples
def get_labels(self):
"""See base class."""
if self.sub_dataset in ['MRPC', 'QQP', 'SST-2', 'CoLA']:
return ["0", "1"]
elif self.sub_dataset in ['QNLI', 'RTE']:
return ['not_entailment', 'entailment']
elif self.sub_dataset in ['MNLI']:
return ["neutral", "contradiction", "entailment"]
elif self.sub_dataset in ['STS-B']:
return Exception("No category labels for regreesion tasks")
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_tsv(self, input_file, quotechar=None, wo_label=False):
# test.tsv has no label, so it is treated as a predict file
dev_file = "dev.tsv"
predict_file = "test.tsv"
if sub_dataset == 'MNLI' and not mismatch:
dev_file = 'dev_matched.tsv'
predict_file = "test_matched.tsv"
elif sub_dataset == 'MNLI' and mismatch:
dev_file = 'dev_mismatched.tsv'
predict_file = "test_mismatched.tsv"
dataset_dir = os.path.join(DATA_HOME, "glue_data")
dataset_dir = self._download_dataset(dataset_dir, url=_DATA_URL)
base_path = os.path.join(dataset_dir, self.sub_dataset)
label_list = None
if sub_dataset in ['MRPC', 'QQP', 'SST-2', 'CoLA']:
label_list = ["0", "1"]
elif sub_dataset in ['QNLI', 'RTE']:
label_list = ['not_entailment', 'entailment']
elif sub_dataset in ['MNLI']:
label_list = ["neutral", "contradiction", "entailment"]
elif sub_dataset in ['STS-B']:
label_list = None
super(GLUE, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file=dev_file,
predict_file=predict_file,
label_file=None,
label_list=label_list,
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
if self.sub_dataset != 'CoLA' or wo_label:
if self.sub_dataset != 'CoLA' or phase == "predict":
header = next(reader) # skip header
if self.sub_dataset in [
'MRPC',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, -2, -1]
else:
label_index, text_a_index, text_b_index = [0, -2, -1]
elif self.sub_dataset in [
'QNLI',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 1, 2]
else:
label_index, text_a_index, text_b_index = [3, 1, 2]
elif self.sub_dataset in [
'QQP',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 1, 2]
else:
label_index, text_a_index, text_b_index = [5, 3, 4]
elif self.sub_dataset in [
'RTE',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 1, 2]
else:
label_index, text_a_index, text_b_index = [3, 1, 2]
elif self.sub_dataset in [
'SST-2',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 1, None]
else:
label_index, text_a_index, text_b_index = [1, 0, None]
elif self.sub_dataset in [
'MNLI',
]:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 8, 9]
else:
label_index, text_a_index, text_b_index = [-1, 8, 9]
elif self.sub_dataset in ['CoLA']:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, 1, None]
else:
label_index, text_a_index, text_b_index = [1, 3, None]
elif self.sub_dataset in ['STS-B']:
if wo_label:
if phase == "predict":
label_index, text_a_index, text_b_index = [None, -2, -1]
else:
label_index, text_a_index, text_b_index = [-1, -3, -2]
......
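The per-sub-dataset branches above only decide which TSV columns hold the label and the text fields; loading and splitting go through the shared BaseNLPDatast machinery, and each sub-task's unlabeled test.tsv is exposed as the predict split. A short usage sketch (the paddlehub.dataset.glue module path is assumed; everything else appears in the code above):

from paddlehub.dataset.glue import GLUE  # module path assumed

ds = GLUE("SST-2")          # single-sentence task: text in column 0, label in column 1
print(ds.get_labels())      # ["0", "1"]
for example in ds.get_dev_examples()[:3]:
    print(example.guid, example.text_a, example.label)
for example in ds.get_predict_examples()[:3]:
    print(example.guid, example.text_a)  # test.tsv has no labels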
......@@ -17,64 +17,30 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(HubDataset):
class IFLYTEK(BaseNLPDatast):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "iflytek")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [str(i) for i in range(119)]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(IFLYTEK, self).__init__(
base_path=base_path,
train_file="train.txt",
dev_file="dev.txt",
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(119)],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
......@@ -91,5 +57,13 @@ class IFLYTEK(HubDataset):
if __name__ == "__main__":
ds = IFLYTEK()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -20,18 +20,33 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
class Indoor67Dataset(ImageClassificationDataset):
class Indoor67Dataset(BaseCVDatast):
def __init__(self):
super(Indoor67Dataset, self).__init__()
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
self.base_path = self._download_dataset(
base_path = self._download_dataset(
dataset_path=dataset_path,
url="https://bj.bcebos.com/paddlehub-dataset/Indoor67.tar.gz")
self.train_list_file = "train_list.txt"
self.test_list_file = "test_list.txt"
self.validate_list_file = "validate_list.txt"
self.label_list_file = "label_list.txt"
self.num_labels = 67
super(Indoor67Dataset, self).__init__(
base_path=base_path,
train_list_file="train_list.txt",
validate_list_file="validate_list.txt",
test_list_file="test_list.txt",
label_list_file="label_list.txt",
label_list=None)
if __name__ == "__main__":
ds = Indoor67Dataset()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print(e)
print(ds)
......@@ -17,73 +17,40 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(HubDataset):
class INews(BaseNLPDatast):
"""
INews is a sentiment analysis dataset for Internet News
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "inews")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file, is_training=True)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file, is_training=False)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file, is_training=False)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["0", "1", "2"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file, is_training):
dataset_dir = os.path.join(DATA_HOME, "inews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(INews, self).__init__(
base_path=base_path,
train_file="train.txt",
dev_file="dev.txt",
test_file="test.txt",
label_file=None,
label_list=["0", "1", "2"],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
for (i, line) in enumerate(file):
if i == 0 and is_training:
if i == 0 and phase == 'train':
continue
data = line.strip().split("_!_")
example = InputExample(
......@@ -94,5 +61,13 @@ class INews(HubDataset):
if __name__ == "__main__":
ds = INews()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -17,68 +17,34 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import codecs
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(HubDataset):
class LCQMC(BaseNLPDatast):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "lcqmc")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.tsv")
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
"""See base class."""
return ["0", "1"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_tsv(self, input_file, quotechar=None):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(LCQMC, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file="dev.tsv",
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with codecs.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
header = next(reader) # skip header
......@@ -93,5 +59,13 @@ class LCQMC(HubDataset):
if __name__ == "__main__":
ds = LCQMC()
for e in ds.get_train_examples():
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -20,18 +20,15 @@ from __future__ import print_function
import os
import codecs
import csv
import json
from collections import namedtuple
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(HubDataset):
class MSRA_NER(BaseNLPDatast):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
......@@ -40,55 +37,23 @@ class MSRA_NER(HubDataset):
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
train_file = os.path.join(self.dataset_dir, "train.tsv")
self.train_examples = self._read_tsv(train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def get_label_map(self):
return self.label_map
def _read_tsv(self, input_file, quotechar=None):
dataset_dir = os.path.join(DATA_HOME, "msra_ner")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(MSRA_NER, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file="dev.tsv",
test_file="test.tsv",
label_file=None,
label_list=[
"B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with codecs.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
header = next(reader) # skip header
......@@ -103,5 +68,13 @@ class MSRA_NER(HubDataset):
if __name__ == "__main__":
ds = MSRA_NER()
for e in ds.get_train_examples():
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -17,20 +17,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import codecs
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(HubDataset):
class NLPCC_DBQA(BaseNLPDatast):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
......@@ -38,53 +36,21 @@ class NLPCC_DBQA(HubDataset):
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.tsv")
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
"""See base class."""
return ["0", "1"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_tsv(self, input_file, quotechar=None):
dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(NLPCC_DBQA, self).__init__(
base_path=base_path,
train_file="train.tsv",
dev_file="dev.tsv",
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with codecs.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
header = next(reader) # skip header
......@@ -99,5 +65,13 @@ class NLPCC_DBQA(HubDataset):
if __name__ == "__main__":
ds = NLPCC_DBQA()
for e in ds.get_train_examples():
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -16,12 +16,11 @@
import json
import os
import sys
from paddlehub.reader import tokenization
from paddlehub.common.downloader import default_downloader
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
......@@ -66,61 +65,31 @@ class SquadExample(object):
return s
class SQUAD(object):
class SQUAD(BaseNLPDatast):
"""A single set of features of data."""
def __init__(self, version_2_with_negative=False):
self.dataset_dir = os.path.join(DATA_HOME, "squad_data")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self.version_2_with_negative = version_2_with_negative
self._load_train_examples(version_2_with_negative, if_has_answer=True)
self._load_dev_examples(version_2_with_negative, if_has_answer=True)
def _load_train_examples(self,
version_2_with_negative=False,
if_has_answer=True):
if not version_2_with_negative:
self.train_file = os.path.join(self.dataset_dir, "train-v1.1.json")
else:
self.train_file = os.path.join(self.dataset_dir, "train-v2.0.json")
self.train_examples = self._read_json(self.train_file, if_has_answer,
version_2_with_negative)
def _load_dev_examples(self,
version_2_with_negative=False,
if_has_answer=True):
if not version_2_with_negative:
self.dev_file = os.path.join(self.dataset_dir, "dev-v1.1.json")
train_file = "train-v1.1.json"
dev_file = "dev-v1.1.json"
else:
self.dev_file = os.path.join(self.dataset_dir, "dev-v2.0.json")
self.dev_examples = self._read_json(self.dev_file, if_has_answer,
version_2_with_negative)
def _load_test_examples(self,
version_2_with_negative=False,
is_training=False):
self.test_file = None
logger.error("not test_file")
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return []
def _read_json(self,
input_file,
if_has_answer,
version_2_with_negative=False):
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"
dataset_dir = os.path.join(DATA_HOME, "squad_data")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(SQUAD, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=None,
label_file=None,
label_list=None,
)
def _read_file(self, input_file, phase=None):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
......@@ -156,13 +125,15 @@ class SQUAD(object):
end_position = None
orig_answer_text = None
is_impossible = False
if if_has_answer:
if version_2_with_negative:
if phase in ["train", "dev"]:
if self.version_2_with_negative:
is_impossible = qa["is_impossible"]
# if (len(qa["answers"]) != 1) and (not is_impossible):
# raise ValueError(
# "For training, each question should have exactly 1 answer."
# )
if phase == "train" and (len(qa["answers"]) !=
1) and (not is_impossible):
print(qa)
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
......@@ -206,8 +177,14 @@ class SQUAD(object):
if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=False)
examples = ds.get_train_examples()
for index, e in enumerate(examples):
if index < 10:
print(e)
ds = SQUAD(version_2_with_negative=True)
print("first 10 dev")
for e in ds.get_dev_examples()[:2]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:2]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:2]:
print(e)
print(ds)
......@@ -20,20 +20,35 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
class StanfordDogsDataset(ImageClassificationDataset):
class StanfordDogsDataset(BaseCVDatast):
def __init__(self):
super(StanfordDogsDataset, self).__init__()
dataset_path = os.path.join(hub.common.dir.DATA_HOME,
"StanfordDogs-120")
self.base_path = self._download_dataset(
base_path = self._download_dataset(
dataset_path=dataset_path,
url="https://bj.bcebos.com/paddlehub-dataset/StanfordDogs-120.tar.gz"
)
self.train_list_file = "train_list.txt"
self.test_list_file = "test_list.txt"
self.validate_list_file = "validate_list.txt"
self.label_list_file = "label_list.txt"
self.num_labels = 120
super(StanfordDogsDataset, self).__init__(
base_path=base_path,
train_list_file="train_list.txt",
validate_list_file="validate_list.txt",
test_list_file="test_list.txt",
label_list_file="label_list.txt",
label_list=None)
if __name__ == "__main__":
ds = StanfordDogsDataset()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print(e)
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print(e)
print(ds)
......@@ -17,64 +17,30 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(HubDataset):
class THUCNEWS(BaseNLPDatast):
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "thucnews")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [str(i) for i in range(14)]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(THUCNEWS, self).__init__(
base_path=base_path,
train_file="train.txt",
dev_file="dev.txt",
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(14)],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
......@@ -91,5 +57,13 @@ class THUCNEWS(HubDataset):
if __name__ == "__main__":
ds = THUCNEWS()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -17,15 +17,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
......@@ -48,64 +44,31 @@ LABEL_NAME = {
}
class TNews(HubDataset):
class TNews(BaseDataset):
"""
TNews is a Chinese news classification dataset from the Jinri Toutiao app.
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "tnews")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir,
"toutiao_category_train.txt")
self.train_examples = self._read_file(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir,
"toutiao_category_dev.txt")
self.dev_examples = self._read_file(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir,
"toutiao_category_test.txt")
self.test_examples = self._read_file(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [
dataset_dir = os.path.join(DATA_HOME, "tnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
'100', '101', '102', '103', '104', '106', '107', '108', '109',
'110', '112', '113', '114', '115', '116'
]
super(TNews, self).__init__(
base_path=base_path,
train_file="toutiao_category_train.txt",
dev_file="toutiao_category_dev.txt",
test_file="toutiao_category_test.txt",
label_file=None,
label_list=label_list,
)
def get_label_name(self, id):
return LABEL_NAME[id]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_file(self, input_file):
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as file:
examples = []
......@@ -120,5 +83,13 @@ class TNews(HubDataset):
if __name__ == "__main__":
ds = TNews()
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
......@@ -17,73 +17,39 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import codecs
import os
import pandas as pd
from numpy import nan
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(HubDataset):
class Toxic(BaseNLPDatast):
"""
The Kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
"""
def __init__(self):
self.dataset_dir = os.path.join(DATA_HOME, "toxic")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, "train.csv")
self.train_examples = self._read_csv(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, "dev.csv")
self.dev_examples = self._read_csv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, "test.csv")
self.test_examples = self._read_csv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
return [
dataset_dir = os.path.join(DATA_HOME, "toxic")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
'identity_hate'
]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_csv(self, input_file, quotechar=None):
super(Toxic, self).__init__(
base_path=base_path,
train_file="train.csv",
dev_file="dev.csv",
test_file="test.csv",
label_file=None,
label_list=label_list,
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
data = pd.read_csv(input_file, encoding="UTF-8")
examples = []
......@@ -99,5 +65,13 @@ class Toxic(HubDataset):
if __name__ == "__main__":
ds = Toxic()
for e in ds.get_train_examples():
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
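Because each Toxic example carries six independent labels, the dataset is meant to feed the multi-label reader rather than ClassifyReader. A minimal pairing sketch, assuming an English BERT-style module provides the vocabulary file (module name and max_seq_len are illustrative):

# Sketch only: pairing the refactored Toxic dataset with MultiLabelClassifyReader.
import paddlehub as hub

module = hub.Module(name="ernie_v2_eng_base")   # assumed English BERT-style module
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)
train_batches = reader.data_generator(batch_size=32, phase="train", shuffle=True)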
......@@ -23,15 +23,14 @@ import io
import os
import csv
from paddlehub.dataset import InputExample, HubDataset
from paddlehub.common.downloader import default_downloader
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(HubDataset):
class XNLI(BaseNLPDatast):
"""
Please refer to
https://arxiv.org/pdf/1809.05053.pdf
......@@ -43,61 +42,25 @@ class XNLI(HubDataset):
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh"
]:
raise Exception(language +
"is not in XNLI. Please confirm the language")
raise Exception(
"%s is not in XNLI. Please confirm the language" % language)
self.language = language
self.dataset_dir = os.path.join(DATA_HOME, "XNLI-lan")
if not os.path.exists(self.dataset_dir):
ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
else:
logger.info("Dataset {} already cached.".format(self.dataset_dir))
self._load_train_examples()
self._load_test_examples()
self._load_dev_examples()
def _load_train_examples(self):
self.train_file = os.path.join(self.dataset_dir, self.language,
self.language + "_train.tsv")
self.train_examples = self._read_tsv(self.train_file)
def _load_dev_examples(self):
self.dev_file = os.path.join(self.dataset_dir, self.language,
self.language + "_dev.tsv")
self.dev_examples = self._read_tsv(self.dev_file)
def _load_test_examples(self):
self.test_file = os.path.join(self.dataset_dir, self.language,
self.language + "_test.tsv")
self.test_examples = self._read_tsv(self.test_file)
def get_train_examples(self):
return self.train_examples
def get_dev_examples(self):
return self.dev_examples
def get_test_examples(self):
return self.test_examples
def get_labels(self):
"""See base class."""
return ["neutral", "contradiction", "entailment"]
@property
def num_labels(self):
"""
Return the number of labels in the dataset.
"""
return len(self.get_labels())
def _read_tsv(self, input_file, quotechar=None):
dataset_dir = os.path.join(DATA_HOME, "XNLI-lan")
dataset_dir = self._download_dataset(dataset_dir, url=_DATA_URL)
base_path = os.path.join(dataset_dir, language)
super(XNLI, self).__init__(
base_path=base_path,
train_file="%s_train.tsv" % language,
dev_file="%s_dev.tsv" % language,
test_file="%s_test.tsv" % language,
label_file=None,
label_list=["neutral", "contradiction", "entailment"],
)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="UTF-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
reader = csv.reader(f, delimiter="\t", quotechar=None)
examples = []
seq_id = 0
header = next(reader) # skip header
......@@ -112,5 +75,13 @@ class XNLI(HubDataset):
if __name__ == "__main__":
ds = XNLI()
for e in ds.get_train_examples()[:3]:
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
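Since XNLI now derives its base_path from the requested language, a quick check of one language split might look like the following (assuming the constructor accepts one of the fifteen language codes listed above):

# Sketch only: loading the Chinese portion of XNLI.
ds = XNLI(language="zh")
for e in ds.get_dev_examples()[:3]:
    print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))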
......@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .basic_task import BasicTask, RunEnv, RunState
from .base_task import BaseTask, RunEnv, RunState
from .classifier_task import ClassifierTask, ImageClassifierTask, TextClassifierTask, MultiLabelClassifierTask
from .reading_comprehension_task import ReadingComprehensionTask
from .regression_task import RegressionTask
......
......@@ -192,7 +192,7 @@ class TaskHooks():
return self.info(only_customized=False)
class BasicTask(object):
class BaseTask(object):
def __init__(self,
feed_list,
data_reader,
......@@ -265,7 +265,7 @@ class BasicTask(object):
for hook_type, event_hooks in self._hooks._registered_hooks.items():
self._hooks.add(hook_type, "default",
eval("self._default_%s_event" % hook_type))
setattr(BasicTask, "_%s_event" % hook_type,
setattr(BaseTask, "_%s_event" % hook_type,
self.create_event_function(hook_type))
# accelerate predict
......
......@@ -23,10 +23,10 @@ import numpy as np
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from .basic_task import BasicTask
from .base_task import BaseTask
class ClassifierTask(BasicTask):
class ClassifierTask(BaseTask):
def __init__(self,
feature,
num_classes,
......
......@@ -28,7 +28,7 @@ from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from .basic_task import BasicTask
from .base_task import BaseTask
from paddlehub.common.logger import logger
from paddlehub.reader import tokenization
from paddlehub.finetune.evaluator import squad1_evaluate
......@@ -176,6 +176,13 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
is_english):
_PrelimPrediction = collections.namedtuple("PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit", "end_logit"
])
_NbestPrediction = collections.namedtuple(
"NbestPrediction", ["text", "start_logit", "end_logit"])
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
......@@ -184,10 +191,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple("PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit", "end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
......@@ -262,9 +265,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
if not prelim_predictions:
......@@ -384,7 +384,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
+ "\n")
class ReadingComprehensionTask(BasicTask):
class ReadingComprehensionTask(BaseTask):
def __init__(self,
feature,
feed_list,
......@@ -420,6 +420,9 @@ class ReadingComprehensionTask(BasicTask):
self.n_best_size = n_best_size
self.max_answer_length = max_answer_length
self.RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
def _build_net(self):
self.unique_ids = fluid.layers.data(
name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
......@@ -493,8 +496,6 @@ class ReadingComprehensionTask(BasicTask):
def _calculate_metrics(self, run_states):
total_cost, total_num_seqs, all_results = [], [], []
run_step = 0
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
for run_state in run_states:
np_loss = run_state.run_results[0]
np_num_seqs = run_state.run_results[1]
......@@ -510,7 +511,7 @@ class ReadingComprehensionTask(BasicTask):
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
self.RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
......@@ -544,13 +545,13 @@ class ReadingComprehensionTask(BasicTask):
is_english=self.is_english)
if self.phase == 'val' or self.phase == 'dev':
with open(
self.data_reader.dataset.dev_file, 'r',
self.data_reader.dataset.dev_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
elif self.phase == 'test':
with open(
self.data_reader.dataset.test_file, 'r',
self.data_reader.dataset.test_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
......@@ -577,8 +578,6 @@ class ReadingComprehensionTask(BasicTask):
def _default_predict_end_event(self, run_states):
all_results = []
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
for run_state in run_states:
np_unique_ids = run_state.run_results[0]
np_start_logits = run_state.run_results[1]
......@@ -588,7 +587,7 @@ class ReadingComprehensionTask(BasicTask):
start_logits = [float(x) for x in np_start_logits[idx].flat]
end_logits = [float(x) for x in np_end_logits[idx].flat]
all_results.append(
RawResult(
self.RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
......
......@@ -23,10 +23,10 @@ from collections import OrderedDict
import numpy as np
import paddle.fluid as fluid
from scipy.stats import spearmanr
from .basic_task import BasicTask
from .base_task import BaseTask
class RegressionTask(BasicTask):
class RegressionTask(BaseTask):
def __init__(self,
feature,
feed_list,
......
......@@ -25,10 +25,10 @@ import paddle
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.common.utils import version_compare
from .basic_task import BasicTask
from .base_task import BaseTask
class SequenceLabelTask(BasicTask):
class SequenceLabelTask(BaseTask):
def __init__(self,
feature,
max_seq_len,
......
import numpy as np
class BaseReader(object):
def __init__(self, dataset, random_seed=None):
self.dataset = dataset
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
np.random.seed(random_seed)
def get_train_examples(self):
return self.dataset.get_train_examples()
def get_dev_examples(self):
return self.dataset.get_dev_examples()
def get_test_examples(self):
return self.dataset.get_test_examples()
def data_generator(self):
raise NotImplementedError
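The new base_reader module gives the NLP and CV readers a shared place for dataset access and example counting. As a rough illustration only (not part of this commit), the smallest useful subclass just implements data_generator on top of the inherited helpers:

# Illustrative only: a toy BaseReader subclass.
class ToyReader(BaseReader):
    def data_generator(self, batch_size=1, phase="train", shuffle=False,
                       data=None):
        phase_key = "dev" if phase == "val" else phase
        examples = {
            "train": self.get_train_examples,
            "dev": self.get_dev_examples,
            "test": self.get_test_examples,
        }[phase_key]()
        self.num_examples[phase_key] = len(examples)

        def _reader():
            # batching is omitted here; real readers wrap this with paddle.batch
            if shuffle:
                order = np.random.permutation(len(examples))
            else:
                order = range(len(examples))
            for i in order:
                yield (examples[i].text_a, examples[i].label)

        return _reader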
......@@ -22,6 +22,7 @@ import numpy as np
from PIL import Image
import paddlehub.io.augmentation as image_augmentation
from .base_reader import BaseReader
channel_order_dict = {
"RGB": [0, 1, 2],
......@@ -33,7 +34,7 @@ channel_order_dict = {
}
class ImageClassificationReader(object):
class ImageClassificationReader(BaseReader):
def __init__(self,
image_width,
image_height,
......@@ -41,15 +42,15 @@ class ImageClassificationReader(object):
channel_order="RGB",
images_mean=None,
images_std=None,
data_augmentation=False):
data_augmentation=False,
random_seed=None):
super(ImageClassificationReader, self).__init__(dataset, random_seed)
self.image_width = image_width
self.image_height = image_height
self.channel_order = channel_order
self.dataset = dataset
self.data_augmentation = data_augmentation
self.images_std = images_std
self.images_mean = images_mean
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
if self.images_mean is None:
try:
......@@ -73,24 +74,38 @@ class ImageClassificationReader(object):
raise ValueError("Image width and height should not be negative.")
def data_generator(self,
batch_size,
batch_size=1,
phase="train",
shuffle=False,
data=None):
if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is none and it's not allowed!")
if phase == "train":
data = self.dataset.train_data(shuffle)
self.num_examples['train'] = len(self.get_train_examples())
elif phase == "test":
shuffle = False
data = self.dataset.test_data(shuffle)
self.num_examples['test'] = len(self.get_test_examples())
shuffle = True
if hasattr(self.dataset, "train_data"):
# Compatible with ImageClassificationDataset, which has already shuffled the data
self.dataset.train_data()
shuffle = False
data = self.get_train_examples()
self.num_examples['train'] = len(data)
elif phase == "val" or phase == "dev":
shuffle = False
data = self.dataset.validate_data(shuffle)
self.num_examples['dev'] = len(self.get_dev_examples())
if hasattr(self.dataset, "validate_data"):
# Compatible with ImageClassificationDataset
self.dataset.validate_data()
shuffle = False
data = self.get_dev_examples()
self.num_examples['dev'] = len(data)
elif phase == "test":
shuffle = False
if hasattr(self.dataset, "test_data"):
# Compatible with ImageClassificationDataset
data = self.dataset.test_data()
shuffle = False
data = self.get_test_examples()
self.num_examples['test'] = len(data)
elif phase == "predict":
shuffle = False
data = data
def preprocess(image_path):
......@@ -118,6 +133,9 @@ class ImageClassificationReader(object):
return image
def _data_reader():
if shuffle:
np.random.shuffle(data)
if phase == "predict":
for image_path in data:
image = preprocess(image_path)
......@@ -128,12 +146,3 @@ class ImageClassificationReader(object):
yield (image, label)
return paddle.batch(_data_reader, batch_size=batch_size)
def get_train_examples(self):
return self.dataset.train_examples
def get_dev_examples(self):
return self.dataset.dev_examples
def get_test_examples(self):
return self.dataset.test_examples
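For reference, the refactored reader is normally paired with a BaseCVDataset-style dataset rather than the deprecated ImageClassificationDataset. A rough wiring sketch follows; the base_path, list files, image size and module path are assumptions for illustration.

# Sketch only: a BaseCVDataset-style dataset fed into ImageClassificationReader.
# Each *_list.txt file is assumed to contain "<image path> <label>" per line.
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast  # assumed module path

dataset = BaseCVDatast(
    base_path="/path/to/my_images",
    train_list_file="train_list.txt",
    validate_list_file="validate_list.txt",
    test_list_file="test_list.txt",
    label_list_file="label_list.txt")

reader = hub.reader.ImageClassificationReader(
    image_width=224,
    image_height=224,
    dataset=dataset,
    data_augmentation=False,
    random_seed=0)

train_batches = reader.data_generator(batch_size=32, phase="train", shuffle=True)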
......@@ -18,11 +18,8 @@ from __future__ import division
from __future__ import print_function
import collections
import json
import numpy as np
import platform
import six
import sys
from collections import namedtuple
import paddle
......@@ -31,11 +28,12 @@ from paddlehub.reader import tokenization
from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdout_encoding
from paddlehub.dataset.dataset import InputExample
from .batching import pad_batch_data, prepare_batch_data
from .batching import pad_batch_data
import paddlehub as hub
from .base_reader import BaseReader
class BaseReader(object):
class BaseNLPReader(BaseReader):
def __init__(self,
vocab_path,
dataset=None,
......@@ -47,6 +45,7 @@ class BaseReader(object):
sp_model_path=None,
word_dict_path=None,
in_tokens=False):
super(BaseNLPReader, self).__init__(dataset, random_seed)
self.max_seq_len = max_seq_len
if sp_model_path and word_dict_path:
self.tokenizer = tokenization.WSSPTokenizer(
......@@ -55,52 +54,35 @@ class BaseReader(object):
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.dataset = dataset
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.in_tokens = in_tokens
self.use_task_id = use_task_id
if self.use_task_id:
logger.warning(
"use_task_id has been de discarded since PaddleHub v1.4.0")
self.task_id = 0
np.random.seed(random_seed)
# generate label map
self.label_map = {}
if self.dataset:
try:
for index, label in enumerate(self.dataset.get_labels()):
self.label_map[label] = index
logger.info("Dataset label map = {}".format(self.label_map))
else:
logger.info("Dataset is None! label map = {}".format(
self.label_map))
self.current_example = 0
self.current_epoch = 0
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def get_train_examples(self):
"""Gets a collection of `InputExample`s for the train set."""
return self.dataset.get_train_examples()
def get_dev_examples(self):
"""Gets a collection of `InputExample`s for the dev set."""
return self.dataset.get_dev_examples()
except:
# some datasets, such as SQuAD, have label_list=None
logger.info(
"Dataset is None or it has not any labels, label map = {}".
format(self.label_map))
def get_val_examples(self):
"""Gets a collection of `InputExample`s for the val set."""
return self.dataset.get_val_examples()
def get_test_examples(self):
"""Gets a collection of `InputExample`s for prediction."""
return self.dataset.get_test_examples()
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
self.Record_With_Label_Id = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
self.Record_Wo_Label_Id = namedtuple(
'Record', ['token_ids', 'text_type_ids', 'position_ids'])
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
......@@ -189,24 +171,14 @@ class BaseReader(object):
else:
label_id = example.label
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
if phase != "predict":
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
record = Record(
record = self.Record_With_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_id=label_id)
else:
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids'])
record = Record(
record = self.Record_Wo_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids)
......@@ -238,14 +210,6 @@ class BaseReader(object):
if batch_records:
yield self._pad_batch_records(batch_records, phase)
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'val', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return self.num_examples[phase]
def data_generator(self,
batch_size=1,
phase='train',
......@@ -307,7 +271,7 @@ class BaseReader(object):
return wrapper
class ClassifyReader(BaseReader):
class ClassifyReader(BaseNLPReader):
def _pad_batch_records(self, batch_records, phase=None):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
......@@ -360,7 +324,7 @@ class ClassifyReader(BaseReader):
return return_list
class SequenceLabelReader(BaseReader):
class SequenceLabelReader(BaseNLPReader):
def __init__(self,
vocab_path,
dataset=None,
......@@ -508,15 +472,11 @@ class SequenceLabelReader(BaseReader):
label_ids = [no_entity_id
] + [self.label_map[label]
for label in labels] + [no_entity_id]
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
record = self.Record_With_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
label_id=label_ids)
else:
tokens = self._reseg_token_label(
tokens=tokens, tokenizer=tokenizer, phase=phase)
......@@ -529,9 +489,7 @@ class SequenceLabelReader(BaseReader):
position_ids = list(range(len(token_ids)))
text_type_ids = [0] * len(token_ids)
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids'])
record = Record(
record = self.Record_Wo_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
......@@ -540,109 +498,7 @@ class SequenceLabelReader(BaseReader):
return record
class LACClassifyReader(object):
def __init__(self, vocab_path, dataset=None, in_tokens=False):
self.dataset = dataset
self.lac = hub.Module(name="lac")
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=False)
self.vocab = self.tokenizer.vocab
self.feed_key = list(
self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0]
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.in_tokens = in_tokens
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'val', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return self.num_examples[phase]
def get_train_examples(self):
"""Gets a collection of `InputExample`s for the train set."""
return self.dataset.get_train_examples()
def get_dev_examples(self):
"""Gets a collection of `InputExample`s for the dev set."""
return self.dataset.get_dev_examples()
def get_val_examples(self):
"""Gets a collection of `InputExample`s for the val set."""
return self.dataset.get_val_examples()
def get_test_examples(self):
"""Gets a collection of `InputExample`s for prediction."""
return self.dataset.get_test_examples()
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_example, self.current_epoch
def data_generator(self,
batch_size=1,
phase="train",
shuffle=False,
data=None):
if phase != "predict" and not self.dataset:
raise ValueError("The dataset is None and it isn't allowed.")
if phase == "train":
shuffle = True
data = self.dataset.get_train_examples()
self.num_examples['train'] = len(data)
elif phase == "test":
shuffle = False
data = self.dataset.get_test_examples()
self.num_examples['test'] = len(data)
elif phase == "val" or phase == "dev":
shuffle = False
data = self.dataset.get_dev_examples()
self.num_examples['dev'] = len(data)
elif phase == "predict":
data = data
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def preprocess(text):
data_dict = {self.feed_key: [text]}
processed = self.lac.lexical_analysis(data=data_dict)
processed = [
self.vocab[word] for word in processed[0]['word']
if word in self.vocab
]
if len(processed) == 0:
if six.PY2:
text = text.encode(sys_stdout_encoding())
logger.warning(
"The words in text %s can't be found in the vocabulary." %
(text))
return processed
def _data_reader():
if shuffle:
np.random.shuffle(data)
if phase == "predict":
for text in data:
text = preprocess(text)
if not text:
continue
yield (text, )
else:
for item in data:
text = preprocess(item.text_a)
if not text:
continue
yield (text, item.label)
return paddle.batch(_data_reader, batch_size=batch_size)
class MultiLabelClassifyReader(BaseReader):
class MultiLabelClassifyReader(BaseNLPReader):
def _pad_batch_records(self, batch_records, phase=None):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
......@@ -749,19 +605,13 @@ class MultiLabelClassifyReader(BaseReader):
label_ids.append(int(label))
if phase != "predict":
Record = namedtuple(
'Record',
['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
record = Record(
record = self.Record_With_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids,
label_ids=label_ids)
label_id=label_ids)
else:
Record = namedtuple('Record',
['token_ids', 'text_type_ids', 'position_ids'])
record = Record(
record = self.Record_Wo_Label_Id(
token_ids=token_ids,
text_type_ids=text_type_ids,
position_ids=position_ids)
......@@ -769,39 +619,7 @@ class MultiLabelClassifyReader(BaseReader):
return record
class RegressionReader(BaseReader):
def __init__(self,
vocab_path,
dataset=None,
label_map_config=None,
max_seq_len=128,
do_lower_case=True,
random_seed=None,
use_task_id=False):
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.dataset = dataset
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.in_tokens = False
self.use_task_id = use_task_id
if self.use_task_id:
self.task_id = 0
np.random.seed(random_seed)
# generate label map
self.label_map = {} # Unlike BaseReader, it's not filled
self.current_example = 0
self.current_epoch = 0
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
class RegressionReader(BaseNLPReader):
def _pad_batch_records(self, batch_records, phase=None):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
......@@ -881,7 +699,7 @@ class RegressionReader(BaseReader):
for item in data:
# set label in order to run the program
label = -1 # different from BaseReader
label = -1 # different from BaseNLPReader
if len(item) == 1:
item_i = InputExample(
guid=seq_id, text_a=item[0], label=label)
......@@ -956,7 +774,7 @@ class Features(object):
return s
class ReadingComprehensionReader(BaseReader):
class ReadingComprehensionReader(BaseNLPReader):
def __init__(self,
dataset,
vocab_path,
......@@ -965,33 +783,30 @@ class ReadingComprehensionReader(BaseReader):
doc_stride=128,
max_query_length=64,
random_seed=None,
use_task_id=False):
self.dataset = dataset
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.max_seq_len = max_seq_len
use_task_id=False,
sp_model_path=None,
word_dict_path=None,
in_tokens=False):
super(ReadingComprehensionReader, self).__init__(
vocab_path=vocab_path,
dataset=dataset,
label_map_config=None,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
random_seed=random_seed,
use_task_id=use_task_id,
sp_model_path=sp_model_path,
word_dict_path=word_dict_path,
in_tokens=in_tokens)
self.doc_stride = doc_stride
self.max_query_length = max_query_length
self.use_task_id = use_task_id
self.in_tokens = False
self._DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
# self.all_examples[phase] and self.all_features[phase] will be used
# in write_prediction in reading_comprehension_task
self.all_features = {"train": [], "dev": [], "test": [], "predict": []}
self.all_examples = {"train": [], "dev": [], "test": [], "predict": []}
np.random.seed(random_seed)
self.vocab = self.tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = 0
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def _pad_batch_records(self, batch_records, phase):
batch_token_ids = [record.token_ids for record in batch_records]
batch_text_type_ids = [record.text_type_ids for record in batch_records]
......@@ -1175,14 +990,14 @@ class ReadingComprehensionReader(BaseReader):
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
doc_spans.append(
self._DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, self.doc_stride)
......@@ -1331,5 +1146,78 @@ class ReadingComprehensionReader(BaseReader):
return cur_span_index == best_span_index
class LACClassifyReader(BaseReader):
def __init__(self, vocab_path, dataset=None, in_tokens=False):
super(LACClassifyReader, self).__init__(dataset)
self.in_tokens = in_tokens
self.lac = hub.Module(name="lac")
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=False)
self.vocab = self.tokenizer.vocab
self.feed_key = list(
self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0]
def data_generator(self,
batch_size=1,
phase="train",
shuffle=False,
data=None):
if phase != "predict" and not self.dataset:
raise ValueError("The dataset is None and it isn't allowed.")
if phase == "train":
shuffle = True
data = self.dataset.get_train_examples()
self.num_examples['train'] = len(data)
elif phase == "test":
shuffle = False
data = self.dataset.get_test_examples()
self.num_examples['test'] = len(data)
elif phase == "val" or phase == "dev":
shuffle = False
data = self.dataset.get_dev_examples()
self.num_examples['dev'] = len(data)
elif phase == "predict":
data = data
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def preprocess(text):
data_dict = {self.feed_key: [text]}
processed = self.lac.lexical_analysis(data=data_dict)
processed = [
self.vocab[word] for word in processed[0]['word']
if word in self.vocab
]
if len(processed) == 0:
if six.PY2:
text = text.encode(sys_stdout_encoding())
logger.warning(
"The words in text %s can't be found in the vocabulary." %
(text))
return processed
def _data_reader():
if shuffle:
np.random.shuffle(data)
if phase == "predict":
for text in data:
text = preprocess(text)
if not text:
continue
yield (text, )
else:
for item in data:
text = preprocess(item.text_a)
if not text:
continue
yield (text, item.label)
return paddle.batch(_data_reader, batch_size=batch_size)
if __name__ == '__main__':
pass
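Taken together, the refactored dataset, reader and task classes slot into the usual fine-tune workflow. The sketch below is a rough end-to-end illustration only, assuming an ERNIE module and the ChnSentiCorp dataset; the hyperparameters are placeholders.

# Sketch only: dataset -> reader -> task wiring with the refactored classes.
import paddlehub as hub

module = hub.Module(name="ernie")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)

config = hub.RunConfig(
    use_cuda=False,
    num_epoch=1,
    batch_size=32,
    strategy=hub.AdamWeightDecayStrategy())

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=outputs["pooled_output"],
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)

cls_task.finetune_and_eval()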