提交 90bbb041 编写于 作者: K kinghuin 提交者: wuzewu

fix typo (#316)

* fix typo

* enhance predict dataset
上级 39643114
......@@ -26,7 +26,7 @@ from paddlehub.common.downloader import default_downloader
from paddlehub.common.logger import logger
class BaseCVDatast(BaseDataset):
class BaseCVDataset(BaseDataset):
def __init__(self,
base_path,
train_list_file=None,
......@@ -35,7 +35,7 @@ class BaseCVDatast(BaseDataset):
predict_list_file=None,
label_list_file=None,
label_list=None):
super(BaseCVDatast, self).__init__(
super(BaseCVDataset, self).__init__(
base_path=base_path,
train_file=train_list_file,
dev_file=validate_list_file,
......@@ -65,7 +65,7 @@ class BaseCVDatast(BaseDataset):
return data
# discarded. please use BaseCVDatast
# discarded. please use BaseCVDataset
class ImageClassificationDataset(object):
def __init__(self):
logger.warning(
......
......@@ -21,9 +21,10 @@ import io
import csv
from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.common.logger import logger
class BaseNLPDatast(BaseDataset):
class BaseNLPDataset(BaseDataset):
def __init__(self,
base_path,
train_file=None,
......@@ -32,11 +33,11 @@ class BaseNLPDatast(BaseDataset):
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
super(BaseNLPDatast, self).__init__(
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
super(BaseNLPDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
......@@ -44,37 +45,54 @@ class BaseNLPDatast(BaseDataset):
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_head=train_file_with_head,
dev_file_with_head=dev_file_with_head,
test_file_with_head=test_file_with_head,
predict_file_with_head=predict_file_with_head)
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
has_warned = False
with io.open(input_file, "r", encoding="UTF-8") as file:
reader = csv.reader(file, delimiter="\t", quotechar=None)
examples = []
for (i, line) in enumerate(reader):
if i == 0:
ncol = len(line)
if self.if_file_with_head[phase]:
if self.if_file_with_header[phase]:
continue
if ncol == 1:
if phase != "predict":
example = InputExample(guid=i, text_a=line[0])
else:
if phase != "predict":
if ncol == 1:
raise Exception(
"the %s file: %s only has one column but it is not a predict file"
% (phase, input_file))
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i, text_a=line[0], text_b=line[1], label=line[2])
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i,
text_a=line[0],
text_b=line[1],
label=line[2])
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)"
% (phase, input_file))
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)" %
(phase, input_file))
if ncol == 1:
example = InputExample(guid=i, text_a=line[0])
elif ncol == 2:
if not has_warned:
logger.warning(
"the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
% (input_file))
has_warned = True
example = InputExample(
guid=i, text_a=line[0], text_b=line[1])
else:
raise Exception(
"the predict file: %s has too many columns (should <=2)"
% (input_file))
examples.append(example)
return examples
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
class BQ(BaseNLPDatast):
class BQ(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
......
......@@ -23,10 +23,10 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
class ChnSentiCorp(BaseNLPDatast):
class ChnSentiCorp(BaseNLPDataset):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
......
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,7 +62,7 @@ class CMRC2018Example(object):
return s
class CMRC2018(BaseNLPDatast):
class CMRC2018(BaseNLPDataset):
"""A single set of features of data."""
def __init__(self):
......
......@@ -64,10 +64,10 @@ class BaseDataset(object):
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
if not (train_file or dev_file or test_file):
raise ValueError("At least one file should be assigned")
self.base_path = base_path
......@@ -83,11 +83,11 @@ class BaseDataset(object):
self.test_examples = []
self.predict_examples = []
self.if_file_with_head = {
"train": train_file_with_head,
"dev": dev_file_with_head,
"test": test_file_with_head,
"predict": predict_file_with_head
self.if_file_with_header = {
"train": train_file_with_header,
"dev": dev_file_with_header,
"test": test_file_with_header,
"predict": predict_file_with_header
}
if train_file:
......
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset
class DogCatDataset(BaseCVDatast):
class DogCatDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
base_path = self._download_dataset(
......
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,7 +62,7 @@ class DRCDExample(object):
return s
class DRCD(BaseNLPDatast):
class DRCD(BaseNLPDataset):
"""A single set of features of data."""
def __init__(self):
......
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset
class FlowersDataset(BaseCVDatast):
class FlowersDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
base_path = self._download_dataset(
......
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset
class Food101Dataset(BaseCVDatast):
class Food101Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
"images")
......
......@@ -24,12 +24,12 @@ import io
from paddlehub.dataset import InputExample
from paddlehub.common.logger import logger
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"
class GLUE(BaseNLPDatast):
class GLUE(BaseNLPDataset):
"""
Please refer to
https://gluebenchmark.com
......
......@@ -22,12 +22,12 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(BaseNLPDatast):
class IFLYTEK(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
......
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset
class Indoor67Dataset(BaseCVDatast):
class Indoor67Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
base_path = self._download_dataset(
......
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(BaseNLPDatast):
class INews(BaseNLPDataset):
"""
INews is a sentiment analysis dataset for Internet News
"""
......
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(BaseNLPDatast):
class LCQMC(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
......
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(BaseNLPDatast):
class MSRA_NER(BaseNLPDataset):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
......
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(BaseNLPDatast):
class NLPCC_DBQA(BaseNLPDataset):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
......
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
......@@ -65,7 +65,7 @@ class SquadExample(object):
return s
class SQUAD(BaseNLPDatast):
class SQUAD(BaseNLPDataset):
"""A single set of features of data."""
def __init__(self, version_2_with_negative=False):
......
......@@ -20,10 +20,10 @@ from __future__ import print_function
import os
import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset
class StanfordDogsDataset(BaseCVDatast):
class StanfordDogsDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME,
"StanfordDogs-120")
......
......@@ -22,12 +22,12 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(BaseNLPDatast):
class THUCNEWS(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
......
......@@ -22,12 +22,12 @@ import pandas as pd
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(BaseNLPDatast):
class Toxic(BaseNLPDataset):
"""
The kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
......
......@@ -25,12 +25,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(BaseNLPDatast):
class XNLI(BaseNLPDataset):
"""
Please refer to
https://arxiv.org/pdf/1809.05053.pdf
......
......@@ -142,7 +142,7 @@ class ClassifierTask(BaseTask):
}
except:
raise Exception(
"ImageClassificationDataset does not support postprocessing, please use BaseCVDatast instead"
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
)
results = []
for batch_state in run_states:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册