Commit 90bbb041 authored by kinghuin, committed by wuzewu

fix typo (#316)

* fix typo

* enhance predict dataset
Parent 39643114
@@ -26,7 +26,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.logger import logger
 
 
-class BaseCVDatast(BaseDataset):
+class BaseCVDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_list_file=None,
@@ -35,7 +35,7 @@ class BaseCVDatast(BaseDataset):
                  predict_list_file=None,
                  label_list_file=None,
                  label_list=None):
-        super(BaseCVDatast, self).__init__(
+        super(BaseCVDataset, self).__init__(
             base_path=base_path,
             train_file=train_list_file,
             dev_file=validate_list_file,
@@ -65,7 +65,7 @@ class BaseCVDatast(BaseDataset):
         return data
 
 
-# discarded. please use BaseCVDatast
+# discarded. please use BaseCVDataset
 class ImageClassificationDataset(object):
     def __init__(self):
         logger.warning(
...
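A brief usage sketch of the renamed class (not part of the diff; the subclass name, base path, and list-file names below are hypothetical):

from paddlehub.dataset.base_cv_dataset import BaseCVDataset

class MyImageDataset(BaseCVDataset):  # hypothetical subclass for illustration
    def __init__(self):
        super(MyImageDataset, self).__init__(
            base_path="/tmp/my_images",              # assumed dataset root
            train_list_file="train_list.txt",        # e.g. "<image path> <label index>" per line
            validate_list_file="validate_list.txt",  # assumed file names
            label_list_file="label_list.txt")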
@@ -21,9 +21,10 @@ import io
 import csv
 
 from paddlehub.dataset import InputExample, BaseDataset
+from paddlehub.common.logger import logger
 
 
-class BaseNLPDatast(BaseDataset):
+class BaseNLPDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_file=None,
@@ -32,11 +33,11 @@ class BaseNLPDatast(BaseDataset):
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
-        super(BaseNLPDatast, self).__init__(
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
+        super(BaseNLPDataset, self).__init__(
             base_path=base_path,
             train_file=train_file,
             dev_file=dev_file,
@@ -44,37 +45,54 @@ class BaseNLPDatast(BaseDataset):
             predict_file=predict_file,
             label_file=label_file,
             label_list=label_list,
-            train_file_with_head=train_file_with_head,
-            dev_file_with_head=dev_file_with_head,
-            test_file_with_head=test_file_with_head,
-            predict_file_with_head=predict_file_with_head)
+            train_file_with_header=train_file_with_header,
+            dev_file_with_header=dev_file_with_header,
+            test_file_with_header=test_file_with_header,
+            predict_file_with_header=predict_file_with_header)
 
     def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
+        has_warned = False
         with io.open(input_file, "r", encoding="UTF-8") as file:
             reader = csv.reader(file, delimiter="\t", quotechar=None)
             examples = []
             for (i, line) in enumerate(reader):
                 if i == 0:
                     ncol = len(line)
-                    if self.if_file_with_head[phase]:
+                    if self.if_file_with_header[phase]:
                         continue
-                if ncol == 1:
-                    if phase != "predict":
+                if phase != "predict":
+                    if ncol == 1:
                         raise Exception(
                             "the %s file: %s only has one column but it is not a predict file"
                             % (phase, input_file))
-                    example = InputExample(guid=i, text_a=line[0])
-                elif ncol == 2:
-                    example = InputExample(
-                        guid=i, text_a=line[0], label=line[1])
-                elif ncol == 3:
-                    example = InputExample(
-                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
-                else:
-                    raise Exception(
-                        "the %s file: %s has too many columns (should <=3)" %
-                        (phase, input_file))
+                    elif ncol == 2:
+                        example = InputExample(
+                            guid=i, text_a=line[0], label=line[1])
+                    elif ncol == 3:
+                        example = InputExample(
+                            guid=i,
+                            text_a=line[0],
+                            text_b=line[1],
+                            label=line[2])
+                    else:
+                        raise Exception(
+                            "the %s file: %s has too many columns (should <=3)"
+                            % (phase, input_file))
+                else:
+                    if ncol == 1:
+                        example = InputExample(guid=i, text_a=line[0])
+                    elif ncol == 2:
+                        if not has_warned:
+                            logger.warning(
+                                "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
+                                % (input_file))
+                            has_warned = True
+                        example = InputExample(
+                            guid=i, text_a=line[0], text_b=line[1])
+                    else:
+                        raise Exception(
+                            "the predict file: %s has too many columns (should <=2)"
+                            % (input_file))
                 examples.append(example)
         return examples
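For context, a hedged sketch of how the reworked predict branch behaves (the subclass, base path, and file names below are hypothetical, not from this commit): a one-column predict file yields text_a only; a two-column predict file is now accepted, with the second column read as text_b and a warning logged once per file instead of raising; three or more columns still raise.

from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

class PairPredictDataset(BaseNLPDataset):  # hypothetical subclass
    def __init__(self):
        super(PairPredictDataset, self).__init__(
            base_path="/tmp/my_nlp_data",    # assumed dataset root
            train_file="train.tsv",          # text_a<TAB>label
            predict_file="predict.tsv",      # text_a<TAB>text_b, no label column
            label_list=["0", "1"],
            train_file_with_header=True,
            predict_file_with_header=True)

dataset = PairPredictDataset()
for example in dataset.predict_examples:
    # text_b is filled from the second column; the warning fires only once.
    print(example.text_a, example.text_b)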
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 
-class BQ(BaseNLPDatast):
+class BQ(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "bq")
         base_path = self._download_dataset(
...
@@ -23,10 +23,10 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 
-class ChnSentiCorp(BaseNLPDatast):
+class ChnSentiCorp(BaseNLPDataset):
     """
     ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
     opinion mining)
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ class CMRC2018Example(object):
         return s
 
 
-class CMRC2018(BaseNLPDatast):
+class CMRC2018(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self):
...
@@ -64,10 +64,10 @@ class BaseDataset(object):
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
         if not (train_file or dev_file or test_file):
             raise ValueError("At least one file should be assigned")
         self.base_path = base_path
@@ -83,11 +83,11 @@ class BaseDataset(object):
         self.test_examples = []
         self.predict_examples = []
 
-        self.if_file_with_head = {
-            "train": train_file_with_head,
-            "dev": dev_file_with_head,
-            "test": test_file_with_head,
-            "predict": predict_file_with_head
+        self.if_file_with_header = {
+            "train": train_file_with_header,
+            "dev": dev_file_with_header,
+            "test": test_file_with_header,
+            "predict": predict_file_with_header
         }
 
         if train_file:
...
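As an illustration of the renamed flags (a minimal sketch; the True/False values are assumptions): the four *_file_with_header constructor arguments are collected into a phase-keyed dict, which _read_file(input_file, phase) consults on row 0 to decide whether to skip a header line.

# Sketch of the resulting mapping inside BaseDataset:
if_file_with_header = {
    "train": True,     # assumed: train.tsv carries a header row
    "dev": False,
    "test": False,
    "predict": True,   # assumed: predict.tsv carries a header row
}
# _read_file skips the first row of any file whose phase maps to True.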
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class DogCatDataset(BaseCVDatast):
+class DogCatDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
         base_path = self._download_dataset(
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ class DRCDExample(object):
         return s
 
 
-class DRCD(BaseNLPDatast):
+class DRCD(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self):
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class FlowersDataset(BaseCVDatast):
+class FlowersDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
         base_path = self._download_dataset(
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class Food101Dataset(BaseCVDatast):
+class Food101Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
                                     "images")
...
@@ -24,12 +24,12 @@ import io
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.logger import logger
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"
 
 
-class GLUE(BaseNLPDatast):
+class GLUE(BaseNLPDataset):
     """
     Please refer to
     https://gluebenchmark.com
...
@@ -22,12 +22,12 @@ import os
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
 
 
-class IFLYTEK(BaseNLPDatast):
+class IFLYTEK(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "iflytek")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class Indoor67Dataset(BaseCVDatast):
+class Indoor67Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
         base_path = self._download_dataset(
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
 
 
-class INews(BaseNLPDatast):
+class INews(BaseNLPDataset):
     """
     INews is a sentiment analysis dataset for Internet News
     """
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
 
 
-class LCQMC(BaseNLPDatast):
+class LCQMC(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "lcqmc")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
 
 
-class MSRA_NER(BaseNLPDatast):
+class MSRA_NER(BaseNLPDataset):
     """
     A set of manually annotated Chinese word-segmentation data and
     specifications for training and testing a Chinese word-segmentation system
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
 
 
-class NLPCC_DBQA(BaseNLPDatast):
+class NLPCC_DBQA(BaseNLPDataset):
     """
     Please refer to
     http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
@@ -65,7 +65,7 @@ class SquadExample(object):
         return s
 
 
-class SQUAD(BaseNLPDatast):
+class SQUAD(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self, version_2_with_negative=False):
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class StanfordDogsDataset(BaseCVDatast):
+class StanfordDogsDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME,
                                     "StanfordDogs-120")
...
@@ -22,12 +22,12 @@ import os
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
 
 
-class THUCNEWS(BaseNLPDatast):
+class THUCNEWS(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "thucnews")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -22,12 +22,12 @@ import pandas as pd
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
 
 
-class Toxic(BaseNLPDatast):
+class Toxic(BaseNLPDataset):
     """
     The kaggle Toxic dataset:
     https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
...
@@ -25,12 +25,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
 
 
-class XNLI(BaseNLPDatast):
+class XNLI(BaseNLPDataset):
     """
     Please refer to
     https://arxiv.org/pdf/1809.05053.pdf
...
@@ -142,7 +142,7 @@ class ClassifierTask(BaseTask):
             }
         except:
             raise Exception(
-                "ImageClassificationDataset does not support postprocessing, please use BaseCVDatast instead"
+                "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
             )
 
         results = []
         for batch_state in run_states:
...