Commit 90bbb041 authored by kinghuin, committed by wuzewu

fix typo (#316)

* fix typo

* enhance predict dataset
Parent 39643114
@@ -26,7 +26,7 @@ from paddlehub.common.downloader import default_downloader
 from paddlehub.common.logger import logger
 
 
-class BaseCVDatast(BaseDataset):
+class BaseCVDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_list_file=None,
@@ -35,7 +35,7 @@ class BaseCVDatast(BaseDataset):
                  predict_list_file=None,
                  label_list_file=None,
                  label_list=None):
-        super(BaseCVDatast, self).__init__(
+        super(BaseCVDataset, self).__init__(
             base_path=base_path,
             train_file=train_list_file,
             dev_file=validate_list_file,
@@ -65,7 +65,7 @@ class BaseCVDatast(BaseDataset):
         return data
 
 
-# discarded. please use BaseCVDatast
+# discarded. please use BaseCVDataset
 class ImageClassificationDataset(object):
     def __init__(self):
         logger.warning(
...
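A brief usage sketch of the renamed class (not part of the diff; the subclass name, base path, and list-file names below are hypothetical):

from paddlehub.dataset.base_cv_dataset import BaseCVDataset

class MyImageDataset(BaseCVDataset):  # hypothetical subclass for illustration
    def __init__(self):
        super(MyImageDataset, self).__init__(
            base_path="/tmp/my_images",              # assumed dataset root
            train_list_file="train_list.txt",        # e.g. "<image path> <label index>" per line
            validate_list_file="validate_list.txt",  # assumed file names
            label_list_file="label_list.txt")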
@@ -21,9 +21,10 @@ import io
 import csv
 
 from paddlehub.dataset import InputExample, BaseDataset
+from paddlehub.common.logger import logger
 
 
-class BaseNLPDatast(BaseDataset):
+class BaseNLPDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_file=None,
@@ -32,11 +33,11 @@ class BaseNLPDatast(BaseDataset):
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
-        super(BaseNLPDatast, self).__init__(
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
+        super(BaseNLPDataset, self).__init__(
             base_path=base_path,
             train_file=train_file,
             dev_file=dev_file,
@@ -44,37 +45,54 @@ class BaseNLPDatast(BaseDataset):
             predict_file=predict_file,
             label_file=label_file,
             label_list=label_list,
-            train_file_with_head=train_file_with_head,
-            dev_file_with_head=dev_file_with_head,
-            test_file_with_head=test_file_with_head,
-            predict_file_with_head=predict_file_with_head)
+            train_file_with_header=train_file_with_header,
+            dev_file_with_header=dev_file_with_header,
+            test_file_with_header=test_file_with_header,
+            predict_file_with_header=predict_file_with_header)
 
     def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
+        has_warned = False
         with io.open(input_file, "r", encoding="UTF-8") as file:
             reader = csv.reader(file, delimiter="\t", quotechar=None)
             examples = []
             for (i, line) in enumerate(reader):
                 if i == 0:
                     ncol = len(line)
-                    if self.if_file_with_head[phase]:
+                    if self.if_file_with_header[phase]:
                         continue
-                if ncol == 1:
-                    if phase != "predict":
+                if phase != "predict":
+                    if ncol == 1:
                         raise Exception(
                             "the %s file: %s only has one column but it is not a predict file"
                             % (phase, input_file))
-                    example = InputExample(guid=i, text_a=line[0])
-                elif ncol == 2:
-                    example = InputExample(
-                        guid=i, text_a=line[0], label=line[1])
-                elif ncol == 3:
-                    example = InputExample(
-                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
-                else:
-                    raise Exception(
-                        "the %s file: %s has too many columns (should <=3)" %
-                        (phase, input_file))
+                    elif ncol == 2:
+                        example = InputExample(
+                            guid=i, text_a=line[0], label=line[1])
+                    elif ncol == 3:
+                        example = InputExample(
+                            guid=i,
+                            text_a=line[0],
+                            text_b=line[1],
+                            label=line[2])
+                    else:
+                        raise Exception(
+                            "the %s file: %s has too many columns (should <=3)"
+                            % (phase, input_file))
+                else:
+                    if ncol == 1:
+                        example = InputExample(guid=i, text_a=line[0])
+                    elif ncol == 2:
+                        if not has_warned:
+                            logger.warning(
+                                "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
+                                % (input_file))
+                            has_warned = True
+                        example = InputExample(
+                            guid=i, text_a=line[0], text_b=line[1])
+                    else:
+                        raise Exception(
+                            "the predict file: %s has too many columns (should <=2)"
+                            % (input_file))
                 examples.append(example)
         return examples
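For context, a hedged sketch of how the reworked predict branch behaves (the subclass, base path, and file names below are hypothetical, not from this commit): a one-column predict file yields text_a only; a two-column predict file is now accepted, with the second column read as text_b and a warning logged once per file instead of raising; three or more columns still raise.

from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

class PairPredictDataset(BaseNLPDataset):  # hypothetical subclass
    def __init__(self):
        super(PairPredictDataset, self).__init__(
            base_path="/tmp/my_nlp_data",    # assumed dataset root
            train_file="train.tsv",          # text_a<TAB>label
            predict_file="predict.tsv",      # text_a<TAB>text_b, no label column
            label_list=["0", "1"],
            train_file_with_header=True,
            predict_file_with_header=True)

dataset = PairPredictDataset()
for example in dataset.predict_examples:
    # text_b is filled from the second column; the warning fires only once.
    print(example.text_a, example.text_b)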
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 
-class BQ(BaseNLPDatast):
+class BQ(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "bq")
         base_path = self._download_dataset(
...
@@ -23,10 +23,10 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 
-class ChnSentiCorp(BaseNLPDatast):
+class ChnSentiCorp(BaseNLPDataset):
     """
     ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
     opinion mining)
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ class CMRC2018Example(object):
         return s
 
 
-class CMRC2018(BaseNLPDatast):
+class CMRC2018(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self):
...
@@ -64,10 +64,10 @@ class BaseDataset(object):
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
         if not (train_file or dev_file or test_file):
             raise ValueError("At least one file should be assigned")
         self.base_path = base_path
@@ -83,11 +83,11 @@ class BaseDataset(object):
         self.test_examples = []
         self.predict_examples = []
 
-        self.if_file_with_head = {
-            "train": train_file_with_head,
-            "dev": dev_file_with_head,
-            "test": test_file_with_head,
-            "predict": predict_file_with_head
+        self.if_file_with_header = {
+            "train": train_file_with_header,
+            "dev": dev_file_with_header,
+            "test": test_file_with_header,
+            "predict": predict_file_with_header
         }
 
         if train_file:
...
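As an illustration of the renamed flags (a minimal sketch; the True/False values are assumptions): the four *_file_with_header constructor arguments are collected into a phase-keyed dict, which _read_file(input_file, phase) consults on row 0 to decide whether to skip a header line.

# Sketch of the resulting mapping inside BaseDataset:
if_file_with_header = {
    "train": True,     # assumed: train.tsv carries a header row
    "dev": False,
    "test": False,
    "predict": True,   # assumed: predict.tsv carries a header row
}
# _read_file skips the first row of any file whose phase maps to True.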
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class DogCatDataset(BaseCVDatast):
+class DogCatDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
         base_path = self._download_dataset(
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ class DRCDExample(object):
         return s
 
 
-class DRCD(BaseNLPDatast):
+class DRCD(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self):
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class FlowersDataset(BaseCVDatast):
+class FlowersDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
         base_path = self._download_dataset(
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class Food101Dataset(BaseCVDatast):
+class Food101Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
                                     "images")
...
@@ -24,12 +24,12 @@ import io
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.logger import logger
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"
 
 
-class GLUE(BaseNLPDatast):
+class GLUE(BaseNLPDataset):
     """
     Please refer to
     https://gluebenchmark.com
...
@@ -22,12 +22,12 @@ import os
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
 
 
-class IFLYTEK(BaseNLPDatast):
+class IFLYTEK(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "iflytek")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class Indoor67Dataset(BaseCVDatast):
+class Indoor67Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
         base_path = self._download_dataset(
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
 
 
-class INews(BaseNLPDatast):
+class INews(BaseNLPDataset):
     """
     INews is a sentiment analysis dataset for Internet News
     """
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
 
 
-class LCQMC(BaseNLPDatast):
+class LCQMC(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "lcqmc")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
 
 
-class MSRA_NER(BaseNLPDatast):
+class MSRA_NER(BaseNLPDataset):
     """
     A set of manually annotated Chinese word-segmentation data and
     specifications for training and testing a Chinese word-segmentation system
...
@@ -23,12 +23,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
 
 
-class NLPCC_DBQA(BaseNLPDatast):
+class NLPCC_DBQA(BaseNLPDataset):
     """
     Please refer to
     http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
...
@@ -20,7 +20,7 @@ import os
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
@@ -65,7 +65,7 @@ class SquadExample(object):
         return s
 
 
-class SQUAD(BaseNLPDatast):
+class SQUAD(BaseNLPDataset):
     """A single set of features of data."""
 
     def __init__(self, version_2_with_negative=False):
...
@@ -20,10 +20,10 @@ from __future__ import print_function
 import os
 
 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset
 
 
-class StanfordDogsDataset(BaseCVDatast):
+class StanfordDogsDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME,
                                     "StanfordDogs-120")
...
@@ -22,12 +22,12 @@ import os
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
 
 
-class THUCNEWS(BaseNLPDatast):
+class THUCNEWS(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "thucnews")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
...
@@ -22,12 +22,12 @@ import pandas as pd
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
 
 
-class Toxic(BaseNLPDatast):
+class Toxic(BaseNLPDataset):
     """
     The kaggle Toxic dataset:
     https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
...
@@ -25,12 +25,12 @@ import csv
 
 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
 
 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
 
 
-class XNLI(BaseNLPDatast):
+class XNLI(BaseNLPDataset):
     """
     Please refer to
     https://arxiv.org/pdf/1809.05053.pdf
...
@@ -142,7 +142,7 @@ class ClassifierTask(BaseTask):
             }
         except:
             raise Exception(
-                "ImageClassificationDataset does not support postprocessing, please use BaseCVDatast instead"
+                "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
             )
 
         results = []
         for batch_state in run_states:
...