From ff3bc5b85290a47ae385d9c0679e518163528688 Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Thu, 11 Apr 2019 18:02:31 +0800
Subject: [PATCH] fix typo in ernie sequence labeling task

---
 demo/ernie-seq-labeling/sequence_labeling.py |  3 ++-
 paddlehub/dataset/chnsenticorp.py            |  1 -
 paddlehub/dataset/msra_ner.py                | 21 +++++++++++++++------
 paddlehub/dataset/nlpcc_dbqa.py              |  6 ++++++
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/demo/ernie-seq-labeling/sequence_labeling.py b/demo/ernie-seq-labeling/sequence_labeling.py
index 381198a0..39c9e91c 100644
--- a/demo/ernie-seq-labeling/sequence_labeling.py
+++ b/demo/ernie-seq-labeling/sequence_labeling.py
@@ -46,7 +46,7 @@ if __name__ == '__main__':
         trainable=True, max_seq_len=args.max_seq_len)
 
     # Step2: Download dataset and use SequenceLabelReader to read dataset
-    dataset = hub.dataset.MSRA_NER(),
+    dataset = hub.dataset.MSRA_NER()
     reader = hub.reader.SequenceLabelReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
@@ -91,6 +91,7 @@ if __name__ == '__main__':
         use_cuda=True,
         num_epoch=args.num_epoch,
         batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
         strategy=strategy)
     # Finetune and evaluate model by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
diff --git a/paddlehub/dataset/chnsenticorp.py b/paddlehub/dataset/chnsenticorp.py
index 199ca8f1..c237a575 100644
--- a/paddlehub/dataset/chnsenticorp.py
+++ b/paddlehub/dataset/chnsenticorp.py
@@ -68,7 +68,6 @@ class ChnSentiCorp(HubDataset):
         return self.test_examples
 
     def get_labels(self):
-        """See base class."""
         return ["0", "1"]
 
     def _read_tsv(self, input_file, quotechar=None):
diff --git a/paddlehub/dataset/msra_ner.py b/paddlehub/dataset/msra_ner.py
index b04b530f..aeade65c 100644
--- a/paddlehub/dataset/msra_ner.py
+++ b/paddlehub/dataset/msra_ner.py
@@ -21,6 +21,7 @@ import csv
 import json
 from collections import namedtuple
 
+from paddlehub.dataset import InputExample, HubDataset
 from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
@@ -28,7 +29,14 @@ from paddlehub.common.logger import logger
 DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/msra_ner.tar.gz"
 
 
-class MSRA_NER(object):
+class MSRA_NER(HubDataset):
+    """
+    A set of manually annotated Chinese word-segmentation data and
+    specifications for training and testing a Chinese word-segmentation system
+    for research purposes. For more information please refer to
+    https://www.microsoft.com/en-us/download/details.aspx?id=52531
+    """
+
     def __init__(self):
         self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
         if not os.path.exists(self.dataset_dir):
@@ -78,12 +86,13 @@ class MSRA_NER(object):
         """Reads a tab separated value file."""
         with open(input_file, "r") as f:
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            headers = next(reader)
-            Example = namedtuple('Example', headers)
-
             examples = []
+            seq_id = 0
+            header = next(reader)  # skip header
             for line in reader:
-                example = Example(*line)
+                example = InputExample(
+                    guid=seq_id, label=line[1], text_a=line[0])
+                seq_id += 1
                 examples.append(example)
 
             return examples
@@ -92,4 +101,4 @@ class MSRA_NER(object):
 if __name__ == "__main__":
     ds = MSRA_NER()
     for e in ds.get_train_examples():
-        print(e)
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
diff --git a/paddlehub/dataset/nlpcc_dbqa.py b/paddlehub/dataset/nlpcc_dbqa.py
index 04e9e9e3..5ec57cba 100644
--- a/paddlehub/dataset/nlpcc_dbqa.py
+++ b/paddlehub/dataset/nlpcc_dbqa.py
@@ -29,6 +29,12 @@ DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/nlpcc-dbqa.tar.gz"
 
 
 class NLPCC_DBQA(HubDataset):
+    """
+    Please refer to
+    http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
+    for more information
+    """
+
     def __init__(self):
         self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
         if not os.path.exists(self.dataset_dir):
-- 
GitLab