update senta demo

82e1494a · Zeyu Chen · 5bd9a50a · 82e1494a · 82e1494a · 82e1494a
5 changed files
--- a/demo/senta/run_classifier.sh
+++ b/demo/senta/run_classifier.sh
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=2
 # User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task
 DATASET="chnsenticorp"
@@ -8,4 +8,4 @@ python -u text_classifier.py \
                   --batch_size=24 \
                   --use_gpu=True \
                   --checkpoint_dir=${CKPT_DIR} \
-                   --num_epoch=3
+                   --num_epoch=10
--- a/demo/senta/text_classifier.py
+++ b/demo/senta/text_classifier.py
@@ -21,29 +21,28 @@ if __name__ == '__main__':
    # Step2: Download dataset and use TextClassificationReader to read dataset
    dataset = hub.dataset.ChnSentiCorp()
-    reader = hub.reader.TextClassificationReader(
+    reader = hub.reader.LACTokenizeReader(
        dataset=dataset, vocab_path=module.get_vocab_path())
-    # Step3: construct transfer learning network
+    sent_feature = outputs["sequence_output"]
-    # Use "sequence_output" for classification tasks on an entire sentence.
-    # Use "sequence_output" for token-level output.
-    sequence_output = outputs["sequence_output"]
    # Define a classfication finetune task by PaddleHub's API
    cls_task = hub.create_text_cls_task(
-        feature=sequence_output, num_classes=dataset.num_labels)
+        feature=sent_feature, num_classes=dataset.num_labels)
    # Setup feed list for data feeder
    # Must feed all the tensor of senta's module need
    feed_list = [inputs["words"].name, cls_task.variable('label').name]
-    # Setup runing config for PaddleHub Finetune API
+    strategy = hub.finetune.strategy.AdamWeightDecayStrategy(
+        learning_rate=1e-3, weight_decay=0.01, warmup_proportion=0.01)
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
-        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
+        strategy=strategy)
    # Finetune and evaluate by PaddleHub's API
    # will finish training, evaluation, testing, save model automatically

--- a/demo/text-classification/simple_demo.py
+++ b/demo/text-classification/simple_demo.py
@@ -2,9 +2,9 @@ import paddle.fluid as fluid
 import paddlehub as hub
 module = hub.Module(name="ernie")
-inputs, outputs, program = module.context(trainable=True)
+inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
-reader = hub.reader.ClassifyReader(hub.dataset.ChnSentiCorp(),
+reader = hub.reader.ClassifyReader(
-                                   module.get_vocab_path())
+    hub.dataset.ChnSentiCorp(), module.get_vocab_path(), max_seq_len=128)
 task = hub.create_text_cls_task(feature=outputs["pooled_output"], num_classes=2)
 strategy = hub.AdamWeightDecayStrategy(learning_rate=5e-5)
 config = hub.RunConfig(

--- a/paddlehub/reader/__init__.py
+++ b/paddlehub/reader/__init__.py
@@ -14,5 +14,5 @@
 from .nlp_reader import ClassifyReader
 from .nlp_reader import SequenceLabelReader
-from .nlp_reader import TextClassificationReader
+from .nlp_reader import LACTokenizeReader
 from .cv_reader import ImageClassificationReader
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -58,7 +58,6 @@ class BaseReader(object):
        self.current_example = 0
        self.current_epoch = 0
-        self.num_examples = 0
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
@@ -383,18 +382,47 @@ class ExtractEmbeddingReader(BaseReader):
        return return_list
-class TextClassificationReader(object):
+class LACTokenizeReader(object):
-    def __init__(self, dataset, vocab_path, do_lower_case=False):
+    def __init__(self, dataset, vocab_path):
        self.dataset = dataset
+        self.lac = hub.Module(name="lac")
        self.tokenizer = tokenization.FullTokenizer(
-            vocab_file=vocab_path, do_lower_case=do_lower_case)
+            vocab_file=vocab_path, do_lower_case=False)
        self.vocab = self.tokenizer.vocab
-        self.lac = hub.Module(name="lac")
        self.feed_key = list(
            self.lac.processor.data_format(
                sign_name="lexical_analysis").keys())[0]
+        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
+    def get_num_examples(self, phase):
+        """Get the number of examples recorded for a data split.
+
+        Args:
+            phase: One of 'train', 'dev', 'val' or 'test'. 'val' is treated
+                as an alias of 'dev', because ``self.num_examples`` only has
+                the keys 'train', 'dev' and 'test'.
+
+        Returns:
+            The value stored in ``self.num_examples`` for the split.
+
+        Raises:
+            ValueError: If ``phase`` is not one of the accepted names.
+        """
+        if phase not in ['train', 'val', 'dev', 'test']:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
+            )
+        # 'val' passes the check above but is not a key of num_examples;
+        # map it to 'dev' so the lookup cannot raise KeyError.
+        if phase == 'val':
+            phase = 'dev'
+        return self.num_examples[phase]
+    def get_train_examples(self):
+        """Return the training examples supplied by the wrapped dataset."""
+        train_examples = self.dataset.get_train_examples()
+        return train_examples
+    def get_dev_examples(self):
+        """Return the dev examples supplied by the wrapped dataset."""
+        dev_examples = self.dataset.get_dev_examples()
+        return dev_examples
+    def get_val_examples(self):
+        """Return the val examples supplied by the wrapped dataset."""
+        # NOTE(review): data_generator reads the 'val' phase through
+        # dataset.get_dev_examples(); confirm the dataset also provides
+        # get_val_examples().
+        val_examples = self.dataset.get_val_examples()
+        return val_examples
+    def get_test_examples(self):
+        """Return the test examples supplied by the wrapped dataset."""
+        test_examples = self.dataset.get_test_examples()
+        return test_examples
+    def get_train_progress(self):
+        """Gets progress for training phase.
+
+        Returns:
+            A ``(current_example, current_epoch)`` tuple read from the
+            instance attributes of the same names.
+        """
+        # NOTE(review): the __init__ visible in this change does not assign
+        # current_example / current_epoch on this class - confirm they are set
+        # elsewhere (e.g. in data_generator) before this method is called.
+        return self.current_example, self.current_epoch
    def data_generator(self,
                       batch_size=1,
                       phase="train",
@@ -402,14 +430,20 @@ class TextClassificationReader(object):
                       data=None):
        if phase == "train":
            data = self.dataset.get_train_examples()
+            self.num_examples['train'] = len(data)
        elif phase == "test":
            shuffle = False
            data = self.dataset.get_test_examples()
+            self.num_examples['train'] = len(data)
        elif phase == "val" or phase == "dev":
            shuffle = False
            data = self.dataset.get_dev_examples()
+            self.num_examples['test'] = len(data)
        elif phase == "predict":
            data = data
+        else:
+            raise ValueError(
+                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        def preprocess(text):
            data_dict = {self.feed_key: [text]}