Commit 510f5407 authored by zhangxuefei

Update text-cls demo and multi-label cls demo to adapt to ERNIE v2

Parent 7f1a2c0b
......
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, BERT is used.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-    # Step2: Download dataset and use MultiLabelReader to read dataset
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]

     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
......
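Review note: both demos now carry this same module/feed-list branch. A minimal sketch of how the repeated logic could be factored out, assuming the PaddleHub 1.x API used in this diff (the helper name `select_module_and_feeds` is illustrative, not part of the change):

```python
import paddlehub as hub

def select_module_and_feeds(use_taskid, max_seq_len=128):
    """Pick ERNIE v2 or BERT and build the matching feed order."""
    name = ("ernie_eng_base.hub_module"
            if use_taskid else "bert_uncased_L-12_H-768_A-12")
    module = hub.Module(name=name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=max_seq_len)
    tensor_names = ["input_ids", "position_ids", "segment_ids", "input_mask"]
    if use_taskid:
        # ERNIE v2 carries an extra task-id embedding input.
        tensor_names.append("task_ids")
    feed_list = [inputs[t].name for t in tensor_names]
    return module, inputs, outputs, program, feed_list
```

The feed order must match what the task feeds at runtime, which is why the list is built right next to the module selection.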
......
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, BERT is used.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
-    # Sentence classification dataset reader
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]

     # Setup running config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
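Review note: with `pooled_output` and `feed_list` in place, prediction reuses the Finetune API's run config. A hedged sketch of the `RunConfig` that the truncated lines above are building, assuming the script's parsed `args` and PaddleHub 1.x defaults (values are illustrative):

```python
import paddlehub as hub

# Single-card inference: data parallelism stays off, and checkpoint_dir
# points at the weights saved by multi_label_classifier.py.
config = hub.RunConfig(
    use_data_parallel=False,
    use_cuda=args.use_gpu,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=hub.AdamWeightDecayStrategy())
```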
......
@@ -104,7 +116,7 @@ if __name__ == '__main__':
     for result in results:
         # get predict index
         label_ids = []
-        for i in range(num_label):
+        for i in range(dataset.num_labels):
             label_val = np.argmax(result[i])
             label_ids.append(label_val)
         print("%s\tpredict=%s" % (data[index][0], label_ids))
......
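Review note: switching `num_label` to `dataset.num_labels` keeps the decode loop in sync with the dataset (the local variable it replaced was removed above). Each Toxic label gets its own binary distribution, so prediction takes an argmax per label. A standalone illustration with toy values (three labels instead of Toxic's six):

```python
import numpy as np

# Toy model output for one example: one (negative, positive)
# distribution per label.
result = [np.array([0.9, 0.1]), np.array([0.2, 0.8]), np.array([0.6, 0.4])]

label_ids = [int(np.argmax(result[i])) for i in range(len(result))]
print(label_ids)  # [0, 1, 0] -- only label 1 predicted positive
```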
......
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
-    --num_epoch=3
+    --num_epoch=3 \
+    --use_taskid=False
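Review note: the script keeps BERT as the default; passing `--use_taskid=True` switches the same run to the ERNIE v2 module with no other changes to the command.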
......
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
......
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(
                     max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
......
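Review note: `task_ids` joins the four BERT-style inputs, and all five are resized to `[-1, max_seq_len, 1]`. A minimal sketch of what five matching placeholders look like on the feeding side, assuming Paddle fluid 1.x layers (the dtypes are my assumption, mirroring common BERT demos):

```python
import paddle.fluid as fluid

max_seq_len = 128
seq_tensor_shape = [-1, max_seq_len, 1]

inputs = {}
for tensor_name in [
        "input_ids", "position_ids", "segment_ids", "input_mask", "task_ids"
]:
    # input_mask is a float attention mask; the id tensors are integers.
    dtype = "float32" if tensor_name == "input_mask" else "int64"
    inputs[tensor_name] = fluid.layers.data(
        name=tensor_name,
        shape=seq_tensor_shape,
        dtype=dtype,
        append_batch_size=False)
```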
......
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))

         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])

         if phase != "predict":
             Record = namedtuple(
......
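Review note: at predict time there are no gold labels, so the reader now substitutes zero placeholders; the hard-coded six zeros match the Toxic dataset's six labels and would break for other datasets. A hedged sketch of the same logic written against a generic label count (the helper name is illustrative):

```python
def build_label_ids(example_label, label_map, num_labels, phase):
    """Map string labels to ids; emit zero placeholders when predicting."""
    if phase == "predict":
        # No gold labels during prediction; placeholders keep the record
        # layout identical to training so the feeder shapes still match.
        return [0] * num_labels
    return [label_map[label] for label in example_label]
```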
......
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
......
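Review note: this one-word change is a genuine bug fix. The third positional parameter of `io.open` is `buffering`, not `encoding`, so the old call handed it the string "UTF-8" and raised a TypeError before a single line was read:

```python
import io

# Old call: "UTF-8" lands in the buffering slot.
# io.open("vocab.txt", "r", "UTF-8")   # TypeError: an integer is required

# Fixed call: encoding passed by keyword, file decodes as UTF-8.
with io.open("vocab.txt", "r", encoding="UTF-8") as fin:
    first_line = fin.readline()
```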