From 510f540704bf56287882aa4f88572c499dd0698a Mon Sep 17 00:00:00 2001
From: zhangxuefei
Date: Fri, 2 Aug 2019 15:25:57 +0800
Subject: [PATCH] Update text-cls demo and multi-label cls demo to adapt to
 ernie v2

---
 .../multi_label_classifier.py                 | 44 ++++++++++------
 demo/multi-label-classification/predict.py    | 50 ++++++++++++-------
 .../run_classifier.sh                         |  3 +-
 .../multi-label-classification/run_predict.sh |  2 +-
 paddlehub/module/module.py                    |  3 +-
 paddlehub/reader/nlp_reader.py                |  7 ++-
 paddlehub/reader/tokenization.py              |  2 +-
 7 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/demo/multi-label-classification/multi_label_classifier.py b/demo/multi-label-classification/multi_label_classifier.py
index e11cc0fc..b1e7086f 100644
--- a/demo/multi-label-classification/multi_label_classifier.py
+++ b/demo/multi-label-classification/multi_label_classifier.py
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, use BERT.")
 args = parser.parse_args()
 # yapf: enable.
 
 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+    # Load the PaddleHub pretrained model (ERNIE v2 if use_taskid else BERT)
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
 
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
 
-    # Step2: Download dataset and use MultiLabelReader to read dataset
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
 
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)
 
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
 pooled_output = outputs["pooled_output"]
 
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
diff --git a/demo/multi-label-classification/predict.py b/demo/multi-label-classification/predict.py
index 138c5ade..6a6ef240 100644
--- a/demo/multi-label-classification/predict.py
+++ b/demo/multi-label-classification/predict.py
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, use BERT.")
 args = parser.parse_args()
 # yapf: enable.
 
 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
+    # Load the PaddleHub pretrained model (ERNIE v2 if use_taskid else BERT)
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
 
-    # Sentence classification dataset reader
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
 
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)
 
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
 pooled_output = outputs["pooled_output"]
 
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
@@ -104,7 +116,7 @@ if __name__ == '__main__':
         for result in results:
             # get predict index
             label_ids = []
-            for i in range(num_label):
+            for i in range(dataset.num_labels):
                 label_val = np.argmax(result[i])
                 label_ids.append(label_val)
             print("%s\tpredict=%s" % (data[index][0], label_ids))
diff --git a/demo/multi-label-classification/run_classifier.sh b/demo/multi-label-classification/run_classifier.sh
index 93b88833..f08026a4 100644
--- a/demo/multi-label-classification/run_classifier.sh
+++ b/demo/multi-label-classification/run_classifier.sh
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
-    --num_epoch=3
+    --num_epoch=3 \
+    --use_taskid=False
diff --git a/demo/multi-label-classification/run_predict.sh b/demo/multi-label-classification/run_predict.sh
index ea28d8d9..f0976fe1 100644
--- a/demo/multi-label-classification/run_predict.sh
+++ b/demo/multi-label-classification/run_predict.sh
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
diff --git a/paddlehub/module/module.py b/paddlehub/module/module.py
index e218222c..fce447b1 100644
--- a/paddlehub/module/module.py
+++ b/paddlehub/module/module.py
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(
                     max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index e102756e..ccc1ea34 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))
 
         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])
 
         if phase != "predict":
             Record = namedtuple(
diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py
index 276f0798..80c1856b 100644
--- a/paddlehub/reader/tokenization.py
+++ b/paddlehub/reader/tokenization.py
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
    for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
-- 
GitLab
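
Usage note (illustrative only, not part of the patch): the updated shell scripts keep --use_taskid=False, so they still run against BERT. A minimal sketch of exercising the new ERNIE v2 path with the predict demo, assuming the ernie_eng_base.hub_module referenced above is installed and the checkpoint in CKPT_DIR was finetuned with the same flag (the ERNIE v2 program feeds the extra task_ids tensor, so a checkpoint trained on the BERT program presumably will not load):

    export FLAGS_eager_delete_tensor_gb=0.0
    export CUDA_VISIBLE_DEVICES=0

    CKPT_DIR="./ckpt_toxic"
    # Flip the new flag to True to predict with ERNIE v2 instead of BERT.
    python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid True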