Add elmo demo (#39)

* Add the elmo demo * Fix the encoding mismatch between py2 and py3 in lac-reader * Add the lib 'chardet' for detecting the encoding. * Modify requirements.txt

Add elmo demo (#39)
* Add the elmo demo * Fix the encoding mismatch between py2 and py3 in lac-reader * Add the lib 'chardet' for detecting the encoding. * Modify requirements.txt
81ef11f8 · Steffy-zxf · wuzewu · f70b8358 · 81ef11f8 · 81ef11f8
6 changed files
--- a/demo/elmo/elmo_finetune.py
+++ b/demo/elmo/elmo_finetune.py
+import argparse
+import ast
+import io
+import numpy as np
+from paddle.fluid.framework import switch_main_program
+import paddle.fluid as fluid
+import paddlehub as hub
+# yapf: disable
+# Command-line options for the ELMo fine-tuning demo.
+# The module docstring is passed as `description`: the first positional
+# parameter of ArgumentParser is `prog`, so passing the docstring positionally
+# would replace the program name shown in the usage line.
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
+# ast.literal_eval turns the literal text "True"/"False" into a real bool.
+parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
+parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.")
+parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate used to train with warmup.")
+# NOTE(review): the default here is 5 while run_elmo_finetune.sh passes
+# --weight_decay=1 -- confirm which value is intended.
+parser.add_argument("--weight_decay", type=float, default=5, help="Weight decay rate for L2 regularizer.")
+parser.add_argument("--warmup_proportion", type=float, default=0.05, help="Warmup proportion params for warmup strategy")
+# Parsed at import time; the values are read by the __main__ section below.
+args = parser.parse_args()
+# yapf: enable.
+def bow_net(program, input_feature, hid_dim=128, hid_dim2=96):
+    """Build a bag-of-words head on top of `input_feature`.
+
+    The sequence is sum-pooled, passed through tanh, and then through two
+    tanh-activated fully connected layers.
+
+    Args:
+        program: program made current with switch_main_program before any
+            layer is created.
+        input_feature: sequence input handed to sequence_pool.
+        hid_dim: size of the first fc layer.
+        hid_dim2: size of the second fc layer.
+
+    Returns:
+        The output of the second fc layer.
+    """
+    switch_main_program(program)
+    pooled = fluid.layers.sequence_pool(input=input_feature, pool_type='sum')
+    activated = fluid.layers.tanh(pooled)
+    hidden = fluid.layers.fc(input=activated, size=hid_dim, act="tanh")
+    return fluid.layers.fc(input=hidden, size=hid_dim2, act="tanh")
+def cnn_net(program, input_feature, win_size=3, hid_dim=128, hid_dim2=96):
+    """Build a sequence-convolution head on top of `input_feature`.
+
+    Args:
+        program: program made current with switch_main_program before any
+            layer is created.
+        input_feature: sequence input handed to sequence_conv_pool.
+        win_size: filter size of the sequence convolution.
+        hid_dim: number of convolution filters.
+        hid_dim2: size of the final fc layer (created without an activation).
+
+    Returns:
+        The output of the final fc layer.
+    """
+    switch_main_program(program)
+    # relu-activated sequence convolution followed by max pooling
+    conv_pooled = fluid.nets.sequence_conv_pool(
+        input=input_feature,
+        num_filters=hid_dim,
+        filter_size=win_size,
+        act="relu",
+        pool_type="max")
+    return fluid.layers.fc(input=conv_pooled, size=hid_dim2)
+def gru_net(program, input_feature, hid_dim=128, hid_dim2=96):
+    """Build a forward-GRU head on top of `input_feature`.
+
+    Args:
+        program: program made current with switch_main_program before any
+            layer is created.
+        input_feature: sequence input handed to the first fc layer.
+        hid_dim: size passed to dynamic_gru; the preceding fc layer is
+            3 * hid_dim wide.
+        hid_dim2: size of the final tanh-activated fc layer.
+
+    Returns:
+        The output of the final fc layer.
+    """
+    switch_main_program(program)
+    projected = fluid.layers.fc(input=input_feature, size=hid_dim * 3)
+    hidden_seq = fluid.layers.dynamic_gru(
+        input=projected, size=hid_dim, is_reverse=False)
+    # max-pool over the sequence, then tanh, before the output layer
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    activated = fluid.layers.tanh(pooled)
+    return fluid.layers.fc(input=activated, size=hid_dim2, act='tanh')
+def bilstm_net(program, input_feature, hid_dim=128, hid_dim2=96):
+    """Build a bidirectional-LSTM head on top of `input_feature`.
+
+    Two separate fc projections feed a forward and a reversed dynamic_lstm.
+    The last step of each output sequence is tanh-activated, the two results
+    are concatenated along axis 1 and passed through a tanh-activated fc layer.
+
+    Args:
+        program: program made current with switch_main_program before any
+            layer is created.
+        input_feature: sequence input handed to both fc projections.
+        hid_dim: the fc projections and both LSTMs use hid_dim * 4 as size.
+        hid_dim2: size of the final fc layer.
+
+    Returns:
+        The output of the final fc layer.
+    """
+    switch_main_program(program)
+    fc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4)
+    rfc0 = fluid.layers.fc(input=input_feature, size=hid_dim * 4)
+    # the cell outputs are not needed, only the hidden sequences
+    lstm_h, _ = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+    rlstm_h, _ = fluid.layers.dynamic_lstm(
+        input=rfc0, size=hid_dim * 4, is_reverse=True)
+    # extract last step
+    # NOTE(review): the reversed LSTM is also read at its last sequence step;
+    # confirm this is the intended position for the backward direction.
+    lstm_last = fluid.layers.sequence_last_step(input=lstm_h)
+    rlstm_last = fluid.layers.sequence_last_step(input=rlstm_h)
+    lstm_last_tanh = fluid.layers.tanh(lstm_last)
+    rlstm_last_tanh = fluid.layers.tanh(rlstm_last)
+    # concat layer: use the tanh-activated states. They were previously
+    # computed and then discarded because the raw states were concatenated.
+    lstm_concat = fluid.layers.concat(
+        input=[lstm_last_tanh, rlstm_last_tanh], axis=1)
+    # full connect layer
+    fc = fluid.layers.fc(input=lstm_concat, size=hid_dim2, act='tanh')
+    return fc
+def lstm_net(program, input_feature, hid_dim=128, hid_dim2=96):
+    """Build a forward-LSTM head on top of `input_feature`.
+
+    Args:
+        program: program made current with switch_main_program before any
+            layer is created.
+        input_feature: sequence input handed to the first fc layer.
+        hid_dim: the fc projection and the LSTM both use hid_dim * 4 as size.
+        hid_dim2: size of the final tanh-activated fc layer.
+
+    Returns:
+        The output of the final fc layer.
+    """
+    switch_main_program(program)
+    projected = fluid.layers.fc(input=input_feature, size=hid_dim * 4)
+    # only the hidden sequence is used; the cell output is ignored
+    hidden_seq, _ = fluid.layers.dynamic_lstm(
+        input=projected, size=hid_dim * 4, is_reverse=False)
+    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
+    activated = fluid.layers.tanh(pooled)
+    return fluid.layers.fc(input=activated, size=hid_dim2, act='tanh')
+if __name__ == '__main__':
+    # Step 1: Load the pretrained ELMo module from PaddleHub.
+    module = hub.Module(name="elmo.hub_module")
+    inputs, outputs, program = module.context(trainable=True)
+    # Step 2: Load the ChnSentiCorp dataset and read it with LACClassifyReader,
+    # using the vocabulary file that comes with the module.
+    dataset = hub.dataset.ChnSentiCorp()
+    reader = hub.reader.LACClassifyReader(
+        dataset=dataset, vocab_path=module.get_vocab_path())
+    word_dict_len = len(reader.vocab)
+    word_ids = inputs["word_ids"]
+    elmo_embedding = outputs["elmo_embed"]
+    # Step 3: Switch to the module's program and build the network on top of it.
+    switch_main_program(program)
+    # Word embedding layer over the same word ids that feed the ELMo module.
+    word_embed_dims = 128
+    word_embedding = fluid.layers.embedding(
+        input=word_ids,
+        size=[word_dict_len, word_embed_dims],
+        param_attr=fluid.ParamAttr(
+            learning_rate=30,
+            initializer=fluid.initializer.Uniform(low=-0.1, high=0.1)))
+    # Concatenate the ELMo embedding with the word embedding along axis 1.
+    input_feature = fluid.layers.concat(
+        input=[elmo_embedding, word_embedding], axis=1)
+    # Choose the network head: bow_net, cnn_net, gru_net, bilstm_net or lstm_net.
+    # gru_net is the recommended choice.
+    fc = gru_net(program, input_feature)
+    # Define a classification fine-tune task with PaddleHub's API.
+    elmo_task = hub.create_text_cls_task(
+        feature=fc, num_classes=dataset.num_labels)
+    # Set up the feed list for the data feeder.
+    # Every input tensor the ELMo module needs must be fed, plus the label.
+    feed_list = [inputs["word_ids"].name, elmo_task.variable("label").name]
+    # Step 4: Select the fine-tune strategy, set up the run config and fine-tune.
+    strategy = hub.AdamWeightDecayStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        lr_scheduler="linear_decay",
+        warmup_proportion=args.warmup_proportion)
+    # Set up the running config for PaddleHub's fine-tune API.
+    config = hub.RunConfig(
+        use_cuda=args.use_gpu,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
+        strategy=strategy)
+    # Fine-tune and evaluate with PaddleHub's API; finetune_and_eval runs
+    # training, evaluation, testing and model saving automatically.
+    hub.finetune_and_eval(
+        task=elmo_task, data_reader=reader, feed_list=feed_list, config=config)
--- a/demo/elmo/run_elmo_finetune.sh
+++ b/demo/elmo/run_elmo_finetune.sh
+# Launch the ELMo fine-tuning demo with only GPU 0 visible to the process.
+export CUDA_VISIBLE_DEVICES=0
+# -u keeps Python's stdout/stderr unbuffered. Options not passed here
+# (e.g. --warmup_proportion) fall back to the defaults in elmo_finetune.py.
+python -u elmo_finetune.py \
+                   --batch_size=32 \
+                   --use_gpu=True \
+                   --checkpoint_dir="./ckpt_chnsenticorp" \
+                   --learning_rate=1e-4 \
+                   --weight_decay=1 \
+                   --num_epoch=3
--- a/paddlehub/__init__.py
+++ b/paddlehub/__init__.py
@@ -46,6 +46,7 @@ from .finetune.finetune import finetune_and_eval
 from .finetune.config import RunConfig
 from .finetune.strategy import AdamWeightDecayStrategy
 from .finetune.strategy import DefaultStrategy
+from .finetune.strategy import DefaultFinetuneStrategy
 if six.PY2:
    import sys

--- a/paddlehub/finetune/finetune.py
+++ b/paddlehub/finetune/finetune.py
@@ -229,7 +229,10 @@ def _finetune_cls_task(task, data_reader, feed_list, config=None,
                train_time_begin = time.time()
                loss_v, accuracy_v = exe.run(
                    feed=data_feeder.feed(batch),
-                    fetch_list=[loss.name, accuracy.name])
+                    fetch_list=[loss.name, accuracy.name],
+                    return_numpy=False)
+                loss_v = np.array(loss_v)
+                accuracy_v = np.array(accuracy_v)
                train_time_used += time.time() - train_time_begin
                global_step += 1
                num_trained_examples += num_batch_examples

--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -18,6 +18,8 @@ from __future__ import print_function
 import csv
 import json
+import platform
+import six
 from collections import namedtuple
 import paddle
@@ -29,6 +31,12 @@ from .batching import pad_batch_data
 import paddlehub as hub
+def get_encoding():
+    """Return the codec name used to decode byte strings on this platform.
+
+    Returns:
+        "gbk" when the lower-cased platform.platform() string starts with
+        "windows", otherwise "utf8".
+    """
+    on_windows = platform.platform().lower().startswith("windows")
+    return "gbk" if on_windows else "utf8"
 class BaseReader(object):
    def __init__(self,
                 dataset,
@@ -398,16 +406,17 @@ class LACClassifyReader(object):
                       shuffle=False,
                       data=None):
        if phase == "train":
+            shuffle = True
            data = self.dataset.get_train_examples()
            self.num_examples['train'] = len(data)
        elif phase == "test":
            shuffle = False
            data = self.dataset.get_test_examples()
-            self.num_examples['train'] = len(data)
+            self.num_examples['test'] = len(data)
        elif phase == "val" or phase == "dev":
            shuffle = False
            data = self.dataset.get_dev_examples()
-            self.num_examples['test'] = len(data)
+            self.num_examples['dev'] = len(data)
        elif phase == "predict":
            data = data
        else:
@@ -417,20 +426,35 @@ class LACClassifyReader(object):
        def preprocess(text):
+            # Run lexical analysis on `text` and map the resulting words to
+            # vocabulary ids. Words missing from self.vocab are dropped, so
+            # the returned list may be empty.
            data_dict = {self.feed_key: [text]}
            processed = self.lac.lexical_analysis(data=data_dict)
+            # Under Python 2, words that come back as byte strings are decoded
+            # in place so they compare equal to the vocabulary keys.
+            # NOTE(review): assumes the vocabulary keys are unicode -- confirm.
+            # The loop variable `data` is local to preprocess.
+            for data in processed:
+                for index, word in enumerate(data['word']):
+                    if six.PY2 and type(word) == str:
+                        data['word'][index] = word.decode(get_encoding())
+            # Only the first analysis result is used: one text was submitted.
            processed = [
                self.vocab[word] for word in processed[0]['word']
                if word in self.vocab
            ]
+            # Warn when nothing could be mapped; callers skip empty results.
+            if len(processed) == 0:
+                logger.warning(
+                    "The words in text %s can't be found in the vocabulary." %
+                    (text))
            return processed
        def _data_reader():
+            # Generator over preprocessed examples. `shuffle`, `data` and
+            # `phase` are read from the enclosing scope.
+            # np.random.shuffle reorders `data` in place.
+            # NOTE(review): if `data` is the dataset's own list, the dataset
+            # order changes as well -- confirm that is acceptable.
+            if shuffle:
+                np.random.shuffle(data)
            if phase == "predict":
+                # Predict mode: items are raw texts; yield 1-tuples of ids.
                for text in data:
                    text = preprocess(text)
+                    # Skip texts with no word in the vocabulary.
+                    if not text:
+                        continue
                    yield (text, )
            else:
+                # Other phases: items expose text_a and label.
                for item in data:
                    text = preprocess(item.text_a)
+                    # Skip examples with no word in the vocabulary.
+                    if not text:
+                        continue
                    yield (text, item.label)
        return paddle.batch(_data_reader, batch_size=batch_size)

--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ pyyaml
 numpy >= 1.12.0
 Pillow
 six >= 1.10.0
+chardet == 3.0.4