提交 81ef11f8 编写于 作者: S Steffy-zxf 提交者: wuzewu

Add elmo demo (#39)

* Add the elmo demo

* Fix the encoding mismatch between py2 and py3 in lac-reader

* Add the lib 'chardet' for detecting the encoding.

* Modify the requirement.txt
上级 f70b8358
import argparse
import ast
import io
import numpy as np
from paddle.fluid.framework import switch_main_program
import paddle.fluid as fluid
import paddlehub as hub
# Command-line options for the fine-tuning script. They are parsed once at
# import time and exposed through the module-level `args` namespace.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
    "--num_epoch",
    type=int,
    default=3,
    help="Number of epoches for fine-tuning.")
# `ast.literal_eval` turns the literal text True/False into a real bool.
parser.add_argument(
    "--use_gpu",
    type=ast.literal_eval,
    default=True,
    help="Whether use GPU for finetuning, input should be True or False")
parser.add_argument(
    "--checkpoint_dir",
    type=str,
    default=None,
    help="Directory to model checkpoint")
parser.add_argument(
    "--batch_size",
    type=int,
    default=32,
    help="Total examples' number in batch for training.")
parser.add_argument(
    "--learning_rate",
    type=float,
    default=1e-4,
    help="Learning rate used to train with warmup.")
parser.add_argument(
    "--weight_decay",
    type=float,
    default=5,
    help="Weight decay rate for L2 regularizer.")
parser.add_argument(
    "--warmup_proportion",
    type=float,
    default=0.05,
    help="Warmup proportion params for warmup strategy")
args = parser.parse_args()
# yapf: enable.
def bow_net(program, input_feature, hid_dim=128, hid_dim2=96):
    """Build a bag-of-words head on top of `input_feature`.

    The sequence is sum-pooled, passed through tanh, and then through two
    tanh-activated fully connected layers.

    Args:
        program: Program made the current main program before any layer
            is added.
        input_feature: Sequence feature handed to `sequence_pool`.
        hid_dim: Size of the first fully connected layer.
        hid_dim2: Size of the second (output) fully connected layer.

    Returns:
        The output variable of the second fully connected layer.
    """
    switch_main_program(program)
    pooled = fluid.layers.sequence_pool(input=input_feature, pool_type='sum')
    hidden = fluid.layers.fc(
        input=fluid.layers.tanh(pooled), size=hid_dim, act="tanh")
    return fluid.layers.fc(input=hidden, size=hid_dim2, act="tanh")
def cnn_net(program, input_feature, win_size=3, hid_dim=128, hid_dim2=96):
    """Build a convolution + max-pool head on top of `input_feature`.

    Args:
        program: Program made the current main program before any layer
            is added.
        input_feature: Sequence feature handed to `sequence_conv_pool`.
        win_size: Filter size of the sequence convolution.
        hid_dim: Number of convolution filters.
        hid_dim2: Size of the output fully connected layer.

    Returns:
        The output variable of the fully connected layer (no activation
        is requested for it).
    """
    switch_main_program(program)
    conv_pool_options = dict(
        input=input_feature,
        num_filters=hid_dim,
        filter_size=win_size,
        act="relu",
        pool_type="max")
    pooled = fluid.nets.sequence_conv_pool(**conv_pool_options)
    return fluid.layers.fc(input=pooled, size=hid_dim2)
def gru_net(program, input_feature, hid_dim=128, hid_dim2=96):
    """Build a forward-GRU head on top of `input_feature`.

    The input is projected to `hid_dim * 3`, run through a forward
    `dynamic_gru`, max-pooled over the sequence, passed through tanh and
    finally through a tanh-activated fully connected layer.

    Args:
        program: Program made the current main program before any layer
            is added.
        input_feature: Sequence feature fed to the input projection.
        hid_dim: `size` given to `dynamic_gru`.
        hid_dim2: Size of the output fully connected layer.

    Returns:
        The output variable of the final fully connected layer.
    """
    switch_main_program(program)
    projected = fluid.layers.fc(input=input_feature, size=hid_dim * 3)
    hidden_seq = fluid.layers.dynamic_gru(
        input=projected, size=hid_dim, is_reverse=False)
    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
    activated = fluid.layers.tanh(pooled)
    return fluid.layers.fc(input=activated, size=hid_dim2, act='tanh')
def bilstm_net(program, input_feature, hid_dim=128, hid_dim2=96):
    """Build a bidirectional-LSTM head on top of `input_feature`.

    Two independent projections of the input feed a forward and a reversed
    `dynamic_lstm`. The last step of each hidden sequence is taken, the two
    are concatenated along axis 1 and passed through a tanh-activated fully
    connected layer.

    Args:
        program: Program made the current main program before any layer
            is added.
        input_feature: Sequence feature fed to both input projections.
        hid_dim: Base width; the projections and the `size` given to
            `dynamic_lstm` are `hid_dim * 4`.
        hid_dim2: Size of the output fully connected layer.

    Returns:
        The output variable of the final fully connected layer.
    """
    switch_main_program(program)
    gate_width = hid_dim * 4

    # Separate projections so each direction learns its own input weights.
    fwd_proj = fluid.layers.fc(input=input_feature, size=gate_width)
    bwd_proj = fluid.layers.fc(input=input_feature, size=gate_width)

    # Only the hidden sequence is used; the cell state is discarded.
    fwd_hidden, _ = fluid.layers.dynamic_lstm(
        input=fwd_proj, size=gate_width, is_reverse=False)
    bwd_hidden, _ = fluid.layers.dynamic_lstm(
        input=bwd_proj, size=gate_width, is_reverse=True)

    # Extract the last step of each direction.
    # NOTE(review): for the reversed LSTM the last step may have seen only
    # the final token; confirm whether sequence_first_step was intended.
    fwd_last = fluid.layers.sequence_last_step(input=fwd_hidden)
    bwd_last = fluid.layers.sequence_last_step(input=bwd_hidden)

    # The raw last-step states are concatenated directly; no extra tanh op
    # is added to the program because nothing would consume its output.
    merged = fluid.layers.concat(input=[fwd_last, bwd_last], axis=1)

    return fluid.layers.fc(input=merged, size=hid_dim2, act='tanh')
def lstm_net(program, input_feature, hid_dim=128, hid_dim2=96):
    """Build a forward-LSTM head on top of `input_feature`.

    The input is projected to `hid_dim * 4`, run through a forward
    `dynamic_lstm`, max-pooled over the sequence, passed through tanh and
    finally through a tanh-activated fully connected layer.

    Args:
        program: Program made the current main program before any layer
            is added.
        input_feature: Sequence feature fed to the input projection.
        hid_dim: Base width; the projection and the `size` given to
            `dynamic_lstm` are `hid_dim * 4`.
        hid_dim2: Size of the output fully connected layer.

    Returns:
        The output variable of the final fully connected layer.
    """
    switch_main_program(program)
    gate_width = hid_dim * 4
    projected = fluid.layers.fc(input=input_feature, size=gate_width)
    # The second return value (cell state) is not needed here.
    hidden_seq, _ = fluid.layers.dynamic_lstm(
        input=projected, size=gate_width, is_reverse=False)
    pooled = fluid.layers.sequence_pool(input=hidden_seq, pool_type='max')
    return fluid.layers.fc(
        input=fluid.layers.tanh(pooled), size=hid_dim2, act='tanh')
if __name__ == '__main__':
    # Step 1: load the PaddleHub ELMo module and fetch its input/output
    # variables together with the program they belong to.
    module = hub.Module(name="elmo.hub_module")
    inputs, outputs, program = module.context(trainable=True)

    # Step 2: prepare the ChnSentiCorp dataset and a LAC-based reader that
    # uses the module's vocabulary file.
    dataset = hub.dataset.ChnSentiCorp()
    reader = hub.reader.LACClassifyReader(
        dataset=dataset, vocab_path=module.get_vocab_path())
    vocab_size = len(reader.vocab)

    word_ids = inputs["word_ids"]
    elmo_embedding = outputs["elmo_embed"]

    # Step 3: switch to the module's program and build the network on it.
    switch_main_program(program)

    # Trainable word embedding, concatenated with the ELMo embedding below.
    embed_dim = 128
    embed_attr = fluid.ParamAttr(
        learning_rate=30,
        initializer=fluid.initializer.Uniform(low=-0.1, high=0.1))
    word_embedding = fluid.layers.embedding(
        input=word_ids, size=[vocab_size, embed_dim], param_attr=embed_attr)

    # Join the ELMo embedding and the word embedding along axis 1.
    input_feature = fluid.layers.concat(
        input=[elmo_embedding, word_embedding], axis=1)

    # Pick one of bow_net, cnn_net, gru_net, bilstm_net, lstm_net here;
    # gru_net is the recommended choice.
    features = gru_net(program, input_feature)

    # Define a text classification fine-tune task through PaddleHub's API.
    elmo_task = hub.create_text_cls_task(
        feature=features, num_classes=dataset.num_labels)

    # Feed list for the data feeder: the module's word-id input plus the
    # task's label variable.
    feed_list = [word_ids.name, elmo_task.variable("label").name]

    # Step 4: select the fine-tune strategy and the run configuration.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        lr_scheduler="linear_decay",
        warmup_proportion=args.warmup_proportion)

    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Fine-tune and evaluate through PaddleHub's API.
    hub.finetune_and_eval(
        task=elmo_task, data_reader=reader, feed_list=feed_list, config=config)
# Expose only GPU device 0 to the process.
export CUDA_VISIBLE_DEVICES=0
# Run elmo_finetune.py with unbuffered output (-u) and explicit
# hyper-parameters passed on the command line.
python -u elmo_finetune.py \
--batch_size=32 \
--use_gpu=True \
--checkpoint_dir="./ckpt_chnsenticorp" \
--learning_rate=1e-4 \
--weight_decay=1 \
--num_epoch=3
...@@ -46,6 +46,7 @@ from .finetune.finetune import finetune_and_eval ...@@ -46,6 +46,7 @@ from .finetune.finetune import finetune_and_eval
from .finetune.config import RunConfig from .finetune.config import RunConfig
from .finetune.strategy import AdamWeightDecayStrategy from .finetune.strategy import AdamWeightDecayStrategy
from .finetune.strategy import DefaultStrategy from .finetune.strategy import DefaultStrategy
from .finetune.strategy import DefaultFinetuneStrategy
if six.PY2: if six.PY2:
import sys import sys
......
...@@ -229,7 +229,10 @@ def _finetune_cls_task(task, data_reader, feed_list, config=None, ...@@ -229,7 +229,10 @@ def _finetune_cls_task(task, data_reader, feed_list, config=None,
train_time_begin = time.time() train_time_begin = time.time()
loss_v, accuracy_v = exe.run( loss_v, accuracy_v = exe.run(
feed=data_feeder.feed(batch), feed=data_feeder.feed(batch),
fetch_list=[loss.name, accuracy.name]) fetch_list=[loss.name, accuracy.name],
return_numpy=False)
loss_v = np.array(loss_v)
accuracy_v = np.array(accuracy_v)
train_time_used += time.time() - train_time_begin train_time_used += time.time() - train_time_begin
global_step += 1 global_step += 1
num_trained_examples += num_batch_examples num_trained_examples += num_batch_examples
......
...@@ -18,6 +18,8 @@ from __future__ import print_function ...@@ -18,6 +18,8 @@ from __future__ import print_function
import csv import csv
import json import json
import platform
import six
from collections import namedtuple from collections import namedtuple
import paddle import paddle
...@@ -29,6 +31,12 @@ from .batching import pad_batch_data ...@@ -29,6 +31,12 @@ from .batching import pad_batch_data
import paddlehub as hub import paddlehub as hub
def get_encoding():
    """Return the text encoding name chosen for the current platform.

    Returns:
        "gbk" when the lower-cased `platform.platform()` string starts
        with "windows", otherwise "utf8".
    """
    on_windows = platform.platform().lower().startswith("windows")
    return "gbk" if on_windows else "utf8"
class BaseReader(object): class BaseReader(object):
def __init__(self, def __init__(self,
dataset, dataset,
...@@ -398,16 +406,17 @@ class LACClassifyReader(object): ...@@ -398,16 +406,17 @@ class LACClassifyReader(object):
shuffle=False, shuffle=False,
data=None): data=None):
if phase == "train": if phase == "train":
shuffle = True
data = self.dataset.get_train_examples() data = self.dataset.get_train_examples()
self.num_examples['train'] = len(data) self.num_examples['train'] = len(data)
elif phase == "test": elif phase == "test":
shuffle = False shuffle = False
data = self.dataset.get_test_examples() data = self.dataset.get_test_examples()
self.num_examples['train'] = len(data) self.num_examples['test'] = len(data)
elif phase == "val" or phase == "dev": elif phase == "val" or phase == "dev":
shuffle = False shuffle = False
data = self.dataset.get_dev_examples() data = self.dataset.get_dev_examples()
self.num_examples['test'] = len(data) self.num_examples['dev'] = len(data)
elif phase == "predict": elif phase == "predict":
data = data data = data
else: else:
...@@ -417,20 +426,35 @@ class LACClassifyReader(object): ...@@ -417,20 +426,35 @@ class LACClassifyReader(object):
def preprocess(text): def preprocess(text):
data_dict = {self.feed_key: [text]} data_dict = {self.feed_key: [text]}
processed = self.lac.lexical_analysis(data=data_dict) processed = self.lac.lexical_analysis(data=data_dict)
for data in processed:
for index, word in enumerate(data['word']):
if six.PY2 and type(word) == str:
data['word'][index] = word.decode(get_encoding())
processed = [ processed = [
self.vocab[word] for word in processed[0]['word'] self.vocab[word] for word in processed[0]['word']
if word in self.vocab if word in self.vocab
] ]
if len(processed) == 0:
logger.warning(
"The words in text %s can't be found in the vocabulary." %
(text))
return processed return processed
def _data_reader(): def _data_reader():
if shuffle:
np.random.shuffle(data)
if phase == "predict": if phase == "predict":
for text in data: for text in data:
text = preprocess(text) text = preprocess(text)
if not text:
continue
yield (text, ) yield (text, )
else: else:
for item in data: for item in data:
text = preprocess(item.text_a) text = preprocess(item.text_a)
if not text:
continue
yield (text, item.label) yield (text, item.label)
return paddle.batch(_data_reader, batch_size=batch_size) return paddle.batch(_data_reader, batch_size=batch_size)
......
...@@ -6,3 +6,4 @@ pyyaml ...@@ -6,3 +6,4 @@ pyyaml
numpy >= 1.12.0 numpy >= 1.12.0
Pillow Pillow
six >= 1.10.0 six >= 1.10.0
chardet == 3.0.4
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册