Commit 510f5407 authored by zhangxuefei

Update text-cls demo and multi-label cls demo to adapt to ERNIE v2

Parent 7f1a2c0b
......
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, BERT is used.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-    # Step2: Download dataset and use MultiLabelReader to read dataset
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]

     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
......
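Review note: both demos now carry this same module/feed-list branch. A minimal sketch of how the repeated logic could be factored out, assuming the PaddleHub 1.x API used in this diff (the helper name `select_module_and_feeds` is illustrative, not part of the change):

```python
import paddlehub as hub

def select_module_and_feeds(use_taskid, max_seq_len=128):
    """Pick ERNIE v2 or BERT and build the matching feed order."""
    name = ("ernie_eng_base.hub_module"
            if use_taskid else "bert_uncased_L-12_H-768_A-12")
    module = hub.Module(name=name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=max_seq_len)
    tensor_names = ["input_ids", "position_ids", "segment_ids", "input_mask"]
    if use_taskid:
        # ERNIE v2 carries an extra task-id embedding input.
        tensor_names.append("task_ids")
    feed_list = [inputs[t].name for t in tensor_names]
    return module, inputs, outputs, program, feed_list
```

The feed order must match what the task feeds at runtime, which is why the list is built right next to the module selection.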
......
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if False, BERT is used.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
-    # Sentence classification dataset reader
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]

     # Setup running config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
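Review note: with `pooled_output` and `feed_list` in place, prediction reuses the Finetune API's run config. A hedged sketch of the `RunConfig` that the truncated lines above are building, assuming the script's parsed `args` and PaddleHub 1.x defaults (values are illustrative):

```python
import paddlehub as hub

# Single-card inference: data parallelism stays off, and checkpoint_dir
# points at the weights saved by multi_label_classifier.py.
config = hub.RunConfig(
    use_data_parallel=False,
    use_cuda=args.use_gpu,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=hub.AdamWeightDecayStrategy())
```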
......
@@ -104,7 +116,7 @@ if __name__ == '__main__':
     for result in results:
         # get predict index
         label_ids = []
-        for i in range(num_label):
+        for i in range(dataset.num_labels):
             label_val = np.argmax(result[i])
             label_ids.append(label_val)
         print("%s\tpredict=%s" % (data[index][0], label_ids))
......
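Review note: switching `num_label` to `dataset.num_labels` keeps the decode loop in sync with the dataset (the local variable it replaced was removed above). Each Toxic label gets its own binary distribution, so prediction takes an argmax per label. A standalone illustration with toy values (three labels instead of Toxic's six):

```python
import numpy as np

# Toy model output for one example: one (negative, positive)
# distribution per label.
result = [np.array([0.9, 0.1]), np.array([0.2, 0.8]), np.array([0.6, 0.4])]

label_ids = [int(np.argmax(result[i])) for i in range(len(result))]
print(label_ids)  # [0, 1, 0] -- only label 1 predicted positive
```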
......
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
-    --num_epoch=3
+    --num_epoch=3 \
+    --use_taskid=False
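Review note: the script keeps BERT as the default; passing `--use_taskid=True` switches the same run to the ERNIE v2 module with no other changes to the command.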
......
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
......
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(
                     max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
......
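Review note: `task_ids` joins the four BERT-style inputs, and all five are resized to `[-1, max_seq_len, 1]`. A minimal sketch of what five matching placeholders look like on the feeding side, assuming Paddle fluid 1.x layers (the dtypes are my assumption, mirroring common BERT demos):

```python
import paddle.fluid as fluid

max_seq_len = 128
seq_tensor_shape = [-1, max_seq_len, 1]

inputs = {}
for tensor_name in [
        "input_ids", "position_ids", "segment_ids", "input_mask", "task_ids"
]:
    # input_mask is a float attention mask; the id tensors are integers.
    dtype = "float32" if tensor_name == "input_mask" else "int64"
    inputs[tensor_name] = fluid.layers.data(
        name=tensor_name,
        shape=seq_tensor_shape,
        dtype=dtype,
        append_batch_size=False)
```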
......
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))

         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])

         if phase != "predict":
             Record = namedtuple(
......
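Review note: at predict time there are no gold labels, so the reader now substitutes zero placeholders; the hard-coded six zeros match the Toxic dataset's six labels and would break for other datasets. A hedged sketch of the same logic written against a generic label count (the helper name is illustrative):

```python
def build_label_ids(example_label, label_map, num_labels, phase):
    """Map string labels to ids; emit zero placeholders when predicting."""
    if phase == "predict":
        # No gold labels during prediction; placeholders keep the record
        # layout identical to training so the feeder shapes still match.
        return [0] * num_labels
    return [label_map[label] for label in example_label]
```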
......
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
......
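Review note: this one-word change is a genuine bug fix. The third positional parameter of `io.open` is `buffering`, not `encoding`, so the old call handed it the string "UTF-8" and raised a TypeError before a single line was read:

```python
import io

# Old call: "UTF-8" lands in the buffering slot.
# io.open("vocab.txt", "r", "UTF-8")   # TypeError: an integer is required

# Fixed call: encoding passed by keyword, file decodes as UTF-8.
with io.open("vocab.txt", "r", encoding="UTF-8") as fin:
    first_line = fin.readline()
```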