Commit 510f5407 authored by zhangxuefei

Update text-cls demo and multi-label cls demo to adapt to ERNIE v2

Parent 7f1a2c0b
...
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup ...
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if not, use BERT.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
-
-    # Step2: Download dataset and use MultiLabelReader to read dataset
+    # Load PaddleHub ERNIE v2 or BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     pooled_output = outputs["pooled_output"]

-    # Setup feed list for data feeder
-    # Must feed all the tensors the module needs
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
...
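Note: the only difference between the two feed lists in the hunk above is the extra task_ids tensor that the ERNIE v2 module exposes. A minimal sketch of an equivalent, non-duplicated way to build the same list (assuming the args and inputs objects from the demo code above):

    # Sketch only: build the feed list once and append task_ids for ERNIE v2.
    feed_names = ["input_ids", "position_ids", "segment_ids", "input_mask"]
    if args.use_taskid:
        feed_names.append("task_ids")  # ERNIE v2 takes a task-id input; BERT does not
    feed_list = [inputs[name].name for name in feed_names]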
...
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory ...
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest sequence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to use ERNIE v2; if not, use BERT.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
-
-    # Sentence classification dataset reader
+    # Load PaddleHub ERNIE v2 or BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name, inputs["position_ids"].name,
+            inputs["segment_ids"].name, inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]
+
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]

-    # Setup feed list for data feeder
-    # Must feed all the tensors the module needs
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Setup running config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
...
@@ -104,7 +116,7 @@ if __name__ == '__main__':
     for result in results:
         # get predict index
         label_ids = []
-        for i in range(num_label):
+        for i in range(dataset.num_labels):
             label_val = np.argmax(result[i])
             label_ids.append(label_val)
         print("%s\tpredict=%s" % (data[index][0], label_ids))
...
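Note: replacing the removed num_label variable with dataset.num_labels keeps the decoding loop in sync with the dataset. As a sketch, assuming results keeps the per-label layout used by the demo (one pair of scores per label), the loop above collapses to:

    # Sketch only: each label has its own two-way head, so the prediction
    # for label i is the argmax over that label's pair of scores.
    label_ids = [int(np.argmax(result[i])) for i in range(dataset.num_labels)]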
...
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
     --learning_rate=5e-5 \
     --weight_decay=0.01 \
     --max_seq_len=128 \
-    --num_epoch=3
+    --num_epoch=3 \
+    --use_taskid=False
...
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
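Note: both scripts pass the new flag as a literal Python value because the argument is declared with type=ast.literal_eval in the argparse blocks above. A minimal sketch of how that parsing behaves:

    import ast

    # ast.literal_eval turns the command-line strings "True"/"False" into real
    # booleans, so --use_taskid False and --use_taskid=False both arrive as bools.
    assert ast.literal_eval("False") is False
    assert ast.literal_eval("True") is True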
...
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(
                     max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
...
...
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))

         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])

         if phase != "predict":
             Record = namedtuple(
...
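Note: the six zeros are placeholder labels for the Toxic dataset's six classes, which are unknown at predict time. A hedged alternative sketch that avoids the hard-coded length, assuming the reader keeps a reference to its dataset (e.g. self.dataset, exposing the same num_labels used by predict.py above):

    # Sketch only (self.dataset is an assumption about the reader's attributes):
    # derive the placeholder length from the dataset instead of hard-coding six,
    # since label values are ignored in the predict phase anyway.
    if phase == "predict":
        label_ids = [0] * self.dataset.num_labels
    else:
        label_ids = [self.label_map[label] for label in example.label]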
...
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
...
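Note: the original call passed "UTF-8" positionally, where io.open expects the integer buffering argument, so it would raise a TypeError; passing it as the encoding keyword fixes that. A minimal standalone sketch of the corrected pattern (the vocab.txt path is hypothetical):

    import collections
    import io

    # "UTF-8" must be given as the encoding keyword; the third positional
    # argument of io.open is buffering and only accepts an integer.
    vocab = collections.OrderedDict()
    with io.open("vocab.txt", "r", encoding="UTF-8") as fin:
        for num, line in enumerate(fin):
            token = line.rstrip("\n").split("\t")[0]
            vocab[token] = num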