Commit 57dbe135 authored by Zeyu Chen

add finetune.py and networks.py

Parent 5b884607
......@@ -101,10 +101,10 @@ def test_hub_api(args, config):
input_dict["sent_ids"].name, input_dict["input_mask"].name,
label.name
]
task = hub.append_mlp_classifier(
cls_task = hub.append_mlp_classifier(
pooled_output, label, num_classes=num_labels)
hub.finetune_and_eval(task, feed_list, processor, config)
hub.finetune_and_eval(cls_task, feed_list, processor, config)
if __name__ == '__main__':
......
......@@ -17,7 +17,7 @@ import types
import csv
import numpy as np
import tokenization
from batching import prepare_batch_data
from .batching import prepare_batch_data
class DataProcessor(object):
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import os
import numpy as np
import types
import gzip
import logging
import re
import six
import collections
import tokenization
import paddle
import paddle.fluid as fluid
from batching import prepare_batch_data
class DataReader(object):
def __init__(self,
data_dir,
vocab_path,
batch_size=4096,
in_tokens=True,
max_seq_len=512,
shuffle_files=True,
epoch=100,
voc_size=0,
is_test=False,
generate_neg_sample=False):
self.vocab = self.load_vocab(vocab_path)
self.data_dir = data_dir
self.batch_size = batch_size
self.in_tokens = in_tokens
self.shuffle_files = shuffle_files
self.epoch = epoch
self.current_epoch = 0
self.current_file_index = 0
self.total_file = 0
self.current_file = None
self.voc_size = voc_size
self.max_seq_len = max_seq_len
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.is_test = is_test
self.generate_neg_sample = generate_neg_sample
if self.in_tokens:
assert self.batch_size >= self.max_seq_len, "The number of " \
"tokens in batch should not be smaller than max seq length."
if self.is_test:
self.epoch = 1
self.shuffle_files = False
def get_progress(self):
"""return current progress of traning data
"""
return self.current_epoch, self.current_file_index, self.total_file, self.current_file
def parse_line(self, line, max_seq_len=512):
""" parse one line to token_ids, sentence_ids, pos_ids, label
"""
line = line.strip().split(";")
assert len(line) == 4, "One sample must have 4 fields!"
(token_ids, sent_ids, pos_ids, label) = line
token_ids = [int(token) for token in token_ids.split(" ")]
sent_ids = [int(token) for token in sent_ids.split(" ")]
pos_ids = [int(token) for token in pos_ids.split(" ")]
assert len(token_ids) == len(sent_ids) == len(
pos_ids
), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
label = int(label)
if len(token_ids) > max_seq_len:
return None
return [token_ids, sent_ids, pos_ids, label]
def read_file(self, file):
assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
file_path = self.data_dir + "/" + file
with gzip.open(file_path, "rb") as f:
for line in f:
parsed_line = self.parse_line(
line, max_seq_len=self.max_seq_len)
if parsed_line is None:
continue
yield parsed_line
def convert_to_unicode(self, text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(self, vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
        with open(vocab_file) as fin:
            for num, line in enumerate(fin):
                items = self.convert_to_unicode(line.strip()).split("\t")
                if len(items) > 2:
                    break
                token = items[0]
                index = items[1] if len(items) == 2 else num
                token = token.strip()
                vocab[token] = int(index)
        return vocab
def random_pair_neg_samples(self, pos_samples):
""" randomly generate negtive samples using pos_samples
Args:
pos_samples: list of positive samples
Returns:
neg_samples: list of negtive samples
"""
np.random.shuffle(pos_samples)
num_sample = len(pos_samples)
neg_samples = []
miss_num = 0
for i in range(num_sample):
pair_index = (i + 1) % num_sample
origin_src_ids = pos_samples[i][0]
origin_sep_index = origin_src_ids.index(2)
pair_src_ids = pos_samples[pair_index][0]
pair_sep_index = pair_src_ids.index(2)
src_ids = origin_src_ids[:origin_sep_index +
1] + pair_src_ids[pair_sep_index + 1:]
if len(src_ids) >= self.max_seq_len:
miss_num += 1
continue
sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
1
] * len(pair_src_ids[pair_sep_index + 1:])
pos_ids = list(range(len(src_ids)))
neg_sample = [src_ids, sent_ids, pos_ids, 0]
            assert len(src_ids) == len(sent_ids) == len(
                pos_ids
            ), "[ERROR] len(src_ids) == len(sent_ids) == len(pos_ids) must hold"
neg_samples.append(neg_sample)
return neg_samples, miss_num
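    # Worked example of the pairing above (ids illustrative, [SEP] id is 2):
    #   sample i:     src_ids = [0, 11, 12, 2, 21, 22, 2]
    #   sample i + 1: src_ids = [0, 31, 32, 2, 41, 42, 2]
    # yields the negative sample src_ids = [0, 11, 12, 2, 41, 42, 2],
    # i.e. the first segment of sample i joined to the second segment of
    # sample i + 1, with sent_ids rebuilt as 0s then 1s and the label set to 0.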
def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
""" 1. generate negtive samples by randomly group sentence_1 and sentence_2 of positive samples
2. combine negtive samples and positive samples
Args:
pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
Returns:
sample: one sample from shuffled positive samples and negtive samples
"""
pos_samples = []
num_total_miss = 0
pos_sample_num = 0
try:
while True:
while len(pos_samples) < buffer:
pos_sample = next(pos_sample_generator)
label = pos_sample[3]
assert label == 1, "positive sample's label must be 1"
pos_samples.append(pos_sample)
pos_sample_num += 1
neg_samples, miss_num = self.random_pair_neg_samples(
pos_samples)
num_total_miss += miss_num
samples = pos_samples + neg_samples
pos_samples = []
np.random.shuffle(samples)
for sample in samples:
yield sample
except StopIteration:
print("stopiteration: reach end of file")
if len(pos_samples) == 1:
yield pos_samples[0]
elif len(pos_samples) == 0:
yield None
else:
neg_samples, miss_num = self.random_pair_neg_samples(
pos_samples)
num_total_miss += miss_num
samples = pos_samples + neg_samples
pos_samples = []
np.random.shuffle(samples)
for sample in samples:
yield sample
print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
(num_total_miss, pos_sample_num * 2, num_total_miss /
(pos_sample_num * 2)))
def data_generator(self):
"""
data_generator
"""
files = os.listdir(self.data_dir)
self.total_file = len(files)
assert self.total_file > 0, "[Error] data_dir is empty"
def wrapper():
def reader():
for epoch in range(self.epoch):
self.current_epoch = epoch + 1
if self.shuffle_files:
np.random.shuffle(files)
for index, file in enumerate(files):
self.current_file_index = index + 1
self.current_file = file
sample_generator = self.read_file(file)
if not self.is_test and self.generate_neg_sample:
sample_generator = self.mixin_negtive_samples(
sample_generator)
for sample in sample_generator:
if sample is None:
continue
yield sample
def batch_reader(reader, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for parsed_line in reader():
token_ids, sent_ids, pos_ids, label = parsed_line
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(parsed_line)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [
parsed_line
], len(token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
for batch_data, total_token_num in batch_reader(
reader, self.batch_size, self.in_tokens):
yield prepare_batch_data(
batch_data,
total_token_num,
voc_size=self.voc_size,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=self.mask_id,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
return wrapper
if __name__ == "__main__":
pass
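A minimal usage sketch for the reader above, assuming a directory of gzipped
sample files and a vocab file in the "token\tid" format that load_vocab
expects; the paths and sizes here are placeholders:

reader = DataReader(
    data_dir="./pretrain_data",
    vocab_path="./vocab.txt",
    batch_size=4096,
    in_tokens=True,  # batch_size counts tokens, so it must be >= max_seq_len
    voc_size=30000,
    generate_neg_sample=True)
train_reader = reader.data_generator()  # returns the wrapper function
for batch in train_reader():
    # each batch arrives already padded and masked by prepare_batch_data
    pass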
......@@ -26,4 +26,7 @@ from .tools.logger import logger
from .tools.paddle_helper import connect_program
from .io.type import DataType
from .hub_server import default_hub_server
from .finetune.task import append_mlp_classifier, finetune_and_eval
from .finetune.network import append_mlp_classifier
from .finetune.finetune import finetune_and_eval
from .finetune.config import FinetuneConfig
from .finetune.task import Task
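With these exports, a fine-tuning script can drive the whole flow from the
package root. A sketch of the intended call pattern, where pooled_output,
label, feed_list, processor and config are assumed to come from the
pretrained-model setup shown in the test_hub_api hunk above:

import paddle_hub as hub

# pooled_output, label, feed_list, processor, config: from upstream setup
cls_task = hub.append_mlp_classifier(pooled_output, label, num_classes=2)
hub.finetune_and_eval(cls_task, feed_list, processor, config)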
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.fluid as fluid
import time
import numpy as np
import multiprocessing
from paddle_hub.finetune.optimization import bert_optimization
from paddle_hub.finetune.config import FinetuneConfig
def finetune_and_eval(task, feed_list, data_processor, config=None):
# environment setup
if config.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# data generator
data_generator = {
'train':
data_processor.data_generator(
batch_size=config.batch_size,
phase='train',
epoch=config.epoch,
shuffle=False),
'test':
data_processor.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False),
'dev':
data_processor.data_generator(
batch_size=config.batch_size, phase='dev', shuffle=False)
}
    # hub.finetune_and_eval starts here
    # TODO: simplify
loss = task.variable("loss")
probs = task.variable("probs")
accuracy = task.variable("accuracy")
num_example = task.variable("num_example")
num_train_examples = data_processor.get_num_examples(phase='train')
if config.in_tokens:
max_train_steps = config.epoch * num_train_examples // (
config.batch_size // config.max_seq_len) // dev_count
else:
max_train_steps = config.epoch * num_train_examples // config.batch_size // dev_count
warmup_steps = int(max_train_steps * config.warmup_proportion)
# obtain main program from Task class
train_program = task.main_program()
startup_program = task.startup_program()
# clone test program before optimize
test_program = train_program.clone(for_test=True)
bert_optimization(loss, warmup_steps, max_train_steps, config.learning_rate,
train_program, config.weight_decay)
# memory optimization
fluid.memory_optimize(
input_program=train_program,
skip_opt_set=[
# skip task graph variable memory optimization
loss.name,
probs.name,
accuracy.name,
num_example.name
])
    # keep the device chosen above (CPU or GPU) rather than forcing CUDAPlace(0)
    exe = fluid.Executor(place)
exe.run(startup_program)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
    # Training block
# prepare training dataset
train_data_generator = data_generator['train']
total_loss, total_acc, total_num_example = [], [], []
step = 0
time_begin = time.time()
train_time_used = 0.0
for example in train_data_generator():
step += 1
train_time_begin = time.time()
np_loss, np_acc, np_num_example = exe.run(
program=train_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
train_time_used += time.time() - train_time_begin
# Statistic Block
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
if step % config.stat_interval == 0:
# get training progress
accum_num_example = np.sum(total_num_example)
print("step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
step,
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example,
config.stat_interval / train_time_used))
# reset statistic variables
total_loss, total_acc, total_num_example = [], [], []
train_time_used = 0.0
# Evaluation block
if step % config.eval_interval == 0:
            evaluate(test_program, exe, feeder, data_generator,
                     [loss, accuracy, num_example])
    # Final Test Block: evaluate on the test set once training finishes
    total_loss, total_acc, total_num_example = [], [], []
    test_data_generator = data_generator['test']
    for example in test_data_generator():
        np_loss, np_acc, np_num_example = exe.run(
            program=test_program,
            feed=feeder.feed([example]),
            fetch_list=[loss, accuracy, num_example])
        total_loss.extend(np_loss * np_num_example)
        total_acc.extend(np_acc * np_num_example)
        total_num_example.extend(np_num_example)
    accum_num_example = np.sum(total_num_example)
    print("[Final Test] loss={:.5f} acc={:.5f}".format(
        np.sum(total_loss) / accum_num_example,
        np.sum(total_acc) / accum_num_example))
def evaluate(test_program, exe, feeder, data_generator, fetch_list):
print("Evaluation start")
total_loss, total_acc, total_num_example = [], [], []
dev_data_generator = data_generator['dev']
eval_step = 0
eval_time_begin = time.time()
for example in dev_data_generator():
eval_step += 1
        np_loss, np_acc, np_num_example = exe.run(
            program=test_program,
            feed=feeder.feed([example]),
            fetch_list=fetch_list)
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
eval_time_used = time.time() - eval_time_begin
accum_num_example = np.sum(total_num_example)
print("[Evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example, eval_step / eval_time_used))
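finetune_and_eval reads a fixed set of attributes off the config object. A
stand-in with those fields (values illustrative; the real class is the
FinetuneConfig exported from finetune/config.py) would look like:

class ConfigSketch(object):
    use_cuda = True
    batch_size = 32
    epoch = 3
    in_tokens = False          # batch_size counts samples, not tokens
    max_seq_len = 128
    warmup_proportion = 0.1    # fraction of max_train_steps spent warming up
    learning_rate = 5e-5
    weight_decay = 0.01
    stat_interval = 10         # steps between training loss/acc logs
    eval_interval = 100        # steps between dev-set evaluations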
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from .task import Task
__all__ = ['append_mlp_classifier']
def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
"""
Append a multi-layer perceptron classifier for binary classification base
on input feature
"""
cls_feats = fluid.layers.dropout(
x=feature, dropout_prob=0.1, dropout_implementation="upscale_in_train")
# append fully connected layer according to hidden_units
if hidden_units is not None:
for n_hidden in hidden_units:
cls_feats = fluid.layers.fc(input=cls_feats, size=n_hidden)
logits = fluid.layers.fc(
input=cls_feats,
size=num_classes,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=label, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
num_example = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=label, total=num_example)
# TODO: encapsulate to Task
graph_var_dict = {
"loss": loss,
"probs": probs,
"accuracy": accuracy,
"num_example": num_example
}
task = Task("text_classification", graph_var_dict,
fluid.default_main_program(), fluid.default_startup_program())
return task
def append_mlp_multi_classifier(feature,
label,
num_classes,
hidden_units=None,
act=None):
pass
def append_sequence_labeler(feature, label):
    pass
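A hedged usage sketch for append_mlp_classifier: given a pooled feature and an
int64 label from an upstream program (both built here only for illustration),
the helper returns a Task wrapping loss, probs, accuracy and num_example:

import paddle.fluid as fluid

feature = fluid.layers.data(name="pooled_output", shape=[768], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cls_task = append_mlp_classifier(
    feature, label, num_classes=2, hidden_units=[128])
loss = cls_task.variable("loss")  # later fetched by finetune_and_eval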
......@@ -19,192 +19,9 @@ import time
import numpy as np
import multiprocessing
from paddle_hub.tools.logger import logger
from paddle_hub.finetune.optimization import bert_optimization
from paddle_hub.finetune.config import FinetuneConfig
__all__ = ['append_mlp_classifier']
def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
cls_feats = fluid.layers.dropout(
x=feature, dropout_prob=0.1, dropout_implementation="upscale_in_train")
# append fully connected layer according to hidden_units
    if hidden_units is not None:
for n_hidden in hidden_units:
cls_feats = fluid.layers.fc(input=cls_feats, size=n_hidden)
logits = fluid.layers.fc(
input=cls_feats,
size=num_classes,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=label, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
num_example = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=label, total=num_example)
# TODO: encapsulate to Task
graph_var_dict = {
"loss": loss,
"probs": probs,
"accuracy": accuracy,
"num_example": num_example
}
task = Task("text_classification", graph_var_dict,
fluid.default_main_program(), fluid.default_startup_program())
return task
def finetune_and_eval(task, feed_list, data_processor, config=None):
if config.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# data generator
data_generator = {
'train':
data_processor.data_generator(
batch_size=config.batch_size,
phase='train',
epoch=config.epoch,
shuffle=False),
'test':
data_processor.data_generator(
batch_size=config.batch_size, phase='test', shuffle=False),
'dev':
data_processor.data_generator(
batch_size=config.batch_size, phase='dev', shuffle=False)
}
    # hub.finetune_and_eval starts here
    # TODO: simplify
loss = task.variable("loss")
probs = task.variable("probs")
accuracy = task.variable("accuracy")
num_example = task.variable("num_example")
num_train_examples = data_processor.get_num_examples(phase='train')
if config.in_tokens:
max_train_steps = config.epoch * num_train_examples // (
config.batch_size // config.max_seq_len) // dev_count
else:
max_train_steps = config.epoch * num_train_examples // config.batch_size // dev_count
warmup_steps = int(max_train_steps * config.warmup_proportion)
# obtain main program from Task class
train_program = task.main_program()
startup_program = task.startup_program()
# clone test program before optimize
test_program = train_program.clone(for_test=True)
bert_optimization(loss, warmup_steps, max_train_steps, config.learning_rate,
train_program, config.weight_decay)
# memory optimization
fluid.memory_optimize(
input_program=train_program,
skip_opt_set=[
# skip task graph variable memory optimization
loss.name,
probs.name,
accuracy.name,
num_example.name
])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_program)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
    # Training block
# prepare training dataset
train_data_generator = data_generator['train']
total_loss, total_acc, total_num_example = [], [], []
step = 0
time_begin = time.time()
train_time_used = 0.0
for example in train_data_generator():
step += 1
train_time_begin = time.time()
np_loss, np_acc, np_num_example = exe.run(
program=train_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
train_time_used += time.time() - train_time_begin
# Statistic Block
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
if step % config.stat_interval == 0:
# get training progress
accum_num_example = np.sum(total_num_example)
print("step {}: loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
step,
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example,
config.stat_interval / train_time_used))
# reset statistic variables
total_loss, total_acc, total_num_example = [], [], []
train_time_used = 0.0
# Evaluation block
if step % config.eval_interval == 0:
print("Evaluation start")
total_loss, total_acc, total_num_example = [], [], []
dev_data_generator = data_generator['dev']
eval_step = 0
eval_time_begin = time.time()
for example in dev_data_generator():
eval_step += 1
np_loss, np_acc, np_num_example = exe.run(
program=test_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
eval_time_used = time.time() - eval_time_begin
accum_num_example = np.sum(total_num_example)
print(
"[Evaluation] loss={:.5f} acc={:.5f} [step/sec: {:.2f}]".format(
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example,
eval_step / eval_time_used))
if step % config.eval_interval == 0:
# Final Test Block
total_loss, total_acc, total_num_example = [], [], []
test_data_generator = data_generator['test']
for example in test_data_generator():
np_loss, np_acc, np_num_example = exe.run(
program=test_program,
feed=feeder.feed([example]),
fetch_list=[loss, accuracy, num_example])
total_loss.extend(np_loss * np_num_example)
total_acc.extend(np_acc * np_num_example)
total_num_example.extend(np_num_example)
accum_num_example = np.sum(total_num_example)
print("[Final Test] loss={:.5f} acc={:.5f}".format(
np.sum(total_loss) / accum_num_example,
np.sum(total_acc) / accum_num_example))
class Task(object):
def __init__(self, task_type, graph_var_dict, main_program,
......
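The hunk above truncates the Task definition. From its call sites in this
commit (the constructor call in network.py plus task.variable("loss"),
task.main_program() and task.startup_program() in finetune.py), a minimal
shape consistent with that usage would be the following sketch, not the
committed code:

class Task(object):
    def __init__(self, task_type, graph_var_dict, main_program,
                 startup_program):
        self.task_type = task_type
        self.graph_var_dict = graph_var_dict
        self._main_program = main_program
        self._startup_program = startup_program

    def variable(self, var_name):
        # look up a graph variable registered by the network builder
        return self.graph_var_dict[var_name]

    def main_program(self):
        return self._main_program

    def startup_program(self):
        return self._startup_program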