提交 601db3ab 编写于 作者: X xyzhou-puck

add bert, refine text.py

上级 d90f7cc6
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import os
import io
import time
import argparse
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from hapi.text.text import PrePostProcessLayer
from hapi.text.bert.bert import BertConfig
from cls import ClsModelLayer
from hapi.text.bert.optimization import Optimizer
from hapi.text.bert.utils.args import ArgumentGroup, print_arguments, check_cuda
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from hapi.metrics import Accuracy
from hapi.text.bert.dataloader import SingleSentenceDataLoader, BertInputExample
import hapi.text.tokenizer.tokenization as tokenization
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, "./config/bert_config.json", "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1, "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Tokens' number of the longest seqence allowed.")
data_g.add_arg("batch_size", int, 32,
"The total number of examples in one batch for training, see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 5512, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("task_name", str, None,
"The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("use_data_parallel", bool, False, "The flag indicating whether to shuffle instances in each pass.")
run_type_g.add_arg("enable_ce", bool, False, help="The flag indicating whether to run the task for continuous evaluation.")
args = parser.parse_args()
def create_data(batch):
"""
convert data to variable
"""
src_ids = to_variable(batch[0], "src_ids")
position_ids = to_variable(batch[1], "position_ids")
sentence_ids = to_variable(batch[2], "sentence_ids")
input_mask = to_variable(batch[3], "input_mask")
labels = to_variable(batch[4], "labels")
labels.stop_gradient = True
return src_ids, position_ids, sentence_ids, input_mask, labels
def train(args):
device = set_device("gpu" if args.use_cuda else "cpu")
fluid.enable_dygraph(device)
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_test):
raise ValueError("For args `do_train`, `do_test`, at "
"least one of them must be True.")
trainer_count = fluid.dygraph.parallel.Env().nranks
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
def mnli_line_processor(line_id, line):
if line_id == "0":
return None
uid = tokenization.convert_to_unicode(line[0])
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
label = tokenization.convert_to_unicode(line[-1])
if label not in ["contradiction", "entailment", "neutral"]:
label = "contradiction"
return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
bert_dataloader = SingleSentenceDataLoader("./data/glue_data/MNLI/train.tsv", tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=64, batch_size=32, line_processor=mnli_line_processor)
num_train_examples = len(bert_dataloader.dataset)
max_train_steps = args.epoch * num_train_examples // args.batch_size // trainer_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
print("Trainer count: %d" % trainer_count)
print("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
inputs = [Input([None, None], 'int64', name='src_ids'),
Input([None, None], 'int64', name='pos_ids'),
Input([None, None], 'int64', name='sent_ids'),
Input([None, None], 'float32', name='input_mask')]
labels = [Input([None, 1], 'int64', name='label')]
cls_model = ClsModelLayer(
args,
bert_config,
3,
is_training=True,
return_pooled_out=True)
optimizer = Optimizer(
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
model_cls=cls_model,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
loss_scaling=args.loss_scaling,
parameter_list=cls_model.parameters())
cls_model.prepare(
optimizer,
SoftmaxWithCrossEntropy(),
Accuracy(topk=(1, 2)),
inputs,
labels,
device=device)
cls_model.bert_layer.init_parameters(args.init_pretraining_params, verbose=True)
cls_model.fit(train_data=bert_dataloader.dataloader, epochs=args.epoch)
return cls_model
if __name__ == '__main__':
print_arguments(args)
check_cuda(args.use_cuda)
if args.do_train:
cls_model = train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert.bert import BertEncoder
from hapi.model import Model
class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
is_training=True,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
#src_ids = data_ids[0]
#position_ids = data_ids[1]
#sentence_ids = data_ids[2]
#input_mask = data_ids[3]
#labels = data_ids[4]
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
cls_feats = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logits = self.cls_fc(cls_feats)
return logits
"""
logits = self.cls_fc(cls_feats)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logits, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
if self.use_fp16 and self.loss_scaling > 1.0:
loss *= self.loss_scaling
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
"""
return loss, accuracy
#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=0
# start fine-tuning
python3.7 bert_classifier.py\
--task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import copy
from hapi.progressbar import ProgressBar
from paddle.fluid.dygraph.parallel import ParallelEnv
def config_callbacks(callbacks=None,
model=None,
batch_size=None,
epochs=None,
steps=None,
log_freq=2,
verbose=2,
save_freq=1,
save_dir=None,
metrics=None,
mode='train'):
cbks = callbacks or []
cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
if not any(isinstance(k, ModelCheckpoint) for k in cbks):
cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
cbk_list = CallbackList(cbks)
cbk_list.set_model(model)
metrics = metrics or [] if mode != 'test' else []
params = {
'batch_size': batch_size,
'epochs': epochs,
'steps': steps,
'verbose': verbose,
'metrics': metrics,
}
cbk_list.set_params(params)
return cbk_list
class CallbackList(object):
def __init__(self, callbacks=None):
# copy
self.callbacks = [c for c in callbacks]
self.params = {}
self.model = None
def append(self, callback):
self.callbacks.append(callback)
def __iter__(self):
return iter(self.callbacks)
def set_params(self, params):
for c in self.callbacks:
c.set_params(params)
def set_model(self, model):
for c in self.callbacks:
c.set_model(model)
def _call(self, name, *args):
for c in self.callbacks:
func = getattr(c, name)
func(*args)
def _check_mode(self, mode):
assert mode in ['train', 'eval', 'test'], \
'mode should be train, eval or test'
def on_begin(self, mode, logs=None):
self._check_mode(mode)
name = 'on_{}_begin'.format(mode)
self._call(name, logs)
def on_end(self, mode, logs=None):
self._check_mode(mode)
name = 'on_{}_end'.format(mode)
self._call(name, logs)
def on_epoch_begin(self, epoch=None, logs=None):
self._call('on_epoch_begin', epoch, logs)
def on_epoch_end(self, epoch=None, logs=None):
self._call('on_epoch_end', epoch, logs)
def on_batch_begin(self, mode, step=None, logs=None):
self._check_mode(mode)
name = 'on_{}_batch_begin'.format(mode)
self._call(name, step, logs)
def on_batch_end(self, mode, step=None, logs=None):
self._check_mode(mode)
name = 'on_{}_batch_end'.format(mode)
self._call(name, step, logs)
class Callback(object):
def __init__(self):
self.model = None
self.params = {}
def set_params(self, params):
self.params = params
def set_model(self, model):
self.model = model
def on_train_begin(self, logs=None):
"""
"""
def on_train_end(self, logs=None):
"""
"""
def on_eval_begin(self, logs=None):
"""
"""
def on_eval_end(self, logs=None):
"""
"""
def on_test_begin(self, logs=None):
"""
"""
def on_test_end(self, logs=None):
"""
"""
def on_epoch_begin(self, epoch, logs=None):
"""
"""
def on_epoch_end(self, epoch, logs=None):
"""
"""
def on_train_batch_begin(self, step, logs=None):
"""
"""
def on_train_batch_end(self, step, logs=None):
"""
"""
def on_eval_batch_begin(self, step, logs=None):
"""
"""
def on_eval_batch_end(self, step, logs=None):
"""
"""
def on_eval_batch_begin(self, step, logs=None):
"""
"""
def on_eval_batch_end(self, step, logs=None):
"""
"""
class ProgBarLogger(Callback):
def __init__(self, log_freq=1, verbose=2):
self.epochs = None
self.steps = None
self.progbar = None
self.verbose = verbose
self.log_freq = log_freq
def on_train_begin(self, logs=None):
self.epochs = self.params['epochs']
assert self.epochs
self.train_metrics = self.params['metrics']
assert self.train_metrics
def on_epoch_begin(self, epoch=None, logs=None):
self.steps = self.params['steps']
self.epoch = epoch
self.train_step = 0
if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
print('Epoch %d/%d' % (epoch + 1, self.epochs))
self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
def _updates(self, logs, mode):
values = []
metrics = getattr(self, '%s_metrics' % (mode))
progbar = getattr(self, '%s_progbar' % (mode))
steps = getattr(self, '%s_step' % (mode))
for k in metrics:
if k in logs:
values.append((k, logs[k]))
progbar.update(steps, values)
def on_train_batch_end(self, step, logs=None):
logs = logs or {}
self.train_step += 1
if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end
if self.steps and self.train_step < self.steps:
self._updates(logs, 'train')
else:
self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
self._updates(logs, 'train')
def on_eval_begin(self, logs=None):
self.eval_steps = logs.get('steps', None)
self.eval_metrics = logs.get('metrics_name', [])
self.eval_step = 0
self.evaled_samples = 0
self.eval_progbar = ProgressBar(
num=self.eval_steps, verbose=self.verbose)
if ParallelEnv().local_rank == 0:
print('Eval begin...')
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
self.eval_step = step
samples = logs.get('batch_size', 1)
self.evaled_samples += samples
if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
).local_rank == 0:
# if steps is not None, last step will update in on_epoch_end
if self.eval_steps and self.eval_step < self.eval_steps:
self._updates(logs, 'eval')
def on_eval_end(self, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
self._updates(logs, 'eval')
print('Eval samples: %d' % (self.evaled_samples))
class ModelCheckpoint(Callback):
def __init__(self, save_freq=1, save_dir=None):
self.save_freq = save_freq
self.save_dir = save_dir
def on_epoch_begin(self, epoch=None, logs=None):
self.epoch = epoch
def _is_save(self):
return self.model and self.save_dir and ParallelEnv().local_rank == 0
def on_epoch_end(self, epoch, logs=None):
if self._is_save() and self.epoch % self.save_freq == 0:
path = '{}/{}'.format(self.save_dir, epoch)
print('save checkpoint at {}'.format(path))
self.model.save(path)
def on_train_end(self, logs=None):
if self._is_save():
path = '{}/final'.format(self.save_dir)
print('save checkpoint at {}'.format(path))
self.model.save(path)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import six
import time
import math
import socket
import contextlib
import numpy as np
from paddle import fluid
from paddle.fluid.layers import collective
from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
from paddle.fluid.io import BatchSampler
_parallel_context_initialized = False
class DistributedBatchSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such case, each process can pass a DistributedBatchSampler instance
as a DataLoader sampler, and load a subset of the original dataset that
is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Args:
data_source: this could be a `fluid.io.Dataset` implement
or other python object which implemented
`__len__` for BatchSampler to get sample
number of data source.
batch_size(int): sample indice number in a mini-batch indices.
shuffle(bool): whther to shuffle indices order before genrating
batch indices. Default False.
drop_last(bool): whether drop the last incomplete batch dataset size
is not divisible by the batch size. Default False
"""
def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
self.dataset = dataset
assert isinstance(batch_size, int) and batch_size > 0, \
"batch_size should be a positive integer"
self.batch_size = batch_size
assert isinstance(shuffle, bool), \
"shuffle should be a boolean value"
self.shuffle = shuffle
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
self.drop_last = drop_last
self.nranks = ParallelEnv().nranks
self.local_rank = ParallelEnv().local_rank
self.epoch = 0
self.num_samples = int(
math.ceil(len(self.dataset) * 1.0 / self.nranks))
self.total_size = self.num_samples * self.nranks
def __iter__(self):
num_samples = len(self.dataset)
indices = np.arange(num_samples).tolist()
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
if self.shuffle:
np.random.RandomState(self.epoch).shuffle(indices)
self.epoch += 1
# subsample
def _get_indices_by_batch_size(indices):
subsampled_indices = []
last_batch_size = self.total_size % (self.batch_size * self.nranks)
assert last_batch_size % self.nranks == 0
last_local_batch_size = last_batch_size // self.nranks
for i in range(self.local_rank * self.batch_size,
len(indices) - last_batch_size,
self.batch_size * self.nranks):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
subsampled_indices.extend(indices[
self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices
if self.nranks > 1:
indices = _get_indices_by_batch_size(indices)
assert len(indices) == self.num_samples
_sample_iter = iter(indices)
batch_indices = []
for idx in _sample_iter:
batch_indices.append(idx)
if len(batch_indices) == self.batch_size:
yield batch_indices
batch_indices = []
if not self.drop_last and len(batch_indices) > 0:
yield batch_indices
def __len__(self):
num_samples = self.num_samples
num_samples += int(not self.drop_last) * (self.batch_size - 1)
return num_samples // self.batch_size
def set_epoch(self, epoch):
self.epoch = epoch
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(
x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints):
assert not isinstance(endpoints, six.string_types)
while True:
all_ok = True
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
all_ok = False
not_ready_endpoints.append(ep)
if not all_ok:
time.sleep(3)
else:
break
def init_communicator(program, rank, nranks, wait_port, current_endpoint,
endpoints):
if nranks < 2:
return
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
wait_server_ready(other_endpoints)
block = program.global_block()
nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'),
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
block.append_op(
type='c_gen_nccl_id',
inputs={},
outputs={'Out': nccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
})
def prepare_distributed_context(place=None):
if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
else fluid.CUDAPlace(0)
strategy = ParallelStrategy()
strategy.nranks = ParallelEnv().nranks
strategy.local_rank = ParallelEnv().local_rank
strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
strategy.current_endpoint = ParallelEnv().current_endpoint
if strategy.nranks < 2:
return
global _parallel_context_initialized
if not _parallel_context_initialized and isinstance(place,
fluid.CUDAPlace):
def _init_context():
communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank,
strategy.nranks, True, strategy.current_endpoint,
strategy.trainer_endpoints)
exe = fluid.Executor(place)
exe.run(communicator_prog)
if fluid.in_dygraph_mode():
fluid.disable_dygraph()
_init_context()
fluid.enable_dygraph(place)
else:
_init_context()
else:
assert ("Only support CUDAPlace for now.")
_parallel_context_initialized = True
return strategy
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import six
import abc
import numpy as np
import paddle.fluid as fluid
import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
__all__ = ['Metric', 'Accuracy']
@six.add_metaclass(abc.ABCMeta)
class Metric(object):
"""
Base class for metric, encapsulates metric logic and APIs
Usage:
m = SomeMetric()
for prediction, label in ...:
m.update(prediction, label)
m.accumulate()
"""
@abc.abstractmethod
def reset(self):
"""
Reset states and result
"""
raise NotImplementedError("function 'reset' not implemented in {}.".
format(self.__class__.__name__))
@abc.abstractmethod
def update(self, *args, **kwargs):
"""
Update states for metric
"""
raise NotImplementedError("function 'update' not implemented in {}.".
format(self.__class__.__name__))
@abc.abstractmethod
def accumulate(self):
"""
Accumulates statistics, computes and returns the metric value
"""
raise NotImplementedError(
"function 'accumulate' not implemented in {}.".format(
self.__class__.__name__))
@abc.abstractmethod
def name(self):
"""
Returns metric name
"""
raise NotImplementedError("function 'name' not implemented in {}.".
format(self.__class__.__name__))
def add_metric_op(self, pred, label):
"""
Add process op for metric in program
"""
return pred, label
class Accuracy(Metric):
"""
Encapsulates accuracy metric logic
"""
def __init__(self, topk=(1, ), name=None, *args, **kwargs):
super(Accuracy, self).__init__(*args, **kwargs)
self.topk = topk
self.maxk = max(topk)
self._init_name(name)
self.reset()
def add_metric_op(self, pred, label, *args, **kwargs):
pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
correct = pred == label[0]
return correct
def update(self, correct, *args, **kwargs):
accs = []
for i, k in enumerate(self.topk):
num_corrects = correct[:, :k].sum()
num_samples = len(correct)
accs.append(float(num_corrects) / num_samples)
self.total[i] += num_corrects
self.count[i] += num_samples
return accs
def reset(self):
self.total = [0.] * len(self.topk)
self.count = [0] * len(self.topk)
def accumulate(self):
res = []
for t, c in zip(self.total, self.count):
res.append(float(t) / c)
return res
def _init_name(self, name):
name = name or 'acc'
if self.maxk != 1:
self._name = ['{}_top{}'.format(name, k) for k in self.topk]
else:
self._name = ['acc']
def name(self):
return self._name
此差异已折叠。
import sys
import time
import numpy as np
class ProgressBar(object):
"""progress bar """
def __init__(self,
num=None,
width=30,
verbose=1,
start=True,
file=sys.stdout):
self._num = num
if isinstance(num, int) and num <= 0:
raise TypeError('num should be None or integer (> 0)')
max_width = self._get_max_width()
self._width = width if width <= max_width else max_width
self._total_width = 0
self._verbose = verbose
self.file = file
self._values = {}
self._values_order = []
if start:
self._start = time.time()
self._last_update = 0
self._dynamic_display = (
(hasattr(self.file, 'isatty') and
self.file.isatty()) or 'ipykernel' in sys.modules or
'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
def _get_max_width(self):
if sys.version_info > (3, 3):
from shutil import get_terminal_size
else:
from backports.shutil_get_terminal_size import get_terminal_size
terminal_width, _ = get_terminal_size()
max_width = min(int(terminal_width * 0.6), terminal_width - 50)
return max_width
def start(self):
self.file.flush()
self._start = time.time()
def update(self, current_num, values=None):
now = time.time()
if current_num:
time_per_unit = (now - self._start) / current_num
else:
time_per_unit = 0
if time_per_unit >= 1 or time_per_unit == 0:
fps = ' - %.0fs/%s' % (time_per_unit, 'step')
elif time_per_unit >= 1e-3:
fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
else:
fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
info = ''
if self._verbose == 1:
prev_total_width = self._total_width
if self._dynamic_display:
sys.stdout.write('\b' * prev_total_width)
sys.stdout.write('\r')
else:
sys.stdout.write('\n')
if self._num is not None:
numdigits = int(np.log10(self._num)) + 1
bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
current_num, self._num)
prog = float(current_num) / self._num
prog_width = int(self._width * prog)
if prog_width > 0:
bar_chars += ('=' * (prog_width - 1))
if current_num < self._num:
bar_chars += '>'
else:
bar_chars += '='
bar_chars += ('.' * (self._width - prog_width))
bar_chars += ']'
else:
bar_chars = 'step %3d' % current_num
self._total_width = len(bar_chars)
sys.stdout.write(bar_chars)
for k, val in values:
info += ' - %s:' % k
val = val if isinstance(val, list) else [val]
for i, v in enumerate(val):
if isinstance(v, (float, np.float32, np.float64)):
if abs(v) > 1e-3:
info += ' %.4f' % v
else:
info += ' %.4e' % v
else:
info += ' %s' % v
if self._num is not None and current_num < self._num:
eta = time_per_unit * (self._num - current_num)
if eta > 3600:
eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
60, eta % 60)
elif eta > 60:
eta_format = '%d:%02d' % (eta // 60, eta % 60)
else:
eta_format = '%ds' % eta
info += ' - ETA: %s' % eta_format
info += fps
self._total_width += len(info)
if prev_total_width > self._total_width:
info += (' ' * (prev_total_width - self._total_width))
# newline for another epoch
if self._num is not None and current_num >= self._num:
info += '\n'
if self._num is None:
info += '\n'
sys.stdout.write(info)
sys.stdout.flush()
self._last_update = now
elif self._verbose == 2:
if self._num:
numdigits = int(np.log10(self._num)) + 1
count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
self._num)
else:
count = 'step %3d' % current_num
info = count + info
for k, val in values:
info += ' - %s:' % k
val = val if isinstance(val, list) else [val]
for v in val:
if isinstance(v, (float, np.float32, np.float64)):
if abs(v) > 1e-3:
info += ' %.4f' % v
else:
info += ' %.4e' % v
elif isinstance(v, np.ndarray) and \
v.size == 1 and \
isinstance(v.dtype, (np.float32, np.float64)):
if abs(v[0]) > 1e-3:
info += ' %.4f' % v[0]
else:
info += ' %.4e' % v[0]
else:
info += ' %s' % v
info += fps
info += '\n'
sys.stdout.write(info)
sys.stdout.flush()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded;
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least mask one word in a sentence
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_pos_ids = [inst[1] for inst in insts]
batch_sent_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"bert"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from hapi.text.text import PrePostProcessLayer, TransformerEncoder
from hapi.text.bert.utils.init import init_from_static_model
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class BertEncoder(Layer):
"""
bert
"""
def __init__(self, config, return_pooled_out=True, use_fp16=False):
super(BertEncoder, self).__init__()
self.config = config
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self.return_pooled_out = return_pooled_out
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._pos_emb = Embedding(
size=[self._max_position_seq_len, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._sent_emb = Embedding(
size=[self._sent_types, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
act="tanh")
self.pre_process_layer = PrePostProcessLayer(
"nd", self._emb_size, self._prepostprocess_dropout, None)
self._encoder = TransformerEncoder(
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
preprocess_cmd="",
postprocess_cmd="dan",
ffn_fc1_act=self._hidden_act)
def init_parameters(self, param_path="", verbose=False):
init_from_static_model(param_path, self, self.config, verbose)
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
emb_out = src_emb + pos_emb
emb_out = emb_out + sent_emb
emb_out = self.pre_process_layer(emb_out)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_output = self._encoder(emb_out, n_head_self_attn_mask)
if not self.return_pooled_out:
return enc_output
next_sent_feat = fluid.layers.slice(
input=enc_output, axes=[1], starts=[0], ends=[1])
next_sent_feat = self.pooled_fc(next_sent_feat)
next_sent_feat = fluid.layers.reshape(
next_sent_feat, shape=[-1, self._emb_size])
return enc_output, next_sent_feat
此差异已折叠。
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import six
import csv
import glob
import tarfile
import itertools
from functools import partial
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import BatchSampler, DataLoader, Dataset
from hapi.distributed import DistributedBatchSampler
from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
from hapi.text.bert.batching import prepare_batch_data
import hapi.text.tokenizer.tokenization as tokenization
__all__ = [
'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
'SentencePairDataset'
]
class BertInputExample(object):
def __init__(self, uid, text_a, text_b=None, label=None):
self.uid = uid
self.text_a = text_a
self.text_b = text_b
self.label = label
class BertInputFeatures(object):
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.pos_ids = list(range(len(self.input_ids)))
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `BertInputExample` into a single `BertInputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = BertInputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
def _read_tsv(input_file, delimiter="\t", quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
class SingleSentenceDataset(Dataset):
def __init__(self,
tokenizer,
label_list,
max_seq_length,
mode="all_in_memory"):
assert isinstance(mode,
str), "mode of SingleSentenceDataset should be str"
assert mode in [
"all_in_memory", "leveldb"
], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but get" % mode
self.examples = []
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 2
text_a = line[0]
label = line[1]
return BertInputExample(
str(line_id), text_a=text_a, text_b=None, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def __getitem__(self, idx):
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
def __len__(self):
return len(self.examples)
class SentencePairDataset(Dataset):
def __init__(self,
tokenizer,
label_ist,
max_seq_length,
mode="all_in_memory"):
assert isinstance(mode,
str), "mode of SentencePairDataset should be str"
assert mode in [
"all_in_memory", "leveldb"
], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but get" % mode
self.examples = []
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 3
text_a = line[0]
text_b = line[1]
label = line[2]
return BertInputExample(
str(line_id), text_a=text_a, text_b=text_b, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def __getitem__(self, idx):
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
def __len__(self):
return len(self.examples)
def _prepare_train_batch(insts,
vocab_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
return prepare_batch_data(
insts,
0,
voc_size=vocab_size,
pad_id=pad_id,
cls_id=cls_id,
sep_id=sep_id,
mask_id=mask_id,
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
class SingleSentenceDataLoader(object):
def __init__(self,
input_file,
tokenizer,
label_list,
max_seq_length,
batch_size,
shuffle=False,
drop_last=False,
mode="all_in_memory",
line_processor=None,
delimiter="\t",
quotechar=None,
device=fluid.CPUPlace(),
num_workers=0,
return_list=True):
self.dataset = SingleSentenceDataset(tokenizer, label_list,
max_seq_length, mode)
if mode == "all_in_memory":
self.dataset.load_all_data_in_memory(
input_file, label_list, max_seq_length, tokenizer,
line_processor, delimiter, quotechar)
elif mode == "leveldb":
#TODO add leveldb reader
pass
else:
raise ValueError("mode should be in [all_in_memory, leveldb]")
self.sampler = DistributedBatchSampler(
self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
self.dataloader = DataLoader(
dataset=self.dataset,
batch_sampler=self.sampler,
places=device,
collate_fn=partial(
_prepare_train_batch,
vocab_size=-1,
pad_id=tokenizer.vocab["[PAD]"],
cls_id=tokenizer.vocab["[CLS]"],
sep_id=tokenizer.vocab["[SEP]"],
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False),
num_workers=num_workers,
return_list=return_list)
if __name__ == "__main__":
print("hello world.")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
super(ConstantLR, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
def step(self):
return self.learning_rate
class LinearDecay(LearningRateDecay):
def __init__(self,
learning_rate,
warmup_steps,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(LinearDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
if self.step_num < self.warmup_steps:
decayed_lr = self.learning_rate * (self.step_num /
self.warmup_steps)
decayed_lr = self.create_lr_var(decayed_lr)
else:
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = fluid.layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(
tmp_step_num
if tmp_step_num < self.decay_steps else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
class Optimizer(object):
def __init__(self,
warmup_steps,
num_train_steps,
learning_rate,
model_cls,
weight_decay,
scheduler='linear_warmup_decay',
loss_scaling=1.0,
parameter_list=None):
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
self.learning_rate = learning_rate
self.model_cls = model_cls
self.weight_decay = weight_decay
self.scheduler = scheduler
self.loss_scaling = loss_scaling
self.parameter_list = parameter_list
self.scheduled_lr = 0.0
self.optimizer = self.lr_schedule()
def lr_schedule(self):
if self.warmup_steps > 0:
if self.scheduler == 'noam_decay':
self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
self.warmup_steps * (self.learning_rate**2)),
self.warmup_steps)
elif self.scheduler == 'linear_warmup_decay':
self.scheduled_lr = LinearDecay(self.learning_rate,
self.warmup_steps,
self.num_train_steps, 0.0)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
else:
self.scheduled_lr = ConstantLR(self.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
return optimizer
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
def minimize(self, loss, use_data_parallel=False, model=None):
param_list = dict()
clip_norm_thres = 1.0
#grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
if self.weight_decay > 0:
for param in self.model_cls.parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_data_parallel:
assert model is not None
model.apply_collective_grads()
#_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
_, param_grads = self.optimizer.minimize(loss)
if self.weight_decay > 0:
for param, grad in param_grads:
if self.exclude_from_weight_decay(param.name):
continue
if isinstance(self.scheduled_lr.step(), float):
updated_param = param.numpy() - param_list[
param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step()
else:
updated_param = param.numpy(
) - param_list[param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step().numpy()
updated_param_var = fluid.dygraph.to_variable(updated_param)
param = updated_param_var
#param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
use_dynamic_loss_scaling=False,
init_loss_scaling=1.0,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8):
scheduled_lr, loss_scaling = None, None
if scheduler == 'noam_decay':
if warmup_steps > 0:
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
else:
print(
"WARNING: noam decay of learning rate should have postive warmup "
"steps but given {}, using constant learning rate instead!"
.format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
elif scheduler == 'linear_warmup_decay':
if warmup_steps > 0:
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
print(
"WARNING: linear warmup decay of learning rate should have "
"postive warmup steps but given {}, use constant learning rate "
"instead!".format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
def exclude_from_weight_decay(param):
name = param.name.rstrip(".master")
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
if use_fp16:
loss_scaling = fluid.layers.create_global_var(
name=fluid.unique_name.generate("loss_scaling"),
shape=[1],
value=init_loss_scaling,
dtype='float32',
persistable=True)
loss *= loss_scaling
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
if weight_decay > 0:
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_dynamic_loss_scaling:
apply_dynamic_loss_scaling(
loss_scaling, master_param_grads, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
if weight_decay > 0:
for param in train_program.all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr, loss_scaling
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.bert.utils.args import str2bool as str2bool
from hapi.text.bert.utils.args import ArgumentGroup as ArgumentGroup
from hapi.text.bert.utils.args import print_arguments as print_arguments
from hapi.text.bert.utils.args import check_cuda as check_cuda
from hapi.text.bert.utils.cards import get_cards as get_cards
from hapi.text.bert.utils.fp16 import cast_fp16_to_fp32 as cast_fp16_to_fp32
from hapi.text.bert.utils.fp16 import cast_fp32_to_fp16 as cast_fp32_to_fp16
from hapi.text.bert.utils.fp16 import copy_to_master_param as copy_to_master_param
from hapi.text.bert.utils.fp16 import create_master_params_grads as create_master_params_grads
from hapi.text.bert.utils.fp16 import master_param_to_train_param as master_param_to_train_param
from hapi.text.bert.utils.init import init_checkpoint as init_checkpoint
from hapi.text.bert.utils.init import init_pretraining_params as init_pretraining_params
from hapi.text.bert.utils.init import init_from_static_model as init_from_static_model
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import argparse
import paddle.fluid as fluid
def str2bool(v):
# because argparse does not support to parse "true, False" as python
# boolean directly
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def check_cuda(use_cuda, err = \
"\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
try:
if use_cuda == True and fluid.is_compiled_with_cuda() == False:
print(err)
sys.exit(1)
except Exception as e:
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
"""
get gpu cards number
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os
def usage():
"""
usage information
"""
print
print("please use command: ")
print(
"python convert_static_to_dygraph.py input_params_dir output_params_dir"
)
print
def convert_static_to_dygraph(static_model_path, dygraph_model_path):
"""
convert paddle static bert model to dygraph model
"""
def mkdir(path):
if not os.path.isdir(path):
if os.path.split(path)[0]:
mkdir(os.path.split(path)[0])
else:
return
os.mkdir(path)
if os.path.exists(dygraph_model_path):
shutil.rmtree(dygraph_model_path)
mkdir(dygraph_model_path)
if not os.path.exists(static_model_path):
print("paddle static model path doesn't exist.....")
return -1
file_list = []
for root, dirs, files in os.walk(static_model_path):
file_list.extend(files)
os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
#os.chdir(static_model_path)
#convert embedding file
embedding_type = ["word", "pos", "sent"]
for i in range(3):
src_name = embedding_type[i] + "_embedding"
trg_name = "Embedding_" + str(i) + "." + src_name
shutil.copyfile(
os.path.join(static_model_path, src_name),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
#convert pre_encoder file
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
#convert mask lm params file
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.b_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.w_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
encoder_num = 0
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
if int(layer_num) > encoder_num:
encoder_num = int(layer_num)
encoder_num += 1
for i in range(encoder_num):
encoder_dir = "EncoderSubLayer_" + str(i)
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir +
"/PositionwiseFeedForwardLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
encoder_map_dict = {
"ffn_fc_0.b_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
"ffn_fc_0.w_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
"ffn_fc_1.b_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
"ffn_fc_1.w_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
"multi_head_att_key_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
"multi_head_att_key_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
"multi_head_att_output_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
"multi_head_att_output_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
"multi_head_att_query_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
"multi_head_att_query_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
"multi_head_att_value_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
"multi_head_att_value_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
"post_att_layer_norm_bias":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
"post_att_layer_norm_scale":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
"post_ffn_layer_norm_bias":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
"post_ffn_layer_norm_scale":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
}
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
suffix_name = "_".join(f.split('_')[3:])
in_dir = encoder_map_dict[suffix_name][0]
rename = encoder_map_dict[suffix_name][1]
encoder_layer = "EncoderSubLayer_" + layer_num
shutil.copyfile(
os.path.join(static_model_path, f),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
encoder_layer + "/" + in_dir + "/" + rename))
if __name__ == "__main__":
if len(sys.argv) < 3:
usage()
exit(1)
static_model_path = sys.argv[1]
dygraph_model_path = sys.argv[2]
convert_static_to_dygraph(static_model_path, dygraph_model_path)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import numpy as np
import paddle.fluid as fluid
def cast_fp32_to_fp16(exe, main_program):
print("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
def existed_persitables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persitables)
print("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_from_static_model(dir_path,
backbone_model,
bert_config,
verbose=False):
def load_numpy_weight(file_name):
if six.PY2:
res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
else:
res = np.load(
os.path.join(dir_path, file_name),
allow_pickle=True,
encoding='latin1')
assert res is not None
return res
# load word embedding
_param = load_numpy_weight("word_embedding")
backbone_model._src_emb.set_dict({"weight": _param})
if verbose:
print("INIT word embedding")
_param = load_numpy_weight("pos_embedding")
backbone_model._pos_emb.set_dict({"weight": _param})
if verbose:
print("INIT pos embedding")
_param = load_numpy_weight("sent_embedding")
backbone_model._sent_emb.set_dict({"weight": _param})
if verbose:
print("INIT sent embedding")
_param0 = load_numpy_weight("pooled_fc.w_0")
_param1 = load_numpy_weight("pooled_fc.b_0")
backbone_model.pooled_fc.set_dict({"weight": _param0, "bias": _param1})
if verbose:
print("INIT pooled_fc")
_param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
_param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
backbone_model.pre_process_layer._sub_layers["layer_norm_0"].set_dict({
"weight": _param0,
"bias": _param1
})
if verbose:
print("INIT pre_encoder layer norm")
for _i in range(bert_config["num_hidden_layers"]):
_param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.q_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_query_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.k_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_key_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.v_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_value_fc %d" % _i)
# init output fc
_param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.proj_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_output_fc %d" % _i)
# init layer_norm 1
_param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser1.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in attention at %d layer" % _i)
# init layer_norm 2
_param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser2.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in FFN at %d layer" % _i)
# init FFN 1
_param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc1.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-1 at %d layer" % _i)
# init FFN 2
_param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc2.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-2 at %d layer" % _i)
return True
此差异已折叠。
此差异已折叠。
[metadata]
name = hapi
author = zhouxiangyang
author_email = zhouxiangyang@baidu.com
version = 0.0.1
description = HAPI
long_description = file: README.md
long_description_content_type = text/markdown
home_page = https://github.com/PaddlePaddle/hapi
license = Apache 2.0
classifier =
Private :: Do Not Upload
Programming Language :: Python
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
keywords =
paddlepaddle
paddle
high-level-api
[options]
packages = find:
#install_requires =
# paddlepaddle-gpu >= 1.5.2
include_package_data = True
zip_safe = False
[sdist]
dist_dir = output/dist
[bdist_wheel]
dist_dir = output/dist
[easy_install]
index_url = http://pip.baidu.com/root/baidu/+simple/
# -*- coding: UTF-8 -*-
################################################################################
#
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
"""
Setup script.
Authors: zhouxiangyang(zhouxiangyang@baidu.com)
Date: 2020/2/4 00:00:01
"""
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="hapi",
version="0.0.1",
author="PaddlePaddle",
author_email="zhouxiangyang@baidu.com",
description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)",
# long_description=long_description,
# long_description_content_type="text/markdown",
url="https://github.com/PaddlePaddle/hapi",
# packages=setuptools.find_packages(),
packages=[
'hapi', 'hapi.text', 'hapi.text.tokenizer', 'hapi.text.bert',
'hapi.text.bert.utils'
],
package_dir={
'hapi': './hapi',
'hapi.text': './hapi/text',
'hapi.text.tokenizer': './hapi/text/tokenizer',
'hapi.text.bert': './hapi/text/bert',
'hapi.text.bert.utils': './hapi/text/bert/utils',
},
platforms="any",
license='Apache 2.0',
classifiers=[
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
], )
import paddle
from hapi.model import set_device
from hapi.text.bert.dataloader import SingleSentenceDataLoader
import hapi.text.tokenizer.tokenization as tokenization
device = set_device("cpu")
paddle.fluid.enable_dygraph(device)
tokenizer = tokenization.FullTokenizer(
vocab_file="./tmp/hapi/data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt",
do_lower_case=True)
bert_dataloader = SingleSentenceDataLoader(
"./tmp/hapi/aaa.txt",
tokenizer, ["1", "2"],
max_seq_length=32,
batch_size=1)
for data in bert_dataloader.dataloader():
print(data)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册