Unverified commit e4d61e5e authored by Bai Yifan, committed by GitHub

Add adabert cell based model (#327)

Parent 0024cfc5
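# Single-GPU launch of the cell-based AdaBERT architecture search on MNLI (the scripts below target Python 2).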
CUDA_VISIBLE_DEVICES=0 python2 -u train_cell_base.py
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddleslim.nas.darts import DARTSearch
def main():
place = fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
max_seq_len = 512
do_lower_case = True
batch_size = 32
epoch = 30
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=1,
shuffle=True)
val_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=1,
shuffle=True)
with fluid.dygraph.guard(place):
model = AdaBERTClassifier(
3,
teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000"
)
searcher = DARTSearch(
model,
train_reader,
val_reader,
batchsize=batch_size,
num_epochs=epoch,
log_freq=10)
searcher.train()
if __name__ == '__main__':
main()
import numpy as np
from itertools import izip
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddleslim.nas.darts.architect_for_bert import Architect
import logging
from paddleslim.common import AvgrageMeter, get_logger
logger = get_logger(__name__, level=logging.INFO)
def count_parameters_in_MB(all_params):
parameters_number = 0
for param in all_params:
if param.trainable:
parameters_number += np.prod(param.shape)
return parameters_number / 1e6
def model_loss(model, data_ids):
# src_ids = data_ids[0]
# position_ids = data_ids[1]
# sentence_ids = data_ids[2]
# input_mask = data_ids[3]
labels = data_ids[4]
labels.stop_gradient = True
enc_output = model(data_ids)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=enc_output, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
return loss, accuracy
def train_one_epoch(model, architect, train_loader, valid_loader, optimizer,
epoch, use_data_parallel, log_freq):
ce_losses = AvgrageMeter()
accs = AvgrageMeter()
model.train()
step_id = 0
for train_data, valid_data in izip(train_loader(), valid_loader()):
architect.step(train_data, valid_data)
loss, acc = model_loss(model, train_data)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
else:
loss.backward()
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
optimizer.minimize(loss, grad_clip=grad_clip)
model.clear_gradients()
batch_size = train_data[0].shape[0]
ce_losses.update(loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, Lr {:.6f} loss {:.6f}; acc: {:.6f};".
format(epoch, step_id,
optimizer.current_step_lr(), ce_losses.avg[0], accs.avg[
0]))
step_id += 1
def valid_one_epoch(model, valid_loader, epoch, log_freq):
ce_losses = AvgrageMeter()
accs = AvgrageMeter()
model.eval()
step_id = 0
for valid_data in valid_loader():
loss, acc = model_loss(model, valid_data)
batch_size = valid_data[0].shape[0]
ce_losses.update(loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info("Valid Epoch {}, Step {}, loss {:.6f}; acc: {:.6f};".
format(epoch, step_id, ce_losses.avg[0], accs.avg[0]))
step_id += 1
def main():
use_data_parallel = False
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
teacher_model_dir = "./teacher_model/steps_23000"
num_samples = 392702
max_seq_len = 128
do_lower_case = True
batch_size = 128
hidden_size = 768
emb_size = 768
max_layer = 8
epoch = 80
log_freq = 10
use_fixed_gumbel = True
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase='search_train',
epoch=1,
dev_count=1,
shuffle=True)
val_reader = processor.data_generator(
batch_size=batch_size,
phase='search_valid',
epoch=1,
dev_count=1,
shuffle=True)
if use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
val_reader = fluid.contrib.reader.distributed_batch_reader(
val_reader)
with fluid.dygraph.guard(place):
model = AdaBERTClassifier(
3,
n_layer=max_layer,
hidden_size=hidden_size,
emb_size=emb_size,
teacher_model=teacher_model_dir,
data_dir=data_dir,
use_fixed_gumbel=use_fixed_gumbel)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
device_num = fluid.dygraph.parallel.Env().nranks
step_per_epoch = int(num_samples / (batch_size * device_num))
learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
model_parameters = [
p for p in model.parameters()
if p.name not in [a.name for a in model.arch_parameters()]
]
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
parameter_list=model_parameters)
train_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
use_double_buffer=True,
iterable=True,
return_list=True)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(val_reader, places=place)
architect = Architect(model, learning_rate, 3e-4, place, False)
for epoch_id in range(epoch):
train_one_epoch(model, architect, train_loader, valid_loader,
optimizer, epoch_id, use_data_parallel, log_freq)
valid_one_epoch(model, valid_loader, epoch_id, log_freq)
print(model.student._encoder.alphas.numpy())
print("=" * 100)
if __name__ == '__main__':
main()
import paddle.fluid as fluid
from paddleslim.teachers.bert import BERTClassifier
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
bert = BERTClassifier(3)
bert.fit("./data/glue_data/MNLI/",
5,
batch_size=32,
use_data_parallel=True,
learning_rate=0.00005,
save_steps=1000)
@@ -15,6 +15,9 @@
from __future__ import absolute_import
from ..darts import train_search
from .train_search import *
from ..darts import search_space
from .search_space import *
__all__ = []
__all__ += train_search.__all__
__all__ += search_space.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
class Architect(object):
def __init__(self, model, eta, arch_learning_rate, place, unrolled):
self.network_momentum = 0.9
self.network_weight_decay = 1e-3
self.eta = eta
self.model = model
self.optimizer = fluid.optimizer.Adam(
arch_learning_rate,
0.5,
0.999,
regularization=fluid.regularizer.L2Decay(1e-3),
parameter_list=self.model.arch_parameters())
self.place = place
self.unrolled = unrolled
if self.unrolled:
self.unrolled_model = self.model.new()
self.unrolled_model_params = [
p for p in self.unrolled_model.parameters()
if p.name not in [
a.name for a in self.unrolled_model.arch_parameters()
] and p.trainable
]
self.unrolled_optimizer = fluid.optimizer.MomentumOptimizer(
self.eta,
self.network_momentum,
regularization=fluid.regularizer.L2DecayRegularizer(
self.network_weight_decay),
parameter_list=self.unrolled_model_params)
def step(self, train_data, valid_data):
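# unrolled=False (first-order DARTS): update the architecture parameters directly on the
# validation-batch loss. unrolled=True adds the second-order correction computed below.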
if self.unrolled:
params_grads = self._backward_step_unrolled(train_data, valid_data)
self.optimizer.apply_gradients(params_grads)
else:
loss = self._backward_step(valid_data)
self.optimizer.minimize(loss)
self.optimizer.clear_gradients()
def _backward_step(self, valid_data):
loss = self.model.loss(valid_data)
loss[0].backward()
return loss[0]
def _backward_step_unrolled(self, train_data, valid_data):
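# Second-order DARTS: take one training step on a copy of the model (w' = w - eta * grad_w L_train),
# back-propagate the validation loss through the copy to get grad_alpha L_val(w', alpha), then
# subtract eta times the implicit Hessian-vector term estimated by _hessian_vector_product.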
self._compute_unrolled_model(train_data)
unrolled_loss = self.unrolled_model.loss(valid_data)
unrolled_loss[0].backward()
vector = [
to_variable(param._grad_ivar().numpy())
for param in self.unrolled_model_params
]
arch_params_grads = [
(alpha, to_variable(ualpha._grad_ivar().numpy()))
for alpha, ualpha in zip(self.model.arch_parameters(),
self.unrolled_model.arch_parameters())
]
self.unrolled_model.clear_gradients()
implicit_grads = self._hessian_vector_product(vector, train_data)
for (p, g), ig in zip(arch_params_grads, implicit_grads):
new_g = g - (ig * self.unrolled_optimizer.current_step_lr())
g.value().get_tensor().set(new_g.numpy(), self.place)
return arch_params_grads
def _compute_unrolled_model(self, data):
for x, y in zip(self.unrolled_model.parameters(),
self.model.parameters()):
x.value().get_tensor().set(y.numpy(), self.place)
loss = self.unrolled_model.loss(data)
loss[0].backward()
self.unrolled_optimizer.minimize(loss[0])
self.unrolled_model.clear_gradients()
def _hessian_vector_product(self, vector, data, r=1e-2):
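# Finite-difference approximation of the Hessian-vector product: perturb the model weights by
# +/- R * v with R = r / ||v||_2, and return (grad_alpha L_train(w+) - grad_alpha L_train(w-)) / (2R).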
R = r * fluid.layers.rsqrt(
fluid.layers.sum([
fluid.layers.reduce_sum(fluid.layers.square(v)) for v in vector
]))
model_params = [
p for p in self.model.parameters()
if p.name not in [a.name for a in self.model.arch_parameters()] and
p.trainable
]
for param, grad in zip(model_params, vector):
param_p = param + grad * R
param.value().get_tensor().set(param_p.numpy(), self.place)
loss = self.model.loss(data)
loss[0].backward()
grads_p = [
to_variable(param._grad_ivar().numpy())
for param in self.model.arch_parameters()
]
for param, grad in zip(model_params, vector):
param_n = param - grad * R * 2
param.value().get_tensor().set(param_n.numpy(), self.place)
self.model.clear_gradients()
loss = self.model.loss(data)
loss[0].backward()
grads_n = [
to_variable(param._grad_ivar().numpy())
for param in self.model.arch_parameters()
]
for param, grad in zip(model_params, vector):
param_o = param + grad * R
param.value().get_tensor().set(param_o.numpy(), self.place)
self.model.clear_gradients()
arch_grad = [(p - n) / (2 * R) for p, n in zip(grads_p, grads_n)]
return arch_grad
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from ..search_space import conv_bert
from .conv_bert import *
__all__ = []
__all__ += conv_bert.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from ..conv_bert import cls
from .cls import *
__all__ = []
__all__ += cls.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import six
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer, Linear
from .reader.cls import *
from .model.bert import BertModelLayer
from .optimization import Optimizer
from .utils.init import init_from_static_model
from paddleslim.teachers.bert import BERTClassifier
__all__ = ["AdaBERTClassifier"]
class AdaBERTClassifier(Layer):
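"""AdaBERT classifier with a searchable, cell-based student encoder.
`teacher_model` points to a fine-tuned BERT checkpoint intended for distillation;
loading and testing of the teacher is currently commented out below.
"""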
def __init__(self,
num_labels,
n_layer=8,
emb_size=768,
hidden_size=768,
gamma=0.8,
beta=4,
conv_type="conv_bn",
search_layer=False,
teacher_model=None,
data_dir=None,
use_fixed_gumbel=False):
super(AdaBERTClassifier, self).__init__()
self._n_layer = n_layer
self._num_labels = num_labels
self._emb_size = emb_size
self._hidden_size = hidden_size
self._gamma = gamma
self._beta = beta
self._conv_type = conv_type
self._search_layer = search_layer
self._teacher_model = teacher_model
self._data_dir = data_dir
self.use_fixed_gumbel = use_fixed_gumbel
#print(
# "----------------------load teacher model and test----------------------------------------"
#)
#self.teacher = BERTClassifier(num_labels, model_path=self._teacher_model)
#self.teacher.test(self._data_dir)
#print(
# "----------------------finish load teacher model and test----------------------------------------"
#)
self.student = BertModelLayer(
n_layer=self._n_layer,
emb_size=self._emb_size,
hidden_size=self._hidden_size,
conv_type=self._conv_type,
search_layer=self._search_layer,
use_fixed_gumbel=self.use_fixed_gumbel)
self.cls_fc = list()
for i in range(self._n_layer):
fc = Linear(
input_dim=self._hidden_size,
output_dim=self._num_labels,
param_attr=fluid.ParamAttr(
name="s_cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="s_cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
def forward(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
return self.student(src_ids, position_ids, sentence_ids)
def arch_parameters(self):
return self.student.arch_parameters()
def genotype(self):
return self.arch_parameters()
def loss(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
labels = data_ids[4]
enc_output = self.student(
src_ids, position_ids, sentence_ids, flops=[], model_size=[])
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=enc_output, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
return loss, accuracy
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import MSRA
from .transformer_encoder import EncoderLayer
class BertModelLayer(Layer):
def __init__(self,
emb_size=128,
hidden_size=768,
n_layer=12,
voc_size=30522,
max_position_seq_len=512,
sent_types=2,
return_pooled_out=True,
initializer_range=1.0,
conv_type="conv_bn",
search_layer=False,
use_fp16=False,
use_fixed_gumbel=False):
super(BertModelLayer, self).__init__()
self._emb_size = emb_size
self._hidden_size = hidden_size
self._n_layer = n_layer
self._voc_size = voc_size
self._max_position_seq_len = max_position_seq_len
self._sent_types = sent_types
self.return_pooled_out = return_pooled_out
self.use_fixed_gumbel = use_fixed_gumbel
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._conv_type = conv_type
self._search_layer = search_layer
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=initializer_range)
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._pos_emb = Embedding(
size=[self._max_position_seq_len, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._sent_emb = Embedding(
size=[self._sent_types, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
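# Factorized embedding: token/position/sentence embeddings live in emb_size dimensions and are
# projected up to hidden_size before entering the searched encoder.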
self._emb_fac = Linear(
input_dim=self._emb_size,
output_dim=self._hidden_size,
param_attr=fluid.ParamAttr(name="s_emb_factorization"))
self._encoder = EncoderLayer(
n_layer=self._n_layer,
hidden_size=self._hidden_size,
search_layer=self._search_layer,
use_fixed_gumbel=self.use_fixed_gumbel)
def max_flops(self):
return self._encoder.max_flops
def max_model_size(self):
return self._encoder.max_model_size
def arch_parameters(self):
return [self._encoder.alphas] #, self._encoder.k]
def forward(self,
src_ids,
position_ids,
sentence_ids,
flops=[],
model_size=[]):
"""
forward
"""
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
emb_out = src_emb + pos_emb
emb_out = emb_out + sent_emb
emb_out = self._emb_fac(emb_out)
# (bs, seq_len, 768)
enc_output = self._encoder(emb_out, flops=flops, model_size=model_size)
return enc_output
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from .bert import BertModelLayer
from paddleslim.teachers.bert import BERTClassifier
class ClsModelLayer(Layer):
"""
Classification model: a BERT encoder with one classification head per layer.
"""
def __init__(self,
config,
num_labels,
n_layers=12,
is_training=True,
return_pooled_out=True,
loss_scaling=1.0,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = loss_scaling
self.n_layers = n_layers
self.bert_layer = BertModelLayer(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = list()
for i in range(self.n_layers):
fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
def forward(self, data_ids):
"""
forward
"""
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
labels = data_ids[4]
enc_outputs, next_sent_feats = self.bert_layer(
src_ids, position_ids, sentence_ids, input_mask)
logits = []
losses = []
accuracys = []
for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc):
cls_feat = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logit = fc(cls_feat)
logits.append(logit)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logit, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
losses.append(loss)
if self.use_fp16 and self.loss_scaling > 1.0:
loss *= self.loss_scaling
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
accuracys.append(accuracy)
total_loss = fluid.layers.sum(losses)
return total_loss, logits, losses, accuracys, num_seqs
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, BatchNorm, Pool2D, to_variable
from paddle.fluid.initializer import NormalInitializer
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import MSRA, ConstantInitializer
ConvBN_PRIMITIVES = [
'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none',
'skip_connect'
]
OPS = {
'std_conv_bn_3': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[3, 1], dilation=1, name=name),
'std_conv_bn_5': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[5, 1], dilation=1, name=name),
'std_conv_bn_7': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[7, 1], dilation=1, name=name),
'dil_conv_bn_3': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[3, 1], dilation=2, name=name),
'dil_conv_bn_5': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[5, 1], dilation=2, name=name),
'dil_conv_bn_7': lambda n_channel, name: ReluConvBN(n_channel, n_channel, filter_size=[7, 1], dilation=2, name=name),
'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='avg'),
'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='max'),
'none': lambda n_channel, name: Zero(),
'skip_connect': lambda n_channel, name: Identity(),
}
class MixedOp(fluid.dygraph.Layer):
def __init__(self, n_channel, name=None):
super(MixedOp, self).__init__()
PRIMITIVES = ConvBN_PRIMITIVES
# ops = [
# OPS[primitive](n_channel, name
# if name is None else name + "/" + primitive)
# for primitive in PRIMITIVES
# ]
ops = []
for primitive in PRIMITIVES:
op = OPS[primitive](n_channel, name
if name is None else name + "/" + primitive)
if 'pool' in primitive:
gama = ParamAttr(
initializer=fluid.initializer.Constant(value=1),
trainable=False)
beta = ParamAttr(
initializer=fluid.initializer.Constant(value=0),
trainable=False)
BN = BatchNorm(n_channel, param_attr=gama, bias_attr=beta)
op = fluid.dygraph.Sequential(op, BN)
ops.append(op)
self._ops = fluid.dygraph.LayerList(ops)
def forward(self, x, weights):
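# weights is expected to be a (near) one-hot Gumbel sample, so only the selected primitive is
# evaluated instead of summing over all candidate ops (a weighted sum over all ops is kept
# commented out below).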
#out = weights[0] * self._ops[0](x)
# out = fluid.layers.sums(
# [weights[i] * op(x) for i, op in enumerate(self._ops)])
# return out
for i in range(len(self._ops)):
if weights[i].numpy() != 0:
return self._ops[i](x) * weights[i]
def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10):
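# Gumbel-softmax sampling: add Gumbel(0, 1) noise to the logits, apply a temperature-scaled softmax,
# and, if hard, return a one-hot sample while letting gradients flow through the soft probabilities
# (straight-through estimator via the stop_gradient trick below).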
#U = np.random.uniform(0, 1, logits.shape)
#U = - to_variable(
# np.log(-np.log(U + eps) + eps).astype("float32"))
U = np.random.gumbel(0, 1, logits.shape).astype("float32")
logits = logits + to_variable(U)
logits = logits / temperature
logits = fluid.layers.softmax(logits)
if hard:
maxes = fluid.layers.reduce_max(logits, dim=1, keep_dim=True)
hard = fluid.layers.cast((logits == maxes), logits.dtype)
# out = hard - logits.detach() + logits
tmp = hard - logits
tmp.stop_gradient = True
out = tmp + logits
else:
out = logits
return out
class Zero(fluid.dygraph.Layer):
def __init__(self):
super(Zero, self).__init__()
def forward(self, x):
x = fluid.layers.zeros_like(x)
return x
class Identity(fluid.dygraph.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
class ReluConvBN(fluid.dygraph.Layer):
def __init__(self,
in_c=768,
out_c=768,
filter_size=[3, 1],
dilation=1,
stride=1,
affine=False,
use_cudnn=True,
name=None):
super(ReluConvBN, self).__init__()
#conv_std = (2.0 /
# (filter_size[0] * filter_size[1] * out_c * in_c))**0.5
conv_param = fluid.ParamAttr(
name=name if name is None else (name + "_conv.weights"),
initializer=fluid.initializer.MSRA())
self.conv = Conv2D(
in_c,
out_c,
filter_size,
dilation=[dilation, 1],
stride=stride,
padding=[(filter_size[0] - 1) * dilation // 2, 0],
param_attr=conv_param,
act=None,
bias_attr=False,
use_cudnn=use_cudnn)
gama = ParamAttr(
initializer=fluid.initializer.Constant(value=1), trainable=affine)
beta = ParamAttr(
initializer=fluid.initializer.Constant(value=0), trainable=affine)
self.bn = BatchNorm(out_c, param_attr=gama, bias_attr=beta)
def forward(self, inputs):
inputs = fluid.layers.relu(inputs)
conv = self.conv(inputs)
bn = self.bn(conv)
return bn
class Cell(fluid.dygraph.Layer):
def __init__(self, steps, n_channel, name=None):
super(Cell, self).__init__()
self._steps = steps
self.preprocess0 = ReluConvBN(in_c=n_channel, out_c=n_channel)
self.preprocess1 = ReluConvBN(in_c=n_channel, out_c=n_channel)
ops = []
for i in range(self._steps):
for j in range(2 + i):
op = MixedOp(
n_channel,
name=name
if name is None else "%s/step%d_edge%d" % (name, i, j))
ops.append(op)
self._ops = fluid.dygraph.LayerList(ops)
def forward(self, s0, s1, weights):
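# DARTS-style cell: each of the `steps` intermediate nodes sums the mixed-op outputs of all
# previous states; the cell output is the sum of the intermediate nodes (a channel-concat
# variant is commented out below).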
s0 = self.preprocess0(s0)
s1 = self.preprocess1(s1)
states = [s0, s1]
offset = 0
for i in range(self._steps):
s = fluid.layers.sums([
self._ops[offset + j](h, weights[offset + j])
for j, h in enumerate(states)
])
offset += len(states)
states.append(s)
out = fluid.layers.sums(states[-self._steps:])
#out = fluid.layers.concat(input=states[-self._steps:], axis=1)
return out
class EncoderLayer(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
hidden_size=768,
name="encoder",
search_layer=True,
use_fixed_gumbel=False):
super(EncoderLayer, self).__init__()
self._n_layer = n_layer
self._hidden_size = hidden_size
self._n_channel = 256
self._steps = 3
self._n_ops = len(ConvBN_PRIMITIVES)
self.use_fixed_gumbel = use_fixed_gumbel
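# Stem: the (seq_len, hidden_size) embedding map is treated as a single-channel image; a
# [3, hidden_size] convolution collapses the hidden dimension into n_channel feature maps,
# giving tensors of shape (bs, n_channel, seq_len, 1) for the searched cells.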
self.stem = fluid.dygraph.Sequential(
Conv2D(
num_channels=1,
num_filters=self._n_channel,
filter_size=[3, self._hidden_size],
padding=[1, 0],
param_attr=fluid.ParamAttr(initializer=MSRA()),
bias_attr=False),
BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0))))
cells = []
for i in range(n_layer):
cell = Cell(
steps=self._steps,
n_channel=self._n_channel,
name="%s/layer_%d" % (name, i))
cells.append(cell)
self._cells = fluid.dygraph.LayerList(cells)
k = sum(1 for i in range(self._steps) for n in range(2 + i))
num_ops = self._n_ops
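# One row of op logits per cell edge: sum(2 + i for i in range(steps)) = 9 edges when steps == 3,
# shared by all cells/layers.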
self.alphas = fluid.layers.create_parameter(
shape=[k, num_ops],
dtype="float32",
default_initializer=NormalInitializer(
loc=0.0, scale=1e-3))
# self.k = fluid.layers.create_parameter(
# shape=[1, self._n_layer],
# dtype="float32",
# default_initializer=NormalInitializer(
# loc=0.0, scale=1e-3))
self.BN = BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1),
trainable=False),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0),
trainable=False))
self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
self.out = Linear(
self._n_channel,
3,
param_attr=ParamAttr(initializer=MSRA()),
bias_attr=ParamAttr(initializer=MSRA()))
self.use_fixed_gumbel = use_fixed_gumbel
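# With use_fixed_gumbel, a discrete architecture is sampled once here and reused in every
# forward pass; otherwise a fresh Gumbel sample is drawn per step (see forward below).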
self.gumbel_alphas = gumbel_softmax(self.alphas)
def forward(self, enc_input, flops=[], model_size=[]):
tmp = fluid.layers.reshape(
enc_input, [-1, 1, enc_input.shape[1], enc_input.shape[2]])
# (bs, 1, seq_len, hidden_size)
tmp = self.stem(tmp)
# (bs, n_channel, seq_len, 1)
alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax(
self.alphas)
s0 = s1 = tmp
for i in range(self._n_layer):
s0, s1 = s1, self._cells[i](s0, s1, alphas)
# (bs, n_channel, seq_len, 1)
s1 = self.BN(s1)
outputs = self.pool2d_avg(s1)
outputs = fluid.layers.reshape(outputs, shape=[-1, 0])
outputs = self.out(outputs)
return outputs
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
super(ConstantLR, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
def step(self):
return self.learning_rate
class LinearDecay(LearningRateDecay):
def __init__(self,
learning_rate,
warmup_steps,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(LinearDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
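# Linear warmup from 0 to learning_rate over warmup_steps, then polynomial decay
# (power=1.0 gives linear decay) towards end_learning_rate over decay_steps.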
if self.step_num < self.warmup_steps:
decayed_lr = self.learning_rate * (self.step_num /
self.warmup_steps)
decayed_lr = self.create_lr_var(decayed_lr)
else:
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = fluid.layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(
tmp_step_num
if tmp_step_num < self.decay_steps else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
class Optimizer(object):
def __init__(self,
warmup_steps,
num_train_steps,
learning_rate,
model_cls,
weight_decay,
scheduler='linear_warmup_decay',
loss_scaling=1.0,
parameter_list=None):
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
self.learning_rate = learning_rate
self.model_cls = model_cls
self.weight_decay = weight_decay
self.scheduler = scheduler
self.loss_scaling = loss_scaling
self.parameter_list = parameter_list
self.scheduled_lr = 0.0
self.optimizer = self.lr_schedule()
def lr_schedule(self):
if self.warmup_steps > 0:
if self.scheduler == 'noam_decay':
self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
self.warmup_steps * (self.learning_rate**2)),
self.warmup_steps)
elif self.scheduler == 'linear_warmup_decay':
self.scheduled_lr = LinearDecay(self.learning_rate,
self.warmup_steps,
self.num_train_steps, 0.0)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
else:
self.scheduled_lr = ConstantLR(self.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
return optimizer
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
def optimization(self, loss, use_data_parallel=False, model=None):
param_list = dict()
clip_norm_thres = 1.0
#grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
if self.weight_decay > 0:
for param in self.model_cls.parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_data_parallel:
assert model is not None
model.apply_collective_grads()
#_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
_, param_grads = self.optimizer.minimize(loss)
if self.weight_decay > 0:
for param, grad in param_grads:
if self.exclude_from_weight_decay(param.name):
continue
if isinstance(self.scheduled_lr.step(), float):
updated_param = param.numpy() - param_list[
param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step()
else:
updated_param = param.numpy(
) - param_list[param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step().numpy()
updated_param_var = fluid.dygraph.to_variable(updated_param)
param = updated_param_var
#param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add masking to batch_tokens and return (out, mask_label, mask_pos).
Note: mask_pos indexes into batch_tokens after padding.
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least mask one word in a sentence
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate padded token-id data
2. generate position data
3. generate the self-attention input mask, shape [batch_size, max_len, 1]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import types
import csv
import numpy as np
import tokenization
from batching import prepare_batch_data
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def __init__(self,
data_dir,
vocab_path,
max_seq_len,
do_lower_case,
in_tokens,
random_seed=None):
self.data_dir = data_dir
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.in_tokens = in_tokens
np.random.seed(random_seed)
self.current_train_example = -1
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.current_train_epoch = -1
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def convert_example(self, index, example, labels, max_seq_len, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
feature = convert_single_example(index, example, labels, max_seq_len,
tokenizer)
return feature
def generate_instance(self, feature):
"""
generate instance with given feature
Args:
feature: InputFeatures(object). A single set of features of data.
"""
input_pos = list(range(len(feature.input_ids)))
return [
feature.input_ids, feature.segment_ids, input_pos, feature.label_id
]
def generate_batch_data(self,
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False):
return prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.vocab["[PAD]"],
cls_id=self.vocab["[CLS]"],
sep_id=self.vocab["[SEP]"],
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
return self.num_examples[phase]
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def data_generator(self,
batch_size,
phase='train',
epoch=1,
dev_count=1,
shuffle=True,
shuffle_seed=None):
"""
Generate data for train, dev or test.
Args:
batch_size: int. The batch size of generated data.
phase: string. The phase for which to generate data.
epoch: int. Total epochs for which to generate data.
dev_count: int. Number of devices; batches are yielded in groups of dev_count.
shuffle: bool. Whether to shuffle examples.
"""
if phase == 'train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['train'] = len(examples)
elif phase == 'dev':
examples = self.get_dev_examples(self.data_dir)
self.num_examples['dev'] = len(examples)
elif phase == 'test':
examples = self.get_test_examples(self.data_dir)
self.num_examples['test'] = len(examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def instance_reader():
for epoch_index in range(epoch):
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
np.random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
for (index, example) in enumerate(examples):
if phase == 'train':
self.current_train_example = index + 1
feature = self.convert_example(
index, example,
self.get_labels(), self.max_seq_len, self.tokenizer)
instance = self.generate_instance(feature)
yield instance
def batch_reader(reader, batch_size, in_tokens):
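# Pack instances into batches; with in_tokens=True, batch_size is a token budget
# (batch length times the longest sequence seen so far), otherwise it is an example count.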
batch, total_token_num, max_len = [], 0, 0
for instance in reader():
token_ids, sent_ids, pos_ids, label = instance[:4]
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(instance)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [instance], len(
token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
instance_reader, batch_size, self.in_tokens):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class XnliProcessor(DataProcessor):
"""Processor for the XNLI data set."""
def get_train_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(
os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
self.language))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "train-%d" % (i)
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_dev_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "dev-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_test_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "test-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type,
tokenization.convert_to_unicode(line[0]))
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
# Only the test set has a header
if set_type == "test" and i == 0:
continue
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
if __name__ == '__main__':
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import division
import os
import numpy as np
import types
import gzip
import logging
import re
import six
import collections
import tokenization
import paddle
import paddle.fluid as fluid
from batching import prepare_batch_data
class DataReader(object):
def __init__(self,
data_dir,
vocab_path,
batch_size=4096,
in_tokens=True,
max_seq_len=512,
shuffle_files=True,
epoch=100,
voc_size=0,
is_test=False,
generate_neg_sample=False):
self.vocab = self.load_vocab(vocab_path)
self.data_dir = data_dir
self.batch_size = batch_size
self.in_tokens = in_tokens
self.shuffle_files = shuffle_files
self.epoch = epoch
self.current_epoch = 0
self.current_file_index = 0
self.total_file = 0
self.current_file = None
self.voc_size = voc_size
self.max_seq_len = max_seq_len
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.is_test = is_test
self.generate_neg_sample = generate_neg_sample
if self.in_tokens:
assert self.batch_size >= self.max_seq_len, "The number of " \
"tokens in batch should not be smaller than max seq length."
if self.is_test:
self.epoch = 1
self.shuffle_files = False
def get_progress(self):
"""return current progress of traning data
"""
return self.current_epoch, self.current_file_index, self.total_file, self.current_file
def parse_line(self, line, max_seq_len=512):
""" parse one line to token_ids, sentence_ids, pos_ids, label
"""
line = line.strip().decode().split(";")
assert len(line) == 4, "One sample must have 4 fields!"
(token_ids, sent_ids, pos_ids, label) = line
token_ids = [int(token) for token in token_ids.split(" ")]
sent_ids = [int(token) for token in sent_ids.split(" ")]
pos_ids = [int(token) for token in pos_ids.split(" ")]
assert len(token_ids) == len(sent_ids) == len(
pos_ids
), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
label = int(label)
if len(token_ids) > max_seq_len:
return None
return [token_ids, sent_ids, pos_ids, label]
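# Illustrative note (added): each line of the pre-training data is expected
# to hold the four ';'-separated fields parsed above. A made-up sample like
#     1 7 8 2 9 10 2;0 0 0 0 1 1 1;0 1 2 3 4 5 6;1
# parses to token_ids=[1, 7, 8, 2, 9, 10, 2], sent_ids=[0, 0, 0, 0, 1, 1, 1],
# pos_ids=[0, 1, 2, 3, 4, 5, 6] and label=1 (here 2 stands for the [SEP] id
# that random_pair_neg_samples below also assumes).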
def read_file(self, file):
assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
file_path = self.data_dir + "/" + file
with gzip.open(file_path, "rb") as f:
for line in f:
parsed_line = self.parse_line(
line, max_seq_len=self.max_seq_len)
if parsed_line is None:
continue
yield parsed_line
def convert_to_unicode(self, text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(self, vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = open(vocab_file)
for num, line in enumerate(fin):
items = self.convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
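# Illustrative note (added): the vocabulary file is read as one token per
# line, optionally followed by a tab and an explicit index; without an
# explicit index the line number is used. A hypothetical file containing
#     [PAD]
#     [CLS]
#     [SEP]
# therefore maps to {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2}.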
def random_pair_neg_samples(self, pos_samples):
""" randomly generate negtive samples using pos_samples
Args:
pos_samples: list of positive samples
Returns:
neg_samples: list of negative samples
"""
np.random.shuffle(pos_samples)
num_sample = len(pos_samples)
neg_samples = []
miss_num = 0
for i in range(num_sample):
pair_index = (i + 1) % num_sample
origin_src_ids = pos_samples[i][0]
origin_sep_index = origin_src_ids.index(2)
pair_src_ids = pos_samples[pair_index][0]
pair_sep_index = pair_src_ids.index(2)
src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
pair_sep_index + 1:]
if len(src_ids) >= self.max_seq_len:
miss_num += 1
continue
sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
1
] * len(pair_src_ids[pair_sep_index + 1:])
pos_ids = list(range(len(src_ids)))
neg_sample = [src_ids, sent_ids, pos_ids, 0]
assert len(src_ids) == len(sent_ids) == len(
pos_ids
), "[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True"
neg_samples.append(neg_sample)
return neg_samples, miss_num
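# Illustrative note (added): after shuffling, each negative is built by
# splicing sentence A of sample i (tokens up to and including the first
# [SEP], assumed to have token id 2) onto sentence B of sample
# (i + 1) % num_sample. With made-up ids,
#     pos_samples[0][0] = [1, 7, 8, 2, 9, 10, 2]
#     pos_samples[1][0] = [1, 5, 6, 2, 3, 4, 2]
# yields the negative src_ids [1, 7, 8, 2, 3, 4, 2] with label 0 and
# sent_ids [0, 0, 0, 0, 1, 1, 1].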
def mixin_negtive_samples(self, pos_sample_generator, buffer=1000):
""" 1. generate negtive samples by randomly group sentence_1 and sentence_2 of positive samples
2. combine negative samples and positive samples
Args:
pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
Returns:
sample: one sample from the shuffled positive and negative samples
"""
pos_samples = []
num_total_miss = 0
pos_sample_num = 0
try:
while True:
while len(pos_samples) < buffer:
pos_sample = next(pos_sample_generator)
label = pos_sample[3]
assert label == 1, "positive sample's label must be 1"
pos_samples.append(pos_sample)
pos_sample_num += 1
neg_samples, miss_num = self.random_pair_neg_samples(
pos_samples)
num_total_miss += miss_num
samples = pos_samples + neg_samples
pos_samples = []
np.random.shuffle(samples)
for sample in samples:
yield sample
except StopIteration:
print("stopiteration: reach end of file")
if len(pos_samples) == 1:
yield pos_samples[0]
elif len(pos_samples) == 0:
yield None
else:
neg_samples, miss_num = self.random_pair_neg_samples(
pos_samples)
num_total_miss += miss_num
samples = pos_samples + neg_samples
pos_samples = []
np.random.shuffle(samples)
for sample in samples:
yield sample
print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
(num_total_miss, pos_sample_num * 2,
num_total_miss / (pos_sample_num * 2)))
def data_generator(self):
"""
data_generator
"""
files = os.listdir(self.data_dir)
self.total_file = len(files)
assert self.total_file > 0, "[Error] data_dir is empty"
def wrapper():
def reader():
for epoch in range(self.epoch):
self.current_epoch = epoch + 1
if self.shuffle_files:
np.random.shuffle(files)
for index, file in enumerate(files):
self.current_file_index = index + 1
self.current_file = file
sample_generator = self.read_file(file)
if not self.is_test and self.generate_neg_sample:
sample_generator = self.mixin_negtive_samples(
sample_generator)
for sample in sample_generator:
if sample is None:
continue
yield sample
def batch_reader(reader, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for parsed_line in reader():
token_ids, sent_ids, pos_ids, label = parsed_line
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(parsed_line)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [parsed_line], len(
token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
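# Illustrative note (added): in `in_tokens` mode `batch_size` is a token
# budget rather than a sample count. For a hypothetical budget of 16 and
# parsed lines of length 5, 6 and 7: the first line fits ((0 + 1) * 5 <= 16),
# the second fits ((1 + 1) * 6 = 12 <= 16), but adding the third would need
# (2 + 1) * 7 = 21 > 16, so the batch of two is yielded and the third line
# starts a new batch.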
for batch_data, total_token_num in batch_reader(
reader, self.batch_size, self.in_tokens):
yield prepare_batch_data(
batch_data,
total_token_num,
voc_size=self.voc_size,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=self.mask_id,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
return wrapper
if __name__ == "__main__":
pass
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""
import six
import math
import json
import random
import collections
import tokenization
from batching import prepare_batch_data
class SquadExample(object):
"""A single training/test example for simple sequence classification.
For examples without an answer, the start and end position are -1.
"""
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None,
is_impossible=False):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def __str__(self):
return self.__repr__()
def __repr__(self):
s = ""
s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
s += ", question_text: %s" % (
tokenization.printable_text(self.question_text))
s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
if self.start_position:
s += ", start_position: %d" % (self.start_position)
if self.start_position:
s += ", end_position: %d" % (self.end_position)
if self.start_position:
s += ", is_impossible: %r" % (self.is_impossible)
return s
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self,
unique_id,
example_index,
doc_span_index,
tokens,
token_to_orig_map,
token_is_max_context,
input_ids,
input_mask,
segment_ids,
start_position=None,
end_position=None,
is_impossible=None):
self.unique_id = unique_id
self.example_index = example_index
self.doc_span_index = doc_span_index
self.tokens = tokens
self.token_to_orig_map = token_to_orig_map
self.token_is_max_context = token_is_max_context
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.start_position = start_position
self.end_position = end_position
self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training,
version_2_with_negative=False):
"""Read a SQuAD json file into a list of SquadExample."""
with open(input_file, "r") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if is_training:
if version_2_with_negative:
is_impossible = qa["is_impossible"]
if (len(qa["answers"]) != 1) and (not is_impossible):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
if not is_impossible:
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[answer_offset +
answer_length - 1]
# Only add answers where the text can be exactly recovered from the
# document. If this CAN'T happen it's likely due to weird Unicode
# stuff so we will just skip the example.
#
# Note that this means for training mode, every example is NOT
# guaranteed to be preserved.
actual_text = " ".join(doc_tokens[start_position:(
end_position + 1)])
cleaned_answer_text = " ".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
print("Could not find answer: '%s' vs. '%s'",
actual_text, cleaned_answer_text)
continue
else:
start_position = -1
end_position = -1
orig_answer_text = ""
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
return examples
def convert_examples_to_features(
examples,
tokenizer,
max_seq_length,
doc_stride,
max_query_length,
is_training,
#output_fn
):
"""Loads a data file into a list of `InputBatch`s."""
unique_id = 1000000000
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
if is_training and example.is_impossible:
tok_start_position = -1
tok_end_position = -1
if is_training and not example.is_impossible:
tok_start_position = orig_to_tok_index[example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[example.end_position +
1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
tokenizer, example.orig_answer_text)
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of up to our max length with a stride of `doc_stride`.
_DocSpan = collections.namedtuple( # pylint: disable=invalid-name
"DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, doc_stride)
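# Illustrative note (added): with, say, 500 document sub-tokens,
# max_tokens_for_doc = 380 and doc_stride = 128, the loop above produces two
# spans, DocSpan(start=0, length=380) and DocSpan(start=128, length=372), so
# sub-tokens 128..379 appear in both spans and are later disambiguated by
# _check_is_max_context.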
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[
split_token_index]
is_max_context = _check_is_max_context(
doc_spans, doc_span_index, split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
#while len(input_ids) < max_seq_length:
# input_ids.append(0)
# input_mask.append(0)
# segment_ids.append(0)
#assert len(input_ids) == max_seq_length
#assert len(input_mask) == max_seq_length
#assert len(segment_ids) == max_seq_length
start_position = None
end_position = None
if is_training and not example.is_impossible:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start and
tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(query_tokens) + 2
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
if is_training and example.is_impossible:
start_position = 0
end_position = 0
"""
if example_index < 3:
print("*** Example ***")
print("unique_id: %s" % (unique_id))
print("example_index: %s" % (example_index))
print("doc_span_index: %s" % (doc_span_index))
print("tokens: %s" % " ".join(
[tokenization.printable_text(x) for x in tokens]))
print("token_to_orig_map: %s" % " ".join([
"%d:%d" % (x, y)
for (x, y) in six.iteritems(token_to_orig_map)
]))
print("token_is_max_context: %s" % " ".join([
"%d:%s" % (x, y)
for (x, y) in six.iteritems(token_is_max_context)
]))
print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
print("segment_ids: %s" %
" ".join([str(x) for x in segment_ids]))
if is_training and example.is_impossible:
print("impossible example")
if is_training and not example.is_impossible:
answer_text = " ".join(tokens[start_position:(end_position +
1)])
print("start_position: %d" % (start_position))
print("end_position: %d" % (end_position))
print("answer: %s" %
(tokenization.printable_text(answer_text)))
"""
feature = InputFeatures(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
start_position=start_position,
end_position=end_position,
is_impossible=example.is_impossible)
unique_id += 1
yield feature
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?
# Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def _check_is_max_context(doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple document spans. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
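# Continuing that example with made-up span lengths of 5 tokens each:
# span B scores min(4, 0) + 0.01 * 5 = 0.05 for 'bought', while span C
# scores min(1, 3) + 0.01 * 5 = 1.05, so span C is chosen as the maximum
# context (illustrative numbers only).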
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
class DataProcessor(object):
def __init__(self, vocab_path, do_lower_case, max_seq_length, in_tokens,
doc_stride, max_query_length):
self._tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self._max_seq_length = max_seq_length
self._doc_stride = doc_stride
self._max_query_length = max_query_length
self._in_tokens = in_tokens
self.vocab = self._tokenizer.vocab
self.vocab_size = len(self.vocab)
self.pad_id = self.vocab["[PAD]"]
self.cls_id = self.vocab["[CLS]"]
self.sep_id = self.vocab["[SEP]"]
self.mask_id = self.vocab["[MASK]"]
self.current_train_example = -1
self.num_train_examples = -1
self.current_train_epoch = -1
self.train_examples = None
self.predict_examples = None
self.num_examples = {'train': -1, 'predict': -1}
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def get_examples(self,
data_path,
is_training,
version_2_with_negative=False):
examples = read_squad_examples(
input_file=data_path,
is_training=is_training,
version_2_with_negative=version_2_with_negative)
return examples
def get_num_examples(self, phase):
if phase not in ['train', 'predict']:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
return self.num_examples[phase]
def get_features(self, examples, is_training):
features = convert_examples_to_features(
examples=examples,
tokenizer=self._tokenizer,
max_seq_length=self._max_seq_length,
doc_stride=self._doc_stride,
max_query_length=self._max_query_length,
is_training=is_training)
return features
def data_generator(self,
data_path,
batch_size,
phase='train',
shuffle=False,
dev_count=1,
version_2_with_negative=False,
epoch=1):
if phase == 'train':
self.train_examples = self.get_examples(
data_path,
is_training=True,
version_2_with_negative=version_2_with_negative)
examples = self.train_examples
self.num_examples['train'] = len(self.train_examples)
elif phase == 'predict':
self.predict_examples = self.get_examples(
data_path,
is_training=False,
version_2_with_negative=version_2_with_negative)
examples = self.predict_examples
self.num_examples['predict'] = len(self.predict_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'predict'].")
def batch_reader(features, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for (index, feature) in enumerate(features):
if phase == 'train':
self.current_train_example = index + 1
seq_len = len(feature.input_ids)
labels = [feature.unique_id
] if feature.start_position is None else [
feature.start_position, feature.end_position
]
example = [
feature.input_ids, feature.segment_ids, range(seq_len)
] + labels
max_len = max(max_len, seq_len)
#max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(example)
total_token_num += seq_len
else:
yield batch, total_token_num
batch, total_token_num, max_len = [example
], seq_len, seq_len
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
for epoch_index in range(epoch):
if shuffle:
random.shuffle(examples)
if phase == 'train':
self.current_train_epoch = epoch_index
features = self.get_features(examples, is_training=True)
else:
features = self.get_features(examples, is_training=False)
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
features, batch_size, self._in_tokens):
batch_data = prepare_batch_data(
batch_data,
total_token_num,
voc_size=-1,
pad_id=self.pad_id,
cls_id=self.cls_id,
sep_id=self.sep_id,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file, output_null_log_odds_file,
version_2_with_negative, null_score_diff_threshold,
verbose):
"""Write final predictions to the json file and log-odds of null if needed."""
print("Writing predictions to: %s" % (output_prediction_file))
print("Writing nbest to: %s" % (output_nbest_file))
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive
min_null_feature_index = 0  # the paragraph slice with min null score
null_start_logit = 0 # the start logit at the slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score
for (feature_index, feature) in enumerate(features):
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[
0] + result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index,
False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(pred.end_index +
1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
1)]
tok_text = " ".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
verbose)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
else:
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="",
start_logit=null_start_logit,
end_logit=null_end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
# debug
if best_non_null_entry is None:
print("Emmm..., sth wrong")
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (
best_non_null_entry.end_logit)
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
with open(output_null_log_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case, verbose):
"""Project the tokenized prediction back to the original text."""
# When we created the data, we kept track of the alignment between original
# (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
# now `orig_text` contains the span of our original text corresponding to the
# span that we predicted.
#
# However, `orig_text` may contain extra characters that we don't want in
# our prediction.
#
# For example, let's say:
# pred_text = steve smith
# orig_text = Steve Smith's
#
# We don't want to return `orig_text` because it contains the extra "'s".
#
# We don't want to return `pred_text` because it's already been normalized
# (the SQuAD eval script also does punctuation stripping/lower casing but
# our tokenizer does additional normalization like stripping accent
# characters).
#
# What we really want to return is "Steve Smith".
#
# Therefore, we have to apply a semi-complicated alignment heuristic between
# `pred_text` and `orig_text` to get a character-to-character alignment. This
# can fail in certain cases in which case we just return `orig_text`.
def _strip_spaces(text):
ns_chars = []
ns_to_s_map = collections.OrderedDict()
for (i, c) in enumerate(text):
if c == " ":
continue
ns_to_s_map[len(ns_chars)] = i
ns_chars.append(c)
ns_text = "".join(ns_chars)
return (ns_text, ns_to_s_map)
# We first tokenize `orig_text`, strip whitespace from the result
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
tok_text = " ".join(tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
if verbose:
print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
return orig_text
end_position = start_position + len(pred_text) - 1
(orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
(tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
if len(orig_ns_text) != len(tok_ns_text):
if verbose:
print("Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using
# the character-to-character alignment.
tok_s_to_ns_map = {}
for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
tok_s_to_ns_map[tok_index] = i
orig_start_position = None
if start_position in tok_s_to_ns_map:
ns_start_position = tok_s_to_ns_map[start_position]
if ns_start_position in orig_ns_to_s_map:
orig_start_position = orig_ns_to_s_map[ns_start_position]
if orig_start_position is None:
if verbose:
print("Couldn't map start position")
return orig_text
orig_end_position = None
if end_position in tok_s_to_ns_map:
ns_end_position = tok_s_to_ns_map[end_position]
if ns_end_position in orig_ns_to_s_map:
orig_end_position = orig_ns_to_s_map[ns_end_position]
if orig_end_position is None:
if verbose:
print("Couldn't map end position")
return orig_text
output_text = orig_text[orig_start_position:(orig_end_position + 1)]
return output_text
def _get_best_indexes(logits, n_best_size):
"""Get the n-best logits from a list."""
index_and_score = sorted(
enumerate(logits), key=lambda x: x[1], reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
def _compute_softmax(scores):
"""Compute softmax probability over raw logits."""
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
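# Illustrative note (added): _get_best_indexes([0.1, 2.3, 1.5], n_best_size=2)
# returns [1, 2], and _compute_softmax([1.0, 2.0]) is approximately
# [0.269, 0.731] (scores are shifted by their maximum before exponentiation
# for numerical stability).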
if __name__ == '__main__':
train_file = 'squad/train-v1.1.json'
vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
do_lower_case = True
tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_file, do_lower_case=do_lower_case)
train_examples = read_squad_examples(
input_file=train_file, is_training=True)
print("begin converting")
for (index, feature) in enumerate(
convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=384,
doc_stride=128,
max_query_length=64,
is_training=True,
#output_fn=train_writer.process_feature
)):
if index < 10:
print(index, feature.input_ids, feature.input_mask,
feature.segment_ids)
#for (index, example) in enumerate(train_examples):
# if index < 5:
# print(example)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import io
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python 2 and Python 3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = io.open(vocab_file, encoding="utf8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
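# Illustrative note (added): for the docstring example "unaffable", and
# assuming the vocab contains "un", "##aff" and "##able" but none of the
# longer substrings, the greedy loop first shrinks the window from
# "unaffable" down to "un", then matches "##aff" starting at position 2,
# then "##able", giving ["un", "##aff", "##able"].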
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os
def usage():
"""
usage information
"""
print("")
print("please use command: ")
print(
"python convert_static_to_dygraph.py input_params_dir output_params_dir"
)
print("")
def convert_static_to_dygraph(static_model_path, dygraph_model_path):
"""
convert paddle static bert model to dygraph model
"""
def mkdir(path):
if not os.path.isdir(path):
if os.path.split(path)[0]:
mkdir(os.path.split(path)[0])
else:
return
os.mkdir(path)
if os.path.exists(dygraph_model_path):
shutil.rmtree(dygraph_model_path)
mkdir(dygraph_model_path)
if not os.path.exists(static_model_path):
print("paddle static model path doesn't exist.....")
return -1
file_list = []
for root, dirs, files in os.walk(static_model_path):
file_list.extend(files)
os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
#os.chdir(static_model_path)
#convert embedding file
embedding_type = ["word", "pos", "sent"]
for i in range(3):
src_name = embedding_type[i] + "_embedding"
trg_name = "Embedding_" + str(i) + "." + src_name
shutil.copyfile(
os.path.join(static_model_path, src_name),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
#convert pre_encoder file
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
#convert mask lm params file
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.b_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.w_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
encoder_num = 0
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
if int(layer_num) > encoder_num:
encoder_num = int(layer_num)
encoder_num += 1
for i in range(encoder_num):
encoder_dir = "EncoderSubLayer_" + str(i)
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir +
"/PositionwiseFeedForwardLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
encoder_map_dict = {
"ffn_fc_0.b_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
"ffn_fc_0.w_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
"ffn_fc_1.b_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
"ffn_fc_1.w_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
"multi_head_att_key_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
"multi_head_att_key_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
"multi_head_att_output_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
"multi_head_att_output_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
"multi_head_att_query_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
"multi_head_att_query_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
"multi_head_att_value_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
"multi_head_att_value_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
"post_att_layer_norm_bias":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
"post_att_layer_norm_scale":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
"post_ffn_layer_norm_bias":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
"post_ffn_layer_norm_scale":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
}
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
suffix_name = "_".join(f.split('_')[3:])
in_dir = encoder_map_dict[suffix_name][0]
rename = encoder_map_dict[suffix_name][1]
encoder_layer = "EncoderSubLayer_" + layer_num
shutil.copyfile(
os.path.join(static_model_path, f),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
encoder_layer + "/" + in_dir + "/" + rename))
if __name__ == "__main__":
if len(sys.argv) < 3:
usage()
exit(1)
static_model_path = sys.argv[1]
dygraph_model_path = sys.argv[2]
convert_static_to_dygraph(static_model_path, dygraph_model_path)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
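# Illustrative note (added): together these helpers implement a common fp16
# "master weights" scheme: create_master_params_grads keeps an fp32 master
# copy of every parameter except the layer_norm ones, casts fp16 gradients
# to fp32 and divides them by loss_scaling before the optimizer update, and
# master_param_to_train_param casts the updated fp32 master parameters back
# onto the fp16 training parameters.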
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import numpy as np
import paddle.fluid as fluid
def cast_fp32_to_fp16(exe, main_program):
print("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_from_static_model(dir_path, cls_model, bert_config):
def load_numpy_weight(file_name):
if six.PY2:
res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
else:
res = np.load(
os.path.join(dir_path, file_name),
allow_pickle=True,
encoding='latin1')
assert res is not None
return res
# load word embedding
_param = load_numpy_weight("word_embedding")
cls_model.bert_layer._src_emb.set_dict({"weight": _param})
print("INIT word embedding")
_param = load_numpy_weight("pos_embedding")
cls_model.bert_layer._pos_emb.set_dict({"weight": _param})
print("INIT pos embedding")
_param = load_numpy_weight("sent_embedding")
cls_model.bert_layer._sent_emb.set_dict({"weight": _param})
print("INIT sent embedding")
_param0 = load_numpy_weight("pooled_fc.w_0")
_param1 = load_numpy_weight("pooled_fc.b_0")
cls_model.bert_layer.pooled_fc.set_dict({
"weight": _param0,
"bias": _param1
})
print("INIT pooled_fc")
_param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
_param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
cls_model.bert_layer.pre_process_layer._sub_layers[
"layer_norm_0"].set_dict({
"weight": _param0,
"bias": _param1
})
print("INIT pre_encoder layer norm")
for _i in range(bert_config["num_hidden_layers"]):
_param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._q_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_query_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._k_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_key_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._v_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_value_fc %d" % _i)
# init output fc
_param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._proj_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_output_fc %d" % _i)
# init layer_norm 1
_param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._postprocess_layer.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT layer norm in attention at %d layer" % _i)
# init layer_norm 2
_param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._postprocess_layer2.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT layer norm in FFN at %d layer" % _i)
# init FFN 1
_param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._positionwise_feed_forward._i2h.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT FFN-1 at %d layer" % _i)
# init FFN 2
_param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._positionwise_feed_forward._h2o.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT FFN-2 at %d layer" % _i)
# init cls fc
#_param_weight = "cls_out_w"
#_param_bias = "cls_out_b"
#_param_weight = load_numpy_weight(_param_weight)
#_param_bias = load_numpy_weight(_param_bias)
#cls_model.cls_fc.set_dict({"weight":_param_weight, "bias":_param_bias})
#print("INIT CLS FC layer")
return True
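# Hedged usage sketch for init_from_static_model: the directory path and config
# below are placeholders. Inside a dygraph guard, build the classifier and copy
# the numpy weights exported from a static-graph checkpoint into its sub-layers.
#
#   with fluid.dygraph.guard():
#       cls_model = ClsModelLayer(bert_config, num_labels=3)
#       init_from_static_model("./data/pretrained_models/dygraph_params/",
#                              cls_model, bert_config)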
@@ -20,6 +20,7 @@ __all__ = ['DARTSearch', 'count_parameters_in_MB']
 import os
 import logging
+from itertools import izip
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
@@ -100,32 +101,17 @@ class DARTSearch(object):
     def train_one_epoch(self, train_loader, valid_loader, architect, optimizer,
                         epoch):
         objs = AvgrageMeter()
-        top1 = AvgrageMeter()
-        top5 = AvgrageMeter()
+        ce_losses = AvgrageMeter()
+        kd_losses = AvgrageMeter()
+        e_losses = AvgrageMeter()
         self.model.train()
-        for step_id, (
-                train_data,
-                valid_data) in enumerate(zip(train_loader(), valid_loader())):
-            train_image, train_label = train_data
-            valid_image, valid_label = valid_data
-            train_image = to_variable(train_image)
-            train_label = to_variable(train_label)
-            train_label.stop_gradient = True
-            valid_image = to_variable(valid_image)
-            valid_label = to_variable(valid_label)
-            valid_label.stop_gradient = True
-            n = train_image.shape[0]
+        step_id = 0
+        for train_data, valid_data in izip(train_loader(), valid_loader()):
             if epoch >= self.epochs_no_archopt:
-                architect.step(train_image, train_label, valid_image,
-                               valid_label)
-            logits = self.model(train_image)
-            prec1 = fluid.layers.accuracy(input=logits, label=train_label, k=1)
-            prec5 = fluid.layers.accuracy(input=logits, label=train_label, k=5)
-            loss = fluid.layers.reduce_mean(
-                fluid.layers.softmax_with_cross_entropy(logits, train_label))
+                architect.step(train_data, valid_data)
+            loss, ce_loss, kd_loss, e_loss = self.model.loss(train_data)
             if self.use_data_parallel:
                 loss = self.model.scale_loss(loss)
@@ -137,16 +123,22 @@ class DARTSearch(object):
             optimizer.minimize(loss)
             self.model.clear_gradients()
-            objs.update(loss.numpy(), n)
-            top1.update(prec1.numpy(), n)
-            top5.update(prec5.numpy(), n)
+            batch_size = train_data[0].shape[0]
+            objs.update(loss.numpy(), batch_size)
+            ce_losses.update(ce_loss.numpy(), batch_size)
+            kd_losses.update(kd_loss.numpy(), batch_size)
+            e_losses.update(e_loss.numpy(), batch_size)
             if step_id % self.log_freq == 0:
+                #logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format(
+                #    epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0]))
                 logger.info(
-                    "Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
-                    format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
-                        0]))
-        return top1.avg[0]
+                    "Train Epoch {}, Step {}, loss {}; ce: {}; kd: {}; e: {}".
+                    format(epoch, step_id,
+                           loss.numpy(),
+                           ce_loss.numpy(), kd_loss.numpy(), e_loss.numpy()))
+            step_id += 1
+        return objs.avg[0]
     def valid_one_epoch(self, valid_loader, epoch):
         objs = AvgrageMeter()
@@ -154,7 +146,7 @@ class DARTSearch(object):
         top5 = AvgrageMeter()
         self.model.eval()
-        for step_id, (image, label) in enumerate(valid_loader):
+        for step_id, valid_data in enumerate(valid_loader):
             image = to_variable(image)
             label = to_variable(label)
             n = image.shape[0]
@@ -244,14 +236,12 @@ class DARTSearch(object):
             genotype = get_genotype(base_model)
             logger.info('genotype = %s', genotype)
-            train_top1 = self.train_one_epoch(train_loader, valid_loader,
-                                              architect, optimizer, epoch)
-            logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
+            self.train_one_epoch(train_loader, valid_loader, architect,
+                                 optimizer, epoch)
             if epoch == self.num_epochs - 1:
-                valid_top1 = self.valid_one_epoch(valid_loader, epoch)
-                logger.info("Epoch {}, valid_acc {:.6f}".format(epoch,
-                                                                valid_top1))
+                # valid_top1 = self.valid_one_epoch(valid_loader, epoch)
+                logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, 1))
             if save_parameters:
                 fluid.save_dygraph(
                     self.model.state_dict(),
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from ..bert import cls
from .cls import *
__all__ = []
__all__ += cls.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import six
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer
from .reader.cls import *
from .model.bert import BertConfig
from .model.cls import ClsModelLayer
from .optimization import Optimizer
from .utils.init import init_from_static_model
__all__ = ["BERTClassifier"]
def create_data(batch):
"""
convert data to variable
"""
src_ids = to_variable(batch[0], "src_ids")
position_ids = to_variable(batch[1], "position_ids")
sentence_ids = to_variable(batch[2], "sentence_ids")
input_mask = to_variable(batch[3], "input_mask")
labels = to_variable(batch[4], "labels")
labels.stop_gradient = True
return src_ids, position_ids, sentence_ids, input_mask, labels
class BERTClassifier(Layer):
def __init__(self,
num_labels,
task_name="mnli",
model_path=None,
use_cuda=True):
super(BERTClassifier, self).__init__()
self.task_name = task_name.lower()
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
self.vocab_path = BERT_BASE_PATH + "/vocab.txt"
self.init_pretraining_params = BERT_BASE_PATH + "/dygraph_params/"
self.do_lower_case = True
self.bert_config = BertConfig(bert_config_path)
if use_cuda:
self.dev_count = fluid.core.get_cuda_device_count()
else:
self.dev_count = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self.trainer_count = fluid.dygraph.parallel.Env().nranks
self.processors = {
'xnli': XnliProcessor,
'cola': ColaProcessor,
'mrpc': MrpcProcessor,
'mnli': MnliProcessor,
}
self.cls_model = ClsModelLayer(
self.bert_config, num_labels, return_pooled_out=True)
if model_path is not None:
#restore the model
print("Load params from %s" % model_path)
model_dict, _ = fluid.load_dygraph(model_path)
self.cls_model.load_dict(model_dict)
elif self.init_pretraining_params:
print("Load pre-trained model from %s" %
self.init_pretraining_params)
init_from_static_model(self.init_pretraining_params,
self.cls_model, self.bert_config)
else:
raise Exception(
"You should load pretrained model for training this teacher model."
)
def forward(self, input):
return self.cls_model(input)
def test(self, data_dir, batch_size=64, max_seq_len=512):
processor = self.processors[self.task_name](
data_dir=data_dir,
vocab_path=self.vocab_path,
max_seq_len=max_seq_len,
do_lower_case=self.do_lower_case,
in_tokens=False)
test_data_generator = processor.data_generator(
batch_size=batch_size, phase='dev', epoch=1, shuffle=False)
self.cls_model.eval()
total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
for batch in test_data_generator():
data_ids = create_data(batch)
total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
np_loss = total_loss.numpy()
np_acc = np_acces[-1].numpy()
np_avg_acc = np.mean([acc.numpy() for acc in np_acces])
np_num_seqs = np_num_seqs.numpy()
total_cost.extend(np_loss * np_num_seqs)
final_acc.extend(np_acc * np_num_seqs)
avg_acc.extend(np_avg_acc * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
print("[evaluation] classifier[-1] average acc: %f; average acc: %f" %
(np.sum(final_acc) / np.sum(total_num_seqs),
np.sum(avg_acc) / np.sum(total_num_seqs)))
self.cls_model.train()
def fit(self,
data_dir,
epoch,
batch_size=64,
use_cuda=True,
max_seq_len=512,
warmup_proportion=0.1,
use_data_parallel=False,
learning_rate=0.00005,
weight_decay=0.01,
lr_scheduler="linear_warmup_decay",
skip_steps=10,
save_steps=1000,
checkpoints="checkpoints"):
processor = self.processors[self.task_name](
data_dir=data_dir,
vocab_path=self.vocab_path,
max_seq_len=max_seq_len,
do_lower_case=self.do_lower_case,
in_tokens=False,
random_seed=5512)
shuffle_seed = 1 if self.trainer_count > 1 else None
train_data_generator = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=self.trainer_count,
shuffle=True,
shuffle_seed=shuffle_seed)
num_train_examples = processor.get_num_examples(phase='train')
max_train_steps = epoch * num_train_examples // batch_size // self.trainer_count
warmup_steps = int(max_train_steps * warmup_proportion)
print("Device count: %d" % self.dev_count)
print("Trainer count: %d" % self.trainer_count)
print("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
optimizer = Optimizer(
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=learning_rate,
model_cls=self.cls_model,
weight_decay=weight_decay,
scheduler=lr_scheduler,
loss_scaling=1.0,
parameter_list=self.cls_model.parameters())
if use_data_parallel:
self.cls_model = fluid.dygraph.parallel.DataParallel(
self.cls_model, strategy)
train_data_generator = fluid.contrib.reader.distributed_batch_reader(
train_data_generator)
steps = 0
time_begin = time.time()
for batch in train_data_generator():
data_ids = create_data(batch)
total_loss, logits, losses, accuracys, num_seqs = self.cls_model(
data_ids)
optimizer.optimization(
total_loss,
use_data_parallel=use_data_parallel,
model=self.cls_model)
self.cls_model.clear_gradients()
if steps != 0 and steps % skip_steps == 0:
time_end = time.time()
used_time = time_end - time_begin
current_example, current_epoch = processor.get_train_progress()
localtime = time.asctime(time.localtime(time.time()))
print(
"%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
% (localtime, current_epoch, steps, total_loss.numpy(),
accuracys[-1].numpy(), skip_steps / used_time))
time_begin = time.time()
if steps != 0 and steps % save_steps == 0 and fluid.dygraph.parallel.Env(
).local_rank == 0:
self.test(data_dir, batch_size=64, max_seq_len=512)
save_path = os.path.join(checkpoints,
"steps" + "_" + str(steps))
fluid.save_dygraph(self.cls_model.state_dict(), save_path)
fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
print("Save model parameters and optimizer status at %s" %
save_path)
steps += 1
if fluid.dygraph.parallel.Env().local_rank == 0:
save_path = os.path.join(checkpoints, "final")
fluid.save_dygraph(self.cls_model.state_dict(), save_path)
fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
print("Save model parameters and optimizer status at %s" %
save_path)
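# Hedged usage sketch of the BERTClassifier teacher defined above; the data path
# and hyper-parameters below are placeholders, not values prescribed here.
#
#   with fluid.dygraph.guard(fluid.CUDAPlace(0)):
#       teacher = BERTClassifier(num_labels=3, task_name="mnli")
#       teacher.fit("./data/glue_data/MNLI/", epoch=3, batch_size=32)
#       teacher.test("./data/glue_data/MNLI/")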
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from .transformer_encoder import EncoderLayer, PrePostProcessLayer
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class BertModelLayer(Layer):
"""
bert
"""
def __init__(self, config, return_pooled_out=True, use_fp16=False):
super(BertModelLayer, self).__init__()
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self.return_pooled_out = return_pooled_out
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._pos_emb = Embedding(
size=[self._max_position_seq_len, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._sent_emb = Embedding(
size=[self._sent_types, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
act="tanh")
self.pre_process_layer = PrePostProcessLayer(
"nd", self._emb_size, self._prepostprocess_dropout, "")
self._encoder = EncoderLayer(
hidden_act=self._hidden_act,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer)
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
emb_out = src_emb + pos_emb
emb_out = emb_out + sent_emb
emb_out = self.pre_process_layer(emb_out)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_outputs = self._encoder(emb_out, n_head_self_attn_mask)
if not self.return_pooled_out:
return enc_outputs
next_sent_feats = []
for enc_output in enc_outputs:
next_sent_feat = fluid.layers.slice(
input=enc_output, axes=[1], starts=[0], ends=[1])
next_sent_feat = self.pooled_fc(next_sent_feat)
next_sent_feat = fluid.layers.reshape(
next_sent_feat, shape=[-1, self._emb_size])
next_sent_feats.append(next_sent_feat)
return enc_outputs, next_sent_feats
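# Note on the attention mask built in forward(): input_mask holds 1.0 for real
# tokens and 0.0 for padding, so matmul(input_mask, input_mask^T) is 1 exactly
# where both positions are real. Scaling with scale=10000.0, bias=-1.0 (bias
# applied before the scale) maps that to 0 for valid pairs and -10000 for padded
# pairs, which is then stacked per head and added to the attention logits.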
class PretrainModelLayer(Layer):
"""
pretrain model
"""
def __init__(self,
config,
return_pooled_out=True,
weight_sharing=True,
use_fp16=False):
super(PretrainModelLayer, self).__init__()
self.config = config
self._voc_size = config['vocab_size']
self._emb_size = config['hidden_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._word_emb_name = "word_embedding"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._weight_sharing = weight_sharing
self.use_fp16 = use_fp16
self._dtype = "float16" if use_fp16 else "float32"
self.bert_layer = BertModelLayer(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.pre_process_layer = PrePostProcessLayer(
"n", self._emb_size, self._prepostprocess_dropout, "pre_encoder")
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="mask_lm_trans_fc.w_0",
initializer=self._param_initializer),
bias_attr="mask_lm_trans_fc.b_0",
act="tanh")
self.mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if not self._weight_sharing:
self.out_fc = Linear(
input_dim=self._emb_size,
output_dim=self._voc_size,
param_attr=fluid.ParamAttr(
name="mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.mask_lm_out_bias_attr)
else:
self.fc_create_params = self.create_parameter(
shape=[self._voc_size],
dtype=self._dtype,
attr=self.mask_lm_out_bias_attr,
is_bias=True)
self.next_sent_fc = Linear(
input_dim=self._emb_size,
output_dim=2,
param_attr=fluid.ParamAttr(
name="next_sent_fc.w_0", initializer=self._param_initializer),
bias_attr="next_sent_fc.b_0")
def forward(self, src_ids, position_ids, sentence_ids, input_mask,
mask_label, mask_pos, labels):
"""
forward
"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
reshaped_emb_out = fluid.layers.reshape(
x=enc_output, shape=[-1, self._emb_size])
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
mask_trans_feat = self.pooled_fc(mask_feat)
mask_trans_feat = self.pre_process_layer(None, mask_trans_feat, "n",
self._prepostprocess_dropout)
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=self.bert_layer._src_emb._w,
transpose_y=True)
fc_out += self.fc_create_params
else:
fc_out = self.out_fc(mask_trans_feat)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
next_sent_fc_out = self.next_sent_fc(next_sent_feat)
next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
logits=next_sent_fc_out, label=labels, return_softmax=True)
next_sent_acc = fluid.layers.accuracy(
input=next_sent_softmax, label=labels)
mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
loss = mean_next_sent_loss + mean_mask_lm_loss
return next_sent_acc, mean_mask_lm_loss, loss
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from .bert import BertModelLayer
class ClsModelLayer(Layer):
"""
classify model
"""
def __init__(self,
config,
num_labels,
is_training=True,
return_pooled_out=True,
loss_scaling=1.0,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = loss_scaling
self.n_layers = config['num_hidden_layers']
self.bert_layer = BertModelLayer(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = list()
for i in range(self.n_layers):
fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
def forward(self, data_ids):
"""
forward
"""
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
labels = data_ids[4]
enc_outputs, next_sent_feats = self.bert_layer(
src_ids, position_ids, sentence_ids, input_mask)
logits = []
losses = []
accuracys = []
for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc):
cls_feat = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logit = fc(cls_feat)
logits.append(logit)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logit, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
losses.append(loss)
if self.use_fp16 and self.loss_scaling > 1.0:
loss *= self.loss_scaling
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
accuracys.append(accuracy)
total_loss = fluid.layers.sum(losses)
return total_loss, logits, losses, accuracys, num_seqs
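# ClsModelLayer attaches one classification head per encoder layer: element i of
# the returned logits/losses/accuracys lists is the prediction made from the
# pooled [CLS] feature of encoder layer i, and total_loss is the sum of the
# per-layer cross-entropy losses.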
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer
class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer
"""
def __init__(self, process_cmd, d_model, dropout_rate, name):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
self.exec_order = ""
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(
lambda x, y: x + y if y is not None else x)
self.exec_order += "a"
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
name=name + "_layer_norm_scale",
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
name=name + "_layer_norm_bias",
initializer=fluid.initializer.Constant(0.)))))
self.exec_order += "n"
elif cmd == "d": # add dropout
if dropout_rate:
self.functors.append(lambda x: fluid.layers.dropout(
x, dropout_prob=dropout_rate, is_test=False))
self.exec_order += "d"
def forward(self, x, residual=None):
for i, cmd in enumerate(self.exec_order):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
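# Each character of process_cmd selects one step, applied in order:
# "a" adds the residual, "n" applies LayerNorm, "d" applies dropout.
# For example, the "dan" command used by the encoder sub-layers runs dropout,
# then residual-add, then layer normalization on the sub-layer output.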
class PositionwiseFeedForwardLayer(Layer):
"""
PositionwiseFeedForwardLayer
"""
def __init__(self,
hidden_act,
d_inner_hid,
d_model,
dropout_rate,
param_initializer=None,
name=""):
super(PositionwiseFeedForwardLayer, self).__init__()
self._i2h = Linear(
input_dim=d_model,
output_dim=d_inner_hid,
param_attr=fluid.ParamAttr(
name=name + '_fc_0.w_0', initializer=param_initializer),
bias_attr=name + '_fc_0.b_0',
act=hidden_act)
self._h2o = Linear(
input_dim=d_inner_hid,
output_dim=d_model,
param_attr=fluid.ParamAttr(
name=name + '_fc_1.w_0', initializer=param_initializer),
bias_attr=name + '_fc_1.b_0')
self._dropout_rate = dropout_rate
def forward(self, x):
"""
forward
:param x:
:return:
"""
hidden = self._i2h(x)
if self._dropout_rate:
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = self._h2o(hidden)
return out
class MultiHeadAttentionLayer(Layer):
"""
MultiHeadAttentionLayer
"""
def __init__(self,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.,
cache=None,
gather_idx=None,
static_kv=False,
param_initializer=None,
name=""):
super(MultiHeadAttentionLayer, self).__init__()
self._n_head = n_head
self._d_key = d_key
self._d_value = d_value
self._d_model = d_model
self._dropout_rate = dropout_rate
self._q_fc = Linear(
input_dim=d_model,
output_dim=d_key * n_head,
param_attr=fluid.ParamAttr(
name=name + '_query_fc.w_0', initializer=param_initializer),
bias_attr=name + '_query_fc.b_0')
self._k_fc = Linear(
input_dim=d_model,
output_dim=d_key * n_head,
param_attr=fluid.ParamAttr(
name=name + '_key_fc.w_0', initializer=param_initializer),
bias_attr=name + '_key_fc.b_0')
self._v_fc = Linear(
input_dim=d_model,
output_dim=d_value * n_head,
param_attr=fluid.ParamAttr(
name=name + '_value_fc.w_0', initializer=param_initializer),
bias_attr=name + '_value_fc.b_0')
self._proj_fc = Linear(
input_dim=d_value * n_head,
output_dim=d_model,
param_attr=fluid.ParamAttr(
name=name + '_output_fc.w_0', initializer=param_initializer),
bias_attr=name + '_output_fc.b_0')
def forward(self, queries, keys, values, attn_bias):
"""
forward
:param queries:
:param keys:
:param values:
:param attn_bias:
:return:
"""
# compute q ,k ,v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self._q_fc(queries)
k = self._k_fc(keys)
v = self._v_fc(values)
# split head
q_hidden_size = q.shape[-1]
reshaped_q = fluid.layers.reshape(
x=q,
shape=[0, 0, self._n_head, q_hidden_size // self._n_head],
inplace=False)
transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
k_hidden_size = k.shape[-1]
reshaped_k = fluid.layers.reshape(
x=k,
shape=[0, 0, self._n_head, k_hidden_size // self._n_head],
inplace=False)
transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
v_hidden_size = v.shape[-1]
reshaped_v = fluid.layers.reshape(
x=v,
shape=[0, 0, self._n_head, v_hidden_size // self._n_head],
inplace=False)
transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
scaled_q = fluid.layers.scale(x=transpose_q, scale=self._d_key**-0.5)
# scale dot product attention
product = fluid.layers.matmul(
#x=transpose_q,
x=scaled_q,
y=transpose_k,
transpose_y=True)
#alpha=self._d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = fluid.layers.softmax(product)
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
# combine heads
if len(out.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
final_out = fluid.layers.reshape(
x=trans_x,
shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
inplace=False)
# fc to output
proj_out = self._proj_fc(final_out)
return proj_out
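# Shape flow through the attention above (as configured by EncoderLayer,
# d_key = d_value = d_model // n_head):
#   q/k/v after the fc layers: [batch, seq_len, n_head * d_key]
#   after split + transpose:   [batch, n_head, seq_len, d_key]
#   attention scores:          [batch, n_head, seq_len, seq_len] (q scaled by d_key ** -0.5, plus attn_bias)
#   context:                   [batch, n_head, seq_len, d_value], recombined to [batch, seq_len, n_head * d_value]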
class EncoderSubLayer(Layer):
"""
EncoderSubLayer
"""
def __init__(self,
hidden_act,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=""):
super(EncoderSubLayer, self).__init__()
self.name = name
self._preprocess_cmd = preprocess_cmd
self._postprocess_cmd = postprocess_cmd
self._prepostprocess_dropout = prepostprocess_dropout
self._preprocess_layer = PrePostProcessLayer(
self._preprocess_cmd,
d_model,
prepostprocess_dropout,
name=name + "_pre_att")
self._multihead_attention_layer = MultiHeadAttentionLayer(
d_key,
d_value,
d_model,
n_head,
attention_dropout,
None,
None,
False,
param_initializer,
name=name + "_multi_head_att")
self._postprocess_layer = PrePostProcessLayer(
self._postprocess_cmd,
d_model,
self._prepostprocess_dropout,
name=name + "_post_att")
self._preprocess_layer2 = PrePostProcessLayer(
self._preprocess_cmd,
d_model,
self._prepostprocess_dropout,
name=name + "_pre_ffn")
self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
hidden_act,
d_inner_hid,
d_model,
relu_dropout,
param_initializer,
name=name + "_ffn")
self._postprocess_layer2 = PrePostProcessLayer(
self._postprocess_cmd,
d_model,
self._prepostprocess_dropout,
name=name + "_post_ffn")
def forward(self, enc_input, attn_bias):
"""
forward
:param enc_input:
:param attn_bias:
:return:
"""
pre_process_multihead = self._preprocess_layer(enc_input)
attn_output = self._multihead_attention_layer(pre_process_multihead,
None, None, attn_bias)
attn_output = self._postprocess_layer(attn_output, enc_input)
pre_process2_output = self._preprocess_layer2(attn_output)
ffd_output = self._positionwise_feed_forward(pre_process2_output)
return self._postprocess_layer2(ffd_output, attn_output)
class EncoderLayer(Layer):
"""
encoder
"""
def __init__(self,
hidden_act,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name=""):
super(EncoderLayer, self).__init__()
self._preprocess_cmd = preprocess_cmd
self._encoder_sublayers = list()
self._prepostprocess_dropout = prepostprocess_dropout
self._n_layer = n_layer
self._hidden_act = hidden_act
self._preprocess_layer = PrePostProcessLayer(
self._preprocess_cmd, 3, self._prepostprocess_dropout,
"post_encoder")
for i in range(n_layer):
self._encoder_sublayers.append(
self.add_sublayer(
'esl_%d' % i,
EncoderSubLayer(
hidden_act,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
param_initializer,
name=name + '_layer_' + str(i))))
def forward(self, enc_input, attn_bias):
"""
forward
:param enc_input:
:param attn_bias:
:return:
"""
outputs = []
for i in range(self._n_layer):
enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
outputs.append(enc_output)
enc_input = enc_output
return outputs
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
super(ConstantLR, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
def step(self):
return self.learning_rate
class LinearDecay(LearningRateDecay):
def __init__(self,
learning_rate,
warmup_steps,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(LinearDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
if self.step_num < self.warmup_steps:
decayed_lr = self.learning_rate * (self.step_num /
self.warmup_steps)
decayed_lr = self.create_lr_var(decayed_lr)
else:
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = fluid.layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(
tmp_step_num
if tmp_step_num < self.decay_steps else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
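# The schedule implemented by LinearDecay.step(), written out:
#   warmup (step < warmup_steps):  lr = learning_rate * step / warmup_steps
#   decay  (step >= warmup_steps): lr = (learning_rate - end_learning_rate)
#                                       * (1 - step / decay_steps) ** power + end_learning_rate
#   (step is clipped at decay_steps when cycle=False)
# With power=1.0 and end_learning_rate=0.0 (the values used by Optimizer below),
# this is a linear warmup followed by a linear decay to zero at decay_steps.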
class Optimizer(object):
def __init__(self,
warmup_steps,
num_train_steps,
learning_rate,
model_cls,
weight_decay,
scheduler='linear_warmup_decay',
loss_scaling=1.0,
parameter_list=None):
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
self.learning_rate = learning_rate
self.model_cls = model_cls
self.weight_decay = weight_decay
self.scheduler = scheduler
self.loss_scaling = loss_scaling
self.parameter_list = parameter_list
self.scheduled_lr = 0.0
self.optimizer = self.lr_schedule()
def lr_schedule(self):
if self.warmup_steps > 0:
if self.scheduler == 'noam_decay':
self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
self.warmup_steps * (self.learning_rate**2)),
self.warmup_steps)
elif self.scheduler == 'linear_warmup_decay':
self.scheduled_lr = LinearDecay(self.learning_rate,
self.warmup_steps,
self.num_train_steps, 0.0)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
else:
self.scheduled_lr = ConstantLR(self.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
return optimizer
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
def optimization(self, loss, use_data_parallel=False, model=None):
param_list = dict()
clip_norm_thres = 1.0
#grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
if self.weight_decay > 0:
for param in self.model_cls.parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_data_parallel:
assert model is not None
model.apply_collective_grads()
#_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
_, param_grads = self.optimizer.minimize(loss)
if self.weight_decay > 0:
for param, grad in param_grads:
if self.exclude_from_weight_decay(param.name):
continue
if isinstance(self.scheduled_lr.step(), float):
updated_param = param.numpy() - param_list[
param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step()
else:
updated_param = param.numpy(
) - param_list[param.name].numpy(
) * self.weight_decay * self.scheduled_lr.step().numpy()
updated_param_var = fluid.dygraph.to_variable(updated_param)
param = updated_param_var
#param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
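# The weight-decay loop above applies decoupled (AdamW-style) regularization by
# hand after the Adam step: for every parameter not excluded by
# exclude_from_weight_decay() (LayerNorm and bias parameters are skipped),
#   param_new = param_after_adam - weight_decay * lr_t * param_before_step
# where param_before_step is the snapshot kept in param_list and lr_t is the
# current scheduled learning rate.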
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded;
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
# ensure at least mask one word in a sentence
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
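# Masking scheme used above: a token is selected with probability 0.15. Of the
# selected tokens, those with 0.03 < prob <= 0.15 (about 80%) are replaced with
# [MASK], those with 0.015 < prob <= 0.03 (about 10%) are replaced by a random
# vocabulary id, and those with prob <= 0.015 (about 10%) keep their original id.
# Every selected position is recorded in mask_label / mask_pos, and each sentence
# is forced to contain at least one masked token.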
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=None,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_sent_ids = [inst[1] for inst in insts]
batch_pos_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import types
import csv
import numpy as np
import tokenization
from batching import prepare_batch_data
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def __init__(self,
data_dir,
vocab_path,
max_seq_len,
do_lower_case,
in_tokens,
random_seed=None):
self.data_dir = data_dir
self.max_seq_len = max_seq_len
self.tokenizer = tokenization.FullTokenizer(
vocab_file=vocab_path, do_lower_case=do_lower_case)
self.vocab = self.tokenizer.vocab
self.in_tokens = in_tokens
np.random.seed(random_seed)
self.current_train_example = -1
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.current_train_epoch = -1
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def convert_example(self, index, example, labels, max_seq_len, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
feature = convert_single_example(index, example, labels, max_seq_len,
tokenizer)
return feature
def generate_instance(self, feature):
"""
generate instance with given feature
Args:
feature: InputFeatures(object). A single set of features of data.
"""
input_pos = list(range(len(feature.input_ids)))
return [
feature.input_ids, feature.segment_ids, input_pos, feature.label_id
]
def generate_batch_data(self,
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False):
return prepare_batch_data(
batch_data,
total_token_num,
voc_size=voc_size,
pad_id=self.vocab["[PAD]"],
cls_id=self.vocab["[CLS]"],
sep_id=self.vocab["[SEP]"],
mask_id=mask_id,
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
return self.num_examples[phase]
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def data_generator(self,
batch_size,
phase='train',
epoch=1,
dev_count=1,
shuffle=True,
shuffle_seed=None):
"""
Generate data for train, dev or test.
Args:
batch_size: int. The batch size of generated data.
phase: string. The phase for which to generate data.
epoch: int. Total epochs for which to generate data.
shuffle: bool. Whether to shuffle examples.
"""
if phase == 'train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['train'] = len(examples)
elif phase == 'dev':
examples = self.get_dev_examples(self.data_dir)
self.num_examples['dev'] = len(examples)
elif phase == 'test':
examples = self.get_test_examples(self.data_dir)
self.num_examples['test'] = len(examples)
elif phase == 'search_train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['search_train'] = len(examples) // 2
examples = examples[:self.num_examples['search_train']]
elif phase == 'search_valid':
examples = self.get_train_examples(self.data_dir)
self.num_examples['search_valid'] = len(examples) // 2
examples = examples[len(examples) // 2:]
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'search_train', 'search_valid', 'dev', 'test'].")
def instance_reader():
for epoch_index in range(epoch):
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
np.random.shuffle(examples)
if phase == 'train' or phase == 'search_train':
self.current_train_epoch = epoch_index
for (index, example) in enumerate(examples):
if phase == 'train' or phase == "search_train":
self.current_train_example = index + 1
feature = self.convert_example(
index, example,
self.get_labels(), self.max_seq_len, self.tokenizer)
instance = self.generate_instance(feature)
yield instance
def batch_reader(reader, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for instance in reader():
token_ids, sent_ids, pos_ids, label = instance[:4]
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(instance)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [instance], len(
token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
instance_reader, batch_size, self.in_tokens):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class XnliProcessor(DataProcessor):
"""Processor for the XNLI data set."""
def get_train_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(
os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
self.language))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "train-%d" % (i)
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_dev_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "dev-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_test_examples(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "test-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type,
tokenization.convert_to_unicode(line[0]))
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
# Only the test set has a header
if set_type == "test" and i == 0:
continue
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
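# Illustrative sketch, not part of the original file: for a sentence pair the
# feature produced above is laid out as [CLS] a... [SEP] b... [SEP], with
# segment_ids 0 covering [CLS], the first sentence and its [SEP], and 1
# covering the second sentence and its trailing [SEP]; input_mask is all ones
# because padding only happens later, at batching time. The hypothetical helper
# below rebuilds that layout for already-tokenized inputs.
def _demo_pair_feature_layout(tokens_a, tokens_b):
    """Rebuild the token/segment layout of convert_single_example for a pair."""
    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"] + list(tokens_b) + ["[SEP]"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_mask = [1] * len(tokens)
    return tokens, segment_ids, input_mask
# e.g. _demo_pair_feature_layout(["is", "this", "ok"], ["no"]) gives
#   tokens:      ['[CLS]', 'is', 'this', 'ok', '[SEP]', 'no', '[SEP]']
#   segment_ids: [0, 0, 0, 0, 0, 1, 1]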
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
if __name__ == '__main__':
pass
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import io
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = io.open(vocab_file, encoding="utf8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
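# Illustrative sketch, not part of the original module: load_vocab accepts
# either one token per line (the line number becomes the id) or a
# tab-separated "token<TAB>id" pair per line, and stops at the first line with
# more than two fields. The hypothetical helper below round-trips a tiny vocab
# of the first form through a temporary file.
def _demo_load_vocab():
    """Write a minimal one-token-per-line vocabulary and read it back."""
    import os
    import tempfile
    with tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False) as f:
        f.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\nhello\n")
        vocab_path = f.name
    vocab = load_vocab(vocab_path)
    os.remove(vocab_path)
    # vocab == OrderedDict([('[PAD]', 0), ('[UNK]', 1), ('[CLS]', 2),
    #                       ('[SEP]', 3), ('hello', 4)])
    return vocab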
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
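# Illustrative sketch, not part of the original module: WordpieceTokenizer
# repeatedly takes the longest vocabulary entry that matches at the current
# position (prefixing non-initial pieces with "##"), and falls back to
# unk_token if any position cannot be matched. The toy vocabulary below is an
# assumption made purely for this demo.
def _demo_wordpiece_tokenizer():
    """Greedy longest-match-first lookup against a toy vocabulary."""
    toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
    tokenizer = WordpieceTokenizer(vocab=toy_vocab)
    # "unaffable" -> ['un', '##aff', '##able'];  "puzzling" -> ['[UNK]']
    return tokenizer.tokenize("unaffable"), tokenizer.tokenize("puzzling")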
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os
def usage():
"""
usage information
"""
print("")
print("please use command: ")
print(
"python convert_static_to_dygraph.py input_params_dir output_params_dir"
)
print("")
def convert_static_to_dygraph(static_model_path, dygraph_model_path):
"""
Convert a static-graph Paddle BERT model to a dygraph model.
"""
def mkdir(path):
if not os.path.isdir(path):
if os.path.split(path)[0]:
mkdir(os.path.split(path)[0])
else:
return
os.mkdir(path)
if os.path.exists(dygraph_model_path):
shutil.rmtree(dygraph_model_path)
mkdir(dygraph_model_path)
if not os.path.exists(static_model_path):
print("paddle static model path doesn't exist.....")
return -1
file_list = []
for root, dirs, files in os.walk(static_model_path):
file_list.extend(files)
os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
#os.chdir(static_model_path)
#convert embedding file
embedding_type = ["word", "pos", "sent"]
for i in range(3):
src_name = embedding_type[i] + "_embedding"
trg_name = "Embedding_" + str(i) + "." + src_name
shutil.copyfile(
os.path.join(static_model_path, src_name),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
#convert pre_encoder file
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
#convert mask lm params file
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.b_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.w_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
encoder_num = 0
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
if int(layer_num) > encoder_num:
encoder_num = int(layer_num)
encoder_num += 1
for i in range(encoder_num):
encoder_dir = "EncoderSubLayer_" + str(i)
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir +
"/PositionwiseFeedForwardLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
encoder_map_dict = {
"ffn_fc_0.b_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
"ffn_fc_0.w_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
"ffn_fc_1.b_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
"ffn_fc_1.w_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
"multi_head_att_key_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
"multi_head_att_key_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
"multi_head_att_output_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
"multi_head_att_output_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
"multi_head_att_query_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
"multi_head_att_query_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
"multi_head_att_value_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
"multi_head_att_value_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
"post_att_layer_norm_bias":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
"post_att_layer_norm_scale":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
"post_ffn_layer_norm_bias":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
"post_ffn_layer_norm_scale":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
}
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
suffix_name = "_".join(f.split('_')[3:])
in_dir = encoder_map_dict[suffix_name][0]
rename = encoder_map_dict[suffix_name][1]
encoder_layer = "EncoderSubLayer_" + layer_num
shutil.copyfile(
os.path.join(static_model_path, f),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
encoder_layer + "/" + in_dir + "/" + rename))
if __name__ == "__main__":
if len(sys.argv) < 3:
usage()
exit(1)
static_model_path = sys.argv[1]
dygraph_model_path = sys.argv[2]
convert_static_to_dygraph(static_model_path, dygraph_model_path)
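# Illustrative sketch, not part of the original script: every static encoder
# file name such as "encoder_layer_3_ffn_fc_0.w_0" is split into its layer
# index (the third underscore-separated field) and a suffix that is looked up
# in encoder_map_dict, which yields the dygraph sub-directory and parameter
# name. The hypothetical helper below reproduces that path construction for
# one such file name.
def _demo_encoder_param_path(file_name="encoder_layer_3_ffn_fc_0.w_0"):
    """Rebuild the dygraph path used for a single static encoder parameter."""
    layer_num = file_name.split('_')[2]
    suffix_name = "_".join(file_name.split('_')[3:])
    # Mirrors the "ffn_fc_0.w_0" entry of encoder_map_dict above.
    in_dir, rename = "PositionwiseFeedForwardLayer_0", "FC_0." + suffix_name
    return ("PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/"
            "EncoderSubLayer_" + layer_num + "/" + in_dir + "/" + rename)
# -> 'PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/EncoderSubLayer_3/
#     PositionwiseFeedForwardLayer_0/FC_0.ffn_fc_0.w_0'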
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
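# Illustrative sketch, not part of the original utilities: the master-parameter
# scheme above keeps an FP32 copy of every FP16 parameter, trains on a loss
# multiplied by loss_scaling, and divides the FP16 gradients by the same factor
# after casting them to FP32, so the update applied to the master copy happens
# in full precision. The numpy toy below (with made-up values) mimics just that
# arithmetic, outside of any Paddle program.
def _demo_master_param_update(loss_scaling=128.0, lr=0.1):
    """Toy FP16/FP32 update mirroring create_master_params_grads."""
    import numpy as np
    master_w = np.array([0.5, -0.25], dtype=np.float32)  # FP32 master copy
    # gradient produced by a loss that was multiplied by loss_scaling
    scaled_grad_fp16 = np.array([1.28, -2.56], dtype=np.float16)
    grad_fp32 = scaled_grad_fp16.astype("float32") / loss_scaling  # cast + unscale
    master_w -= lr * grad_fp32  # full-precision update of the master copy
    return master_w, master_w.astype("float16")  # new FP16 training copy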
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import numpy as np
import paddle.fluid as fluid
def cast_fp32_to_fp16(exe, main_program):
print("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_from_static_model(dir_path, cls_model, bert_config):
def load_numpy_weight(file_name):
if six.PY2:
res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
else:
res = np.load(
os.path.join(dir_path, file_name),
allow_pickle=True,
encoding='latin1')
assert res is not None
return res
# load word embedding
_param = load_numpy_weight("word_embedding")
cls_model.bert_layer._src_emb.set_dict({"weight": _param})
print("INIT word embedding")
_param = load_numpy_weight("pos_embedding")
cls_model.bert_layer._pos_emb.set_dict({"weight": _param})
print("INIT pos embedding")
_param = load_numpy_weight("sent_embedding")
cls_model.bert_layer._sent_emb.set_dict({"weight": _param})
print("INIT sent embedding")
_param0 = load_numpy_weight("pooled_fc.w_0")
_param1 = load_numpy_weight("pooled_fc.b_0")
cls_model.bert_layer.pooled_fc.set_dict({
"weight": _param0,
"bias": _param1
})
print("INIT pooled_fc")
_param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
_param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
cls_model.bert_layer.pre_process_layer._sub_layers[
"layer_norm_0"].set_dict({
"weight": _param0,
"bias": _param1
})
print("INIT pre_encoder layer norm")
for _i in range(bert_config["num_hidden_layers"]):
_param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._q_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_query_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._k_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_key_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._v_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_value_fc %d" % _i)
# init output fc
_param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._multihead_attention_layer._proj_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT multi_head_att_output_fc %d" % _i)
# init layer_norm 1
_param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._postprocess_layer.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT layer norm in attention at %d layer" % _i)
# init layer_norm 2
_param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._postprocess_layer2.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT layer norm in FFN at %d layer" % _i)
# init FFN 1
_param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._positionwise_feed_forward._i2h.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT FFN-1 at %d layer" % _i)
# init FFN 2
_param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
cls_model.bert_layer._encoder._sub_layers[
"esl_%d" % _i]._positionwise_feed_forward._h2o.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
print("INIT FFN-2 at %d layer" % _i)
# init cls fc
#_param_weight = "cls_out_w"
#_param_bias = "cls_out_b"
#_param_weight = load_numpy_weight(_param_weight)
#_param_bias = load_numpy_weight(_param_bias)
#cls_model.cls_fc.set_dict({"weight":_param_weight, "bias":_param_bias})
#print("INIT CLS FC layer")
return True
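# Illustrative usage sketch, not part of the original utilities: init_checkpoint
# and init_pretraining_params differ only in their predicate (all persistable
# variables vs. trainable parameters), and both expect a directory of raw
# static-graph variable files. The checkpoint directory below is a hypothetical
# example; exe and main_program must already exist.
def _demo_init_checkpoint(exe, main_program,
                          checkpoint_dir="./checkpoints/steps_23000"):
    """Load a static-graph checkpoint into main_program."""
    init_checkpoint(
        exe,
        init_checkpoint_path=checkpoint_dir,
        main_program=main_program,
        use_fp16=False)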